In [19]:
import os
import logging
from pathlib import Path
from github import Github
from git import Repo
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import re
import difflib

In [20]:
# === CONFIGURATION ===
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
YOUR_REPOS = [
    "pmndrs/zustand"
    # add your own "owner/repo" strings here
]
REPOS_ROOT = Path("cloned_repos")
OUTPUT_CSV = "./data/zustand.csv"

# === LOGGING SETUP ===
logger = logging.getLogger("MergeExtractor")
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
formatter = logging.Formatter(
    fmt="%(asctime)s %(levelname)-5s [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S"
)
handler.setFormatter(formatter)
logger.addHandler(handler)

# === GITHUB API CLIENT (optional, only for checks) ===
gh = Github(GITHUB_TOKEN) if GITHUB_TOKEN else None
print(gh)

None


## Clone Repo

In [21]:
def clone_repo(repo_full_name, root_dir=REPOS_ROOT):
    dest = root_dir / repo_full_name.replace("/", "_")
    if dest.exists():
        logger.info(f"Reusing existing clone: {repo_full_name}")
    else:
        logger.info(f"Cloning {repo_full_name}")
        try:
            Repo.clone_from(f"https://github.com/{repo_full_name}.git", dest)
        except Exception as e:
            logger.error(f"Failed to clone {repo_full_name}: {e}")
            return None
    return dest

In [22]:
def get_diff(a, b):
    """Return unified diff (only changed lines) between two blobs."""
    a_lines = a.splitlines() if a else []
    b_lines = b.splitlines() if b else []
    diff = list(difflib.unified_diff(a_lines, b_lines, lineterm=''))
    return "\n".join([line for line in diff if line.startswith(('+', '-')) and not line.startswith(('+++', '---'))])

In [23]:
def try_git_show(repo, blob_ref):
    try:
        return repo.git.show(blob_ref)
    except Exception:
        return ""

In [24]:
def extract_merge_data(repo_path):
    logger.info(f"Extracting merge diffs from repo: {repo_path.name}")
    repo = Repo(repo_path)
    data = []

    for commit in repo.iter_commits('--all'):
        if len(commit.parents) != 2:
            continue

        p1, p2 = commit.parents
        try:
            base_commit = repo.git.merge_base(p1, p2)
        except Exception:
            logger.debug(f"Skipping commit {commit.hexsha}: no merge_base found")
            continue

        try:
            diffs = commit.diff(p1)
        except Exception as e:
            logger.warning(f"Failed to diff {commit.hexsha} vs parent: {e}")
            continue

        for diff in diffs:
            file_path = diff.a_path or diff.b_path
            if not file_path or diff.new_file or diff.deleted_file:
                continue

            try:
                base_blob = try_git_show(repo, f"{base_commit}:{file_path}")
                left_blob = try_git_show(repo, f"{p1.hexsha}:{file_path}")
                right_blob = try_git_show(repo, f"{p2.hexsha}:{file_path}")
                merged_blob = try_git_show(repo, f"{commit.hexsha}:{file_path}")

                left_diff = get_diff(base_blob, left_blob)
                right_diff = get_diff(base_blob, right_blob)
                merged_diff = get_diff(base_blob, merged_blob)

                if any([left_diff, right_diff, merged_diff]):
                    data.append({
                        "repo": repo_path.name,
                        "commit": commit.hexsha,
                        "commit_msg": commit.message.strip().replace('\n', ' '),
                        "file": file_path,
                        "left_diff": left_diff.strip(),
                        "right_diff": right_diff.strip(),
                        "merged_diff": merged_diff.strip()
                    })
                    logger.debug(f"Captured diff for {file_path} from commit {commit.hexsha}")
            except Exception as e:
                logger.warning(f"Skipping file {file_path} at {commit.hexsha}: {e}")
    return data

In [25]:
def extract_changed_lines_from_patch(patch):
    """Extract only the actual changed lines (+ or -) from a unified diff patch."""
    lines = patch.splitlines()
    changed = []
    for line in lines:
        if line.startswith('+') and not line.startswith('+++'):
            changed.append(line)
        elif line.startswith('-') and not line.startswith('---'):
            changed.append(line)
    return '\n'.join(changed)

In [26]:
def extract_conflict_blocks(content):
    """Extract blocks with merge conflict markers."""
    pattern = re.compile(r"<<<<<<<.*?\n(.*?)=======\n(.*?)>>>>>>>.*?\n", re.DOTALL)
    matches = pattern.findall(content)
    blocks = []
    for left, right in matches:
        blocks.append(f"<<<<<<<\n{left}=======\n{right}>>>>>>>")
    return blocks

In [27]:
def main():
    logger.info("Starting workflow with custom repo list")
    REPOS_ROOT.mkdir(exist_ok=True)
    
    clone_paths = []
    for full_name in YOUR_REPOS:
        path = clone_repo(full_name)
        if path:
            clone_paths.append(path)
    
    all_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(extract_merge_data, p): p for p in clone_paths}
        for fut in tqdm(concurrent.futures.as_completed(futures),
                        total=len(futures), desc="Extracting merges"):
            path = futures[fut]
            try:
                result = fut.result()
                all_data.extend(result)
            except Exception as e:
                logger.error(f"Error processing {path.name}: {e}")
                
    Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(all_data, columns=["repo", "commit", "commit_msg", "file", "left_diff", "right_diff", "merged_diff"])
    df.to_csv(OUTPUT_CSV, index=False)
    logger.info(f"Saved {len(df)} records to {OUTPUT_CSV}")

if __name__ == "__main__":
    main()

2025-07-27T20:19:17 INFO  [MergeExtractor] Starting workflow with custom repo list
2025-07-27T20:19:17 INFO  [MergeExtractor] Starting workflow with custom repo list
2025-07-27T20:19:17 INFO  [MergeExtractor] Cloning pmndrs/zustand
2025-07-27T20:19:17 INFO  [MergeExtractor] Cloning pmndrs/zustand
2025-07-27T20:19:22 INFO  [MergeExtractor] Extracting merge diffs from repo: pmndrs_zustand
2025-07-27T20:19:22 INFO  [MergeExtractor] Extracting merge diffs from repo: pmndrs_zustand
Extracting merges:   0%|          | 0/1 [00:00<?, ?it/s]2025-07-27T20:19:22 DEBUG [MergeExtractor] Captured diff for .eslintrc.json from commit a379a958b620a9b6dc81ac253eab98e06138a22f
2025-07-27T20:19:22 DEBUG [MergeExtractor] Captured diff for .eslintrc.json from commit a379a958b620a9b6dc81ac253eab98e06138a22f
2025-07-27T20:19:22 DEBUG [MergeExtractor] Captured diff for .github/ISSUE_TEMPLATE/bug_report.md from commit a379a958b620a9b6dc81ac253eab98e06138a22f
2025-07-27T20:19:22 DEBUG [MergeExtractor] Captured d

In [31]:
df = pd.read_csv('./data/zustand.csv')

In [32]:
df.isnull()

Unnamed: 0,repo,commit,commit_msg,file,left_diff,right_diff,merged_diff
0,False,False,False,False,True,False,False
1,False,False,False,False,True,False,False
2,False,False,False,False,True,False,False
3,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
249,False,False,False,False,True,False,False
250,False,False,False,False,True,False,False
251,False,False,False,False,False,False,False
252,False,False,False,False,True,False,False


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   repo         254 non-null    object
 1   commit       254 non-null    object
 2   commit_msg   254 non-null    object
 3   file         254 non-null    object
 4   left_diff    59 non-null     object
 5   right_diff   254 non-null    object
 6   merged_diff  254 non-null    object
dtypes: object(7)
memory usage: 14.0+ KB
