In [None]:
import os
import logging
from pathlib import Path
from github import Github
from git import Repo
import pandas as pd
from tqdm import tqdm
import concurrent.futures

# === CONFIGURATION ===
# Optional GitHub token read from the environment; None when it is not set.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Repositories to process, each written as "owner/repo".
# Every entry ends with a comma so that a newly added line can never be glued
# onto the previous one by Python's implicit string-literal concatenation.
YOUR_REPOS = [
    "remix-run/react-router",
    # add your own "owner/repo" strings here
]

# Directory under which repositories are cloned.
REPOS_ROOT = Path("cloned_repos")
# File the extracted records are written to.
OUTPUT_CSV = "./data/react_router.csv"

# === LOGGING SETUP ===
logger = logging.getLogger("MergeExtractor")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt="%(asctime)s %(levelname)-5s [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S"
)
# logging.getLogger returns the same logger object for a given name for as long
# as the kernel lives, so re-running this cell would otherwise attach one more
# StreamHandler each time and emit every record repeatedly. Reuse the handler
# that is already attached and only create one on the first run.
handler = next(
    (h for h in logger.handlers if isinstance(h, logging.StreamHandler)),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    logger.addHandler(handler)
# Always (re)apply the formatter so edits to the format take effect on re-run.
handler.setFormatter(formatter)

# === GITHUB API CLIENT (optional, only for checks) ===
# Without a token no client is built and `gh` is None.
gh = None
if GITHUB_TOKEN:
    gh = Github(GITHUB_TOKEN)
print(gh)

def clone_repo(repo_full_name, root_dir=REPOS_ROOT):
    """Return the path of a local clone of ``repo_full_name``, cloning if needed.

    Args:
        repo_full_name: GitHub repository written as ``"owner/repo"``.
        root_dir: Directory that holds the clones (``str`` or ``Path``). The
            clone is placed in ``root_dir / "owner_repo"``.

    Returns:
        The ``Path`` of the clone, or ``None`` when cloning failed (the error
        is logged, not raised).
    """
    dest = Path(root_dir) / repo_full_name.replace("/", "_")

    # Only a directory that contains git metadata is reused. A bare
    # ``dest.exists()`` check would also accept an empty or half-created
    # directory, which then fails later when it is opened as a repository.
    if (dest / ".git").exists():
        logger.info(f"Reusing existing clone: {repo_full_name}")
        return dest

    logger.info(f"Cloning {repo_full_name}")
    try:
        # git accepts an existing *empty* target directory; a non-empty one that
        # is not a repository makes the clone fail and is reported below.
        Repo.clone_from(f"https://github.com/{repo_full_name}.git", dest)
    except Exception as e:
        logger.error(f"Failed to clone {repo_full_name}: {e}")
        return None
    return dest

def extract_merge_data(repo_path):
    """Collect base/left/right/merged file contents for two-parent merge commits.

    Iterates the commits returned by ``iter_commits('--all')``, keeps those with
    exactly two parents, and for every file that differs between the merge
    commit and its first parent reads the file text at four revisions: the
    merge base of the parents, the first parent ("left"), the second parent
    ("right") and the merge commit itself ("merged").

    Args:
        repo_path: ``Path`` of a local git clone; its ``name`` is stored in the
            ``repo`` field of every record.

    Returns:
        A list of dicts with the keys ``base``, ``left``, ``right``, ``merged``,
        ``file``, ``repo`` and ``commit``. A file that cannot be read at all
        four revisions is skipped with a warning; a merge whose merge base
        cannot be computed is skipped entirely.
    """
    logger.info(f"Extracting merge commits from repo: {repo_path.name}")
    repo = Repo(repo_path)
    data = []
    try:
        for commit in repo.iter_commits('--all'):
            if len(commit.parents) != 2:
                continue
            p1, p2 = commit.parents
            try:
                base = repo.git.merge_base(p1, p2)
            except Exception as e:
                # Keep the reason on one line so the log stays readable.
                reason = " ".join(str(e).split())
                logger.debug(
                    f"Skipping commit {commit.hexsha}: no merge_base found ({reason})"
                )
                continue
            for diff in commit.diff(p1):
                file_path = diff.a_path or diff.b_path
                if not file_path:
                    continue
                try:
                    base_blob = repo.git.show(f"{base}:{file_path}")
                    left_blob = repo.git.show(f"{p1.hexsha}:{file_path}")
                    right_blob = repo.git.show(f"{p2.hexsha}:{file_path}")
                    merged_blob = repo.git.show(f"{commit.hexsha}:{file_path}")
                except Exception as e:
                    # Best effort: the path may be missing at one of the
                    # revisions. Record why instead of dropping it silently.
                    reason = " ".join(str(e).split())
                    logger.warning(
                        f"Skipping file {file_path} in commit {commit.hexsha}: {reason}"
                    )
                    continue
                data.append({
                    "base": base_blob,
                    "left": left_blob,
                    "right": right_blob,
                    "merged": merged_blob,
                    "file": file_path,
                    "repo": repo_path.name,
                    "commit": commit.hexsha
                })
                logger.debug(f"Commit {commit.hexsha}: added file {file_path}")
    finally:
        # Release the resources held by the Repo handle even when the walk
        # fails part-way; only plain strings leave this function.
        repo.close()
    logger.info(f"Found {len(data)} merge-resolved files in {repo_path.name}")
    return data

def main():
    """Clone each repository in ``YOUR_REPOS``, extract merge data, write a CSV.

    Repositories that fail to clone are left out. Extraction runs in a thread
    pool of four workers; a repository whose extraction raises is logged and
    skipped. All records are written to ``OUTPUT_CSV`` without the index.
    """
    logger.info("Starting workflow with custom repo list")
    # parents=True so a nested REPOS_ROOT can be created in one go.
    REPOS_ROOT.mkdir(parents=True, exist_ok=True)

    clone_paths = []
    for full_name in YOUR_REPOS:
        path = clone_repo(full_name)
        if path:
            clone_paths.append(path)

    all_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(extract_merge_data, p): p for p in clone_paths}
        for fut in tqdm(concurrent.futures.as_completed(futures),
                        total=len(futures), desc="Extracting merges"):
            path = futures[fut]
            try:
                result = fut.result()
                all_data.extend(result)
            except Exception as e:
                logger.error(f"Error processing {path.name}: {e}")

    df = pd.DataFrame(all_data)
    # to_csv does not create missing directories, so make sure the target
    # folder exists; otherwise the whole extraction is lost at the last step.
    Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_CSV, index=False)
    logger.info(f"Saved {len(df)} records to {OUTPUT_CSV}")


if __name__ == "__main__":
    main()

2025-07-25T14:41:57 INFO  [MergeExtractor] Starting workflow with custom repo list
2025-07-25T14:41:57 INFO  [MergeExtractor] Cloning remix-run/react-router


None


2025-07-25T14:42:04 INFO  [MergeExtractor] Extracting merge commits from repo: remix-run_react-router
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file .github/workflows/docs.yml
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file CHANGELOG.md
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file contributors.yml
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file docs/api/components/Form.md
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file docs/api/components/Link.md
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file docs/api/components/Links.md
2025-07-25T14:42:04 DEBUG [MergeExtractor] Commit 2e22e19f08d84593e5d3eb5224845a2454f23982: added file docs/api/components/Meta.md
2025-07

In [1]:
import pandas as pd

# Read data/react_router.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/react_router.csv')

In [6]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,base,left,right,merged,file,repo,commit
0,name: 📚 Docs\n\non:\n push:\n branches:\n ...,name: 📚 Docs\n\non:\n push:\n branches:\n ...,name: 📚 Docs\n\non:\n push:\n branches:\n ...,name: 📚 Docs\n\non:\n push:\n branches:\n ...,.github/workflows/docs.yml,remix-run_react-router,2e22e19f08d84593e5d3eb5224845a2454f23982
1,<!-- markdownlint-disable no-duplicate-header ...,<!-- markdownlint-disable no-duplicate-header ...,<!-- markdownlint-disable no-duplicate-header ...,<!-- markdownlint-disable no-duplicate-header ...,CHANGELOG.md,remix-run_react-router,2e22e19f08d84593e5d3eb5224845a2454f23982
2,- 0xEddie\n- 3fuyang\n- 43081j\n- aarbi\n- abd...,- 0xEddie\n- 3fuyang\n- 43081j\n- aarbi\n- abd...,- 0xEddie\n- 3fuyang\n- 43081j\n- aarbi\n- abd...,- 0xEddie\n- 3fuyang\n- 43081j\n- aarbi\n- abd...,contributors.yml,remix-run_react-router,2e22e19f08d84593e5d3eb5224845a2454f23982
3,---\ntitle: Form\n---\n\n# Form\n\n<!--\n⚠️ ⚠️...,---\ntitle: Form\n---\n\n# Form\n\n<!--\n⚠️ ⚠️...,---\ntitle: Form\n---\n\n# Form\n\n<!--\n⚠️ ⚠️...,---\ntitle: Form\n---\n\n# Form\n\n<!--\n⚠️ ⚠️...,docs/api/components/Form.md,remix-run_react-router,2e22e19f08d84593e5d3eb5224845a2454f23982
4,---\ntitle: Link\n---\n\n# Link\n\n<!--\n⚠️ ⚠️...,---\ntitle: Link\n---\n\n# Link\n\n<!--\n⚠️ ⚠️...,---\ntitle: Link\n---\n\n# Link\n\n<!--\n⚠️ ⚠️...,---\ntitle: Link\n---\n\n# Link\n\n<!--\n⚠️ ⚠️...,docs/api/components/Link.md,remix-run_react-router,2e22e19f08d84593e5d3eb5224845a2454f23982


In [16]:
# Print the frame summary, then display the "merged" column.
df.info()
df["merged"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9165 entries, 0 to 9164
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   base    9165 non-null   object
 1   left    9165 non-null   object
 2   right   9165 non-null   object
 3   merged  9165 non-null   object
 4   file    9165 non-null   object
 5   repo    9165 non-null   object
 6   commit  9165 non-null   object
dtypes: object(7)
memory usage: 501.3+ KB


0       name: 📚 Docs\n\non:\n  push:\n    branches:\n ...
1       <!-- markdownlint-disable no-duplicate-header ...
2       - 0xEddie\n- 3fuyang\n- 43081j\n- aarbi\n- abd...
3       ---\ntitle: Form\n---\n\n# Form\n\n<!--\n⚠️ ⚠️...
4       ---\ntitle: Link\n---\n\n# Link\n\n<!--\n⚠️ ⚠️...
                              ...                        
9160    {\n  "name": "react-router-dom",\n  "version":...
9161    # `react-router-native`\n\n## 6.13.0\n\n### Pa...
9162    import * as ReactRouter from "react-router";\n...
9163    {\n  "name": "react-router-native",\n  "versio...
9164    # `react-router`\n\n## 6.13.0\n\n### Minor Cha...
Name: merged, Length: 9165, dtype: object