# Extracție Pull Requests

Pentru fiecare PR colectăm metadatele, statisticile și cererile de recenzie (review requests).

In [1]:
%run ./utils.ipynb

In [2]:
from typing import Optional

import pandas as pd

In [3]:
# Column order of prs.csv. Kept explicit so that an extraction yielding zero
# rows still produces a frame (and a CSV header) with the full schema.
_PR_COLUMNS = (
    "repo_full_name",
    "pr_id",
    "number",
    "title",
    "body",
    "user_login",
    "user_id",
    "state",
    "draft",
    "created_at",
    "updated_at",
    "closed_at",
    "merged_at",
    "merge_commit_sha",
    "mergeable_state",
    "additions",
    "deletions",
    "changed_files",
    "commits_count",
    "review_comments_count",
    "comments_count",
    "requested_reviewers",
    "requested_teams",
    "labels",
)


def _iso_or_none(moment):
    """Return ``moment.isoformat()``, or None when the timestamp is missing."""
    return moment.isoformat() if moment else None


def _pr_to_row(repo_full_name: str, pr) -> dict:
    """Flatten one pull-request object into a dict keyed by ``_PR_COLUMNS``."""
    # One extra call per PR: returns a (users, teams) pair of pending review requests.
    users, teams = pr.get_review_requests()
    # Guard against a missing author so a single PR cannot abort the whole run.
    # NOTE(review): presumably only happens for deleted accounts — confirm.
    author = pr.user
    return {
        "repo_full_name":         repo_full_name,
        "pr_id":                  pr.id,
        "number":                 pr.number,
        "title":                  pr.title,
        "body":                   pr.body,
        "user_login":             author.login if author is not None else None,
        "user_id":                author.id if author is not None else None,
        "state":                  pr.state,
        "draft":                  pr.draft,
        "created_at":             pr.created_at.isoformat(),
        "updated_at":             pr.updated_at.isoformat(),
        "closed_at":              _iso_or_none(pr.closed_at),
        "merged_at":              _iso_or_none(pr.merged_at),
        "merge_commit_sha":       pr.merge_commit_sha,
        "mergeable_state":        pr.mergeable_state,
        "additions":              pr.additions,
        "deletions":              pr.deletions,
        "changed_files":          pr.changed_files,
        "commits_count":          pr.commits,
        "review_comments_count":  pr.review_comments,
        "comments_count":         pr.comments,
        # Multi-valued fields are flattened into ';'-separated strings.
        "requested_reviewers":    ";".join([u.login for u in users]),
        "requested_teams":        ";".join([t.slug for t in teams]),
        "labels":                 ";".join([lbl["name"] for lbl in pr.raw_data.get("labels", [])]),
    }


def extract_prs(repo_full_name: str, max_prs: Optional[int] = None) -> pd.DataFrame:
    """
    Extract the pull requests of one repository and save them as CSV.

    Pull requests are listed in every state, ordered by creation time
    (ascending). Each PR becomes one row holding its metadata, size
    statistics, pending review requests and labels. The resulting frame is
    written to ``prs.csv`` inside the folder returned by
    ``ensure_repo_folder(repo_full_name)``.

    Relies on the notebook-level names ``gh`` and ``ensure_repo_folder``
    (presumably provided by ``utils.ipynb`` — confirm).

    Args:
        repo_full_name: repository identifier in 'owner/repo' form.
        max_prs: maximum number of PRs to extract. ``None`` means no limit;
            ``0`` yields an empty frame.

    Returns:
        DataFrame with one row per extracted PR and the columns listed in
        ``_PR_COLUMNS`` (also when no PR was extracted).
    """
    repo = gh.get_repo(repo_full_name)
    rows = []

    for i, pr in enumerate(repo.get_pulls(state="all", sort="created", direction="asc")):
        # Compare against None explicitly: a plain truthiness test would turn
        # max_prs=0 into "no limit".
        if max_prs is not None and i >= max_prs:
            break
        rows.append(_pr_to_row(repo_full_name, pr))

    df = pd.DataFrame(rows, columns=list(_PR_COLUMNS))
    folder = ensure_repo_folder(repo_full_name)
    df.to_csv(folder / "prs.csv", index=False)
    return df

In [4]:
# Load the repository list used by the test and extraction cells below.
repos = read_repo_list("shallow_data.csv")
repos_preview = repos[:3]
print("Primele 3 repo-uri:", repos_preview)

Primele 3 repo-uri: ['microsoft/ML-For-Beginners', 'apache/superset', 'keras-team/keras']


In [5]:
# Smoke test for extract_prs: fetch only the first 20 PRs of the first repository.
test_repo = repos[0]
df_test = extract_prs(repo_full_name=test_repo, max_prs=20)
print("Test shape:", df_test.shape)
df_test.head()

Test shape: (20, 24)


Unnamed: 0,repo_full_name,pr_id,number,title,body,user_login,user_id,state,draft,created_at,...,mergeable_state,additions,deletions,changed_files,commits_count,review_comments_count,comments_count,requested_reviewers,requested_teams,labels
0,microsoft/ML-For-Beginners,670352683,32,editorial suggestions,"editorial suggestions, adapt it to LEARN a lit...",softchris,4598064,closed,False,2021-06-15T12:18:00+00:00,...,unknown,45,10,1,2,2,1,,,
1,microsoft/ML-For-Beginners,670363612,33,editorial changes,- smaller editoral changes,softchris,4598064,closed,False,2021-06-15T12:33:24+00:00,...,unknown,15,10,1,2,2,2,,,
2,microsoft/ML-For-Beginners,670405006,34,editorial,the idea is to fill in the bullet lists a litt...,softchris,4598064,closed,False,2021-06-15T13:25:28+00:00,...,unknown,59,47,1,2,8,1,,,
3,microsoft/ML-For-Beginners,671926095,35,changed structure,- adjusted to look like concept units and exer...,softchris,4598064,closed,False,2021-06-16T21:36:11+00:00,...,unknown,192,86,2,4,0,4,,,
4,microsoft/ML-For-Beginners,672615430,36,editorial changes,Editorial changes\r\n\r\n- changed a lot of h3...,softchris,4598064,closed,False,2021-06-17T13:06:38+00:00,...,unknown,122,75,1,2,0,1,,,


In [None]:
# Extraction loop: run extract_prs for every repository in `repos`.
# NOTE: the cap below limits each repository to its first 2 PRs (trial run);
# set it to None to extract every pull request.
MAX_PRS_PER_REPO = 2

for full_name in repos:
    log(f"PRs → {full_name}")
    extract_prs(full_name, max_prs=MAX_PRS_PER_REPO)