# Extracție Comentarii PR

Salvăm atât comentariile de tip issue-like, cât și pe cele de review (cu diff hunk, path etc.).

In [2]:
%run ./utils.ipynb

In [3]:
import pandas as pd

In [4]:
# Column order of pr_comments.csv. Passing it to the DataFrame constructor
# keeps the header stable even when a repository has no comments at all.
_PR_COMMENT_COLUMNS = [
    "repo_full_name",
    "pr_id",
    "comment_id",
    "user_login",
    "user_id",
    "created_at",
    "updated_at",
    "body",
    "is_review_comment",
    "path",
    "position",
    "diff_hunk",
    "reactions_total",
    "reactions_plus1",
    "reactions_minus1",
    "reactions_laugh",
    "reactions_hooray",
    "reactions_confused",
    "reactions_heart",
]


def _iter_pr_comments(repo):
    """Lazily yield ``(pr, comment, is_review_comment)`` for every PR comment.

    For each pull request (open and closed) the conversation comments are
    yielded first, then the inline review comments.
    """
    for pr in repo.get_pulls(state="all"):
        # In PyGithub, PullRequest.get_comments() returns inline review
        # comments (same endpoint as get_review_comments()); conversation
        # comments live on the issue side of the PR.
        for comment in pr.get_issue_comments():
            yield pr, comment, False
        for comment in pr.get_review_comments():
            yield pr, comment, True


def _pr_comment_row(repo_full_name, pr_id, comment, is_review_comment):
    """Flatten one comment object into a dict keyed by _PR_COMMENT_COLUMNS."""
    # "reactions" may be absent (or null) in the raw payload; count as zero.
    reactions = comment.raw_data.get("reactions") or {}
    # Guard against a missing author so one comment cannot abort the run.
    user = comment.user
    return {
        "repo_full_name":     repo_full_name,
        "pr_id":              pr_id,
        "comment_id":         comment.id,
        "user_login":         user.login if user is not None else None,
        "user_id":            user.id if user is not None else None,
        "created_at":         comment.created_at.isoformat(),
        "updated_at":         comment.updated_at.isoformat(),
        "body":               comment.body,
        "is_review_comment":  is_review_comment,
        # Diff-location fields only exist on review comments.
        "path":               comment.path if is_review_comment else None,
        "position":           comment.position if is_review_comment else None,
        "diff_hunk":          comment.diff_hunk if is_review_comment else None,
        "reactions_total":    reactions.get("total_count", 0),
        "reactions_plus1":    reactions.get("+1", 0),
        "reactions_minus1":   reactions.get("-1", 0),
        "reactions_laugh":    reactions.get("laugh", 0),
        "reactions_hooray":   reactions.get("hooray", 0),
        "reactions_confused": reactions.get("confused", 0),
        "reactions_heart":    reactions.get("heart", 0),
    }


def extract_pr_comments(repo_full_name: str,
                        max_comments: int = None) -> pd.DataFrame:
    """
    Extract the comments of every pull request in a repository.

    Both conversation ("issue-like") comments and inline review comments are
    collected, one row per comment. The result is written to
    ``pr_comments.csv`` inside the folder returned by
    ``ensure_repo_folder(repo_full_name)`` (overwriting any previous file)
    and is also returned.

    Assumes ``gh`` is a PyGithub ``Github`` client and ``ensure_repo_folder``
    returns a ``pathlib.Path``-like object; both come from the notebook
    loaded via ``%run ./utils.ipynb`` -- TODO confirm.

    Args:
        repo_full_name: Repository in ``owner/name`` form.
        max_comments: Total cap on extracted comments across all PRs.
            ``None`` means no cap; ``0`` (or a negative value) yields an
            empty result.

    Returns:
        DataFrame with the columns listed in ``_PR_COMMENT_COLUMNS``.
    """
    from itertools import islice

    # islice(..., None) means "no limit"; it rejects negative stops, so clamp.
    limit = None if max_comments is None else max(max_comments, 0)

    repo = gh.get_repo(repo_full_name)
    # islice stops pulling from the lazy generator as soon as the cap is
    # reached, so no further API pages are requested past the limit.
    rows = [
        _pr_comment_row(repo_full_name, pr.id, comment, is_review)
        for pr, comment, is_review in islice(_iter_pr_comments(repo), limit)
    ]

    df = pd.DataFrame(rows, columns=_PR_COMMENT_COLUMNS)
    folder = ensure_repo_folder(repo_full_name)
    df.to_csv(folder / "pr_comments.csv", index=False)
    return df

In [5]:
# Load the list of repositories to process and preview the first three.
repos = read_repo_list("shallow_data.csv")
first_repos = repos[:3]
print("Primele 3 repo-uri:", first_repos)

Primele 3 repo-uri: ['microsoft/ML-For-Beginners', 'apache/superset', 'keras-team/keras']


In [None]:
# Smoke test: extract at most 20 PR comments for the first repository.
# Note: extract_pr_comments also writes the result to pr_comments.csv in
# that repository's folder.
test_repo = repos[0]
df_test = extract_pr_comments(repo_full_name=test_repo, max_comments=20)
print("Test shape:", df_test.shape)
# Last expression of the cell, so the notebook renders the preview.
df_test.head()

In [None]:
# Full run: extract the PR comments of every repository in the list.
# The previous hard-coded cap of 2 contradicted the "all comments" intent and
# left each repo's pr_comments.csv with at most two rows. Set this to a small
# integer for a quick dry run; None means no cap.
MAX_COMMENTS_PER_REPO = None

for full_name in repos:
    log(f"PR comments → {full_name}")
    extract_pr_comments(full_name, max_comments=MAX_COMMENTS_PER_REPO)
    

[2025-05-11T22:12:11.156039] PR comments → microsoft/ML-For-Beginners
