# Extracție Commits

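Pentru fiecare commit vom salva: sha, URL, autor, committer, datele (authored/committed), mesajul, LoC (adăugări/ștergeri/total), numărul de fișiere modificate și SHA-urile commit-urilor părinte.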

In [1]:
# Load the helper functions from utils.ipynb into this notebook's namespace
%run ./utils.ipynb
# NOTE(review): looks like a leftover marker confirming the cell finished — confirm it can be removed
print("a")

a


In [2]:
from github.GithubException import GithubException
import pandas as pd

In [3]:
# Test read_repo_list(): load the repository list from shallow_data.csv
# and preview its size and first five entries
repos = read_repo_list("shallow_data.csv")
print(f"Am {len(repos)} repo-uri. Primele 5:", repos[:5])

Am 50 repo-uri. Primele 5: ['microsoft/ML-For-Beginners', 'apache/superset', 'keras-team/keras', 'scikit-learn/scikit-learn', 'pandas-dev/pandas']


In [4]:
# Test ensure_repo_folder() on the first repository in the list.
# `test_repo` and `folder` are reused by the extract_commits test cell further down.
test_repo = repos[0]
folder = ensure_repo_folder(test_repo)
print("Folder creat pentru", test_repo, "→", folder)

Folder creat pentru microsoft/ML-For-Beginners → data/repos/microsoft_ML-For-Beginners


In [5]:
def extract_commits(repo_full_name: str, dest_folder: Path, max_commits: int = None) -> pd.DataFrame:
    """Collect per-commit metadata for one repository and save it as ``commits.csv``.

    Commits are read from ``gh.get_repo(repo_full_name).get_commits()`` in the
    order that iterator yields them. ``gh`` is a client object defined outside
    this cell.

    Args:
        repo_full_name: Repository identifier handed to ``gh.get_repo`` (the
            notebook passes ``"owner/name"`` strings).
        dest_folder: Directory that receives ``commits.csv``. A plain string
            path is accepted as well. The directory is not created here.
        max_commits: Upper bound on the number of commits to read. ``None``
            (the default) reads every commit; ``0`` reads none.

    Returns:
        DataFrame with one row per commit and exactly the columns listed in
        ``columns`` below (also when no commit was read). The same frame is
        written to ``dest_folder / "commits.csv"`` without the index,
        overwriting any previous file.
    """
    # Fixed schema: keeps column order stable and guarantees that an empty
    # result still produces a frame / CSV with a proper header row.
    columns = [
        "repo_full_name",
        "sha",
        "html_url",
        "author_login",
        "author_id",
        "author_type",
        "authored_date",
        "committer_login",
        "committed_date",
        "message",
        "additions",
        "deletions",
        "total_changes",
        "files_changed_count",
        "parent_shas",
    ]

    repo = gh.get_repo(repo_full_name)
    rows = []
    for i, commit in enumerate(repo.get_commits()):
        # Compare against None explicitly: a bare truthiness test would treat
        # max_commits=0 as "no limit" and walk the entire history.
        if max_commits is not None and i >= max_commits:
            break
        stats     = commit.stats
        # `author` / `committer` may be missing on a commit, hence the
        # None fallbacks below.
        author    = commit.author
        committer = commit.committer
        rows.append({
            "repo_full_name":     repo_full_name,
            "sha":                commit.sha,
            "html_url":           commit.html_url,
            "author_login":       author.login    if author    else None,
            "author_id":          author.id       if author    else None,
            "author_type":        author.type     if author    else None,
            # Dates come from the nested git-level commit object and are
            # stored as ISO-8601 strings.
            "authored_date":      commit.commit.author.date.isoformat(),
            "committer_login":    committer.login if committer else None,
            "committed_date":     commit.commit.committer.date.isoformat(),
            "message":            commit.commit.message,
            "additions":          stats.additions,
            "deletions":          stats.deletions,
            "total_changes":      stats.total,
            "files_changed_count": len(list(commit.files)),
            # Parent SHAs are flattened into one comma-separated string.
            "parent_shas":        ",".join(p.sha for p in commit.parents)
        })
    df = pd.DataFrame(rows, columns=columns)
    # Path(...) lets callers pass either a Path or a plain string directory.
    df.to_csv(Path(dest_folder) / "commits.csv", index=False)
    return df

In [6]:
# Test extract_commits on the first 10 commits of the test repository.
# Note: the call also writes commits.csv into `folder`.
df_test = extract_commits(test_repo, folder, max_commits=10)
print("Test shape:", df_test.shape)
df_test.head()

Test shape: (10, 15)


Unnamed: 0,repo_full_name,sha,html_url,author_login,author_id,author_type,authored_date,committer_login,committed_date,message,additions,deletions,total_changes,files_changed_count,parent_shas
0,microsoft/ML-For-Beginners,09b03ccb52eb73842fd01a3f93a30f2905410bfa,https://github.com/microsoft/ML-For-Beginners/...,BethanyJep,44121227,User,2025-02-14T08:01:49+00:00,web-flow,2025-02-14T08:01:49+00:00,Merge pull request #789 from BethanyJep/transl...,40284,0,40284,2354,"90f5aee98a28a5e0f7d4e4fde9af5ce6793ae936,81ff2..."
1,microsoft/ML-For-Beginners,81ff2e6d13c3eee24a56eba28e1716b8274a08a0,https://github.com/microsoft/ML-For-Beginners/...,BethanyJep,44121227,User,2025-02-14T06:08:45+00:00,web-flow,2025-02-14T06:08:45+00:00,Merge branch 'microsoft:main' into translations,12,8,20,1,"9fc2b847b8d4a77242cbb6a432c9fd29e4d61e67,90f5a..."
2,microsoft/ML-For-Beginners,9fc2b847b8d4a77242cbb6a432c9fd29e4d61e67,https://github.com/microsoft/ML-For-Beginners/...,BethanyJep,44121227,User,2025-02-13T19:55:17+00:00,BethanyJep,2025-02-13T19:55:17+00:00,updated image translation for the various lang...,40284,0,40284,2354,9e189e28e21b1fccca193ad24f342aaa761c1bf7
3,microsoft/ML-For-Beginners,90f5aee98a28a5e0f7d4e4fde9af5ce6793ae936,https://github.com/microsoft/ML-For-Beginners/...,WirelessLife,17089114,User,2025-02-13T17:09:42+00:00,web-flow,2025-02-13T17:09:42+00:00,Merge pull request #788 from microsoft/Wireles...,1,0,1,1,"8a6ce4ec4ad22a7e86664ae7e27f93bdd8223e2d,6c3c6..."
4,microsoft/ML-For-Beginners,6c3c6218c4b7a39e78f8922f6321c0d950db2cd6,https://github.com/microsoft/ML-For-Beginners/...,WirelessLife,17089114,User,2025-02-13T17:09:02+00:00,web-flow,2025-02-13T17:09:02+00:00,Update README.md\n\nUpdating Other Courses,1,0,1,1,8a6ce4ec4ad22a7e86664ae7e27f93bdd8223e2d


In [7]:
# Full loop: extract commits for every repository in `repos`.

# Per-repository commit cap, kept at the value this cell was originally run with.
# NOTE(review): the cell is labelled as the full run but only reads 2 commits
# per repository — set to None to read the complete history; confirm intent.
MAX_COMMITS_PER_REPO = 2

for full_name in repos:
    log(f"Commits → {full_name}")
    # Loop-local name on purpose: rebinding `folder` here would leave it pointing
    # at the last repository, so re-running the earlier test cell
    # (extract_commits(test_repo, folder, ...)) would write test_repo's commits
    # into the wrong directory.
    repo_folder = ensure_repo_folder(full_name)
    try:
        extract_commits(full_name, repo_folder, max_commits=MAX_COMMITS_PER_REPO)
    except GithubException as exc:
        # One failing repository must not abort the remaining ones; record it and move on.
        log(f"Commits FAILED → {full_name}: {exc}")

[2025-05-11T13:17:59.959523] Commits → microsoft/ML-For-Beginners
[2025-05-11T13:18:13.369984] Commits → apache/superset
[2025-05-11T13:18:15.881312] Commits → keras-team/keras
[2025-05-11T13:18:18.288848] Commits → scikit-learn/scikit-learn
[2025-05-11T13:18:20.798411] Commits → pandas-dev/pandas
[2025-05-11T13:18:23.203757] Commits → jakevdp/PythonDataScienceHandbook
[2025-05-11T13:18:25.712413] Commits → apache/airflow
[2025-05-11T13:18:28.120633] Commits → streamlit/streamlit
[2025-05-11T13:18:30.491746] Commits → GokuMohandas/Made-With-ML
[2025-05-11T13:18:32.837382] Commits → gradio-app/gradio
[2025-05-11T13:18:35.249103] Commits → ray-project/ray
[2025-05-11T13:18:37.555947] Commits → explosion/spaCy
[2025-05-11T13:18:39.964338] Commits → AMAI-GmbH/AI-Expert-Roadmap
[2025-05-11T13:18:42.576479] Commits → Lightning-AI/pytorch-lightning
[2025-05-11T13:18:44.978952] Commits → microsoft/Data-Science-For-Beginners
[2025-05-11T13:18:48.190693] Commits → donnemartin/data-science-ipytho