In [None]:
%pip install PyGithub python-dotenv pandas tqdm aiohttp asyncio

In [2]:
from os import getenv
from dotenv import load_dotenv
from github import Github, Commit, Repository
import pandas as pd
import aiohttp
import asyncio
from datetime import datetime, timezone
import pytz
from dataclasses import dataclass
import pickle
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [3]:
# Load variables from a local .env file; override=True lets .env values
# replace variables that are already set in the process environment.
load_dotenv(override=True)
GITHUB_TOKEN = getenv('GITHUB_TOKEN')  # None when the variable is not defined
# PyGithub client; per_page=100 asks for 100 items per page on paginated calls.
g = Github(GITHUB_TOKEN, per_page=100)
# Timezone-aware (UTC) cutoff: process_commit keeps only commits authored before it.
cutoff_date = datetime(2024, 9, 19, tzinfo=pytz.UTC)
# Shared HTTP session used by get_diff and closed by close_session.
session = aiohttp.ClientSession()

In [4]:
def get_content(repo: Repository, path: str):
    """Return the UTF-8 decoded text stored at ``path`` in ``repo``.

    Args:
        repo: Object exposing ``get_contents(path)`` and ``full_name``
            (a PyGithub repository in this notebook).
        path: Path handed to ``repo.get_contents``.

    Returns:
        The content's ``decoded_content`` bytes decoded as UTF-8.

    Raises:
        Exception: Any error raised while fetching or decoding; it is logged
            with ``print`` and then re-raised unchanged.
    """
    try:
        content_file = repo.get_contents(path)
        return content_file.decoded_content.decode('utf-8')
    except Exception as e:
        print(f"Error getting {path}'s content from {repo.full_name}: {e}")
        # A bare ``raise`` re-raises the active exception with its original traceback.
        raise

In [5]:
@dataclass
class RawData:
    """Raw record for one commit collected by this notebook.

    Attributes:
        full_path: Repository identifier in ``owner/name`` form.
        timestamp: Author date as an ISO-8601 string (``process_commit``
            stores ``date.isoformat()``, not a ``datetime`` object).
        sha: Commit SHA.
        message: Commit message.
        diff: Text returned by ``get_diff`` for the commit.
    """
    full_path: str
    timestamp: str
    sha: str
    message: str
    diff: str

In [6]:
def _rate_limit_wait(headers, attempt):
    """Return ``(seconds_to_wait, reset_time_utc_or_None)`` for a rate-limited response."""
    reset_raw = headers.get('X-RateLimit-Reset')
    if reset_raw is not None and reset_raw.isdigit():
        reset_time_utc = datetime.fromtimestamp(int(reset_raw), tz=timezone.utc)
        wait_time = max(0, (reset_time_utc - datetime.now(timezone.utc)).total_seconds() + 1)
        return wait_time, reset_time_utc

    retry_after = headers.get('Retry-After')
    if retry_after is not None and retry_after.isdigit():
        return int(retry_after), None

    # No usable header: fall back to exponential backoff on the attempt number.
    return 2 ** attempt, None


async def get_diff(repo: Repository, sha: str, retries=6):
    """Download the unified diff of commit ``sha`` from ``repo``.

    Uses the module-level ``session`` and ``GITHUB_TOKEN``.

    Args:
        repo: Object exposing ``full_name`` (``owner/name``).
        sha: Commit SHA to fetch.
        retries: Maximum number of requests made when rate limited.

    Returns:
        The diff text on HTTP 200. On failure a string is returned instead of
        raising: ``'error'`` for an unexpected status, a message starting with
        ``"Attempt N | ..."`` for a connection error or timeout, or
        ``"Max retries exceeded. Please try again later."`` once ``retries``
        rate-limited attempts have been used up.
    """
    # NOTE(review): this targets github.com rather than api.github.com; confirm
    # that the X-RateLimit-* headers are actually sent by this endpoint.
    diff_url = f'https://github.com/{repo.full_name}/commit/{sha}.diff'

    header = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3.diff'
    }

    for attempt in range(retries):
        try:
            # The try wraps the request itself so connection failures are caught too.
            async with session.get(diff_url, headers=header) as response:
                if response.status == 200:
                    return await response.text()

                # Read before branching so every path below can report it.
                remaining_requests = response.headers.get('X-RateLimit-Remaining')

                if response.status not in (403, 429):  # anything but a rate limit is final
                    print(f"Attempt {attempt} | Error {response.status} | Remaining: {remaining_requests}")
                    return 'error'

                wait_time, reset_time_utc = _rate_limit_wait(response.headers, attempt)
                if reset_time_utc is not None:
                    sao_paulo_tz = pytz.timezone('America/Sao_Paulo')
                    reset_time_sao_paulo = reset_time_utc.astimezone(sao_paulo_tz)
                    print(f"Rate limit hit: {remaining_requests} remaining, resetting at {reset_time_sao_paulo.isoformat()}")
                else:
                    print(f"Rate limit hit: {remaining_requests} remaining, retrying in {wait_time}s")
        except aiohttp.ClientError as e:
            return f"Attempt {attempt} | Connection error: {e}"
        except asyncio.TimeoutError:
            return f"Attempt {attempt} | Request timed out. Please try again later."

        # Sleep only after the response has been released so the connection is not held.
        await asyncio.sleep(wait_time)

    return "Max retries exceeded. Please try again later."

# Close the session when done
async def close_session():
    """Close the module-level aiohttp ``session`` that ``get_diff`` uses for its requests."""
    await session.close()

In [7]:
async def process_commit(repo: Repository, commit: Commit, cutoff_date: datetime) -> "tuple[str, RawData] | None":
    """Build the ``RawData`` record for ``commit`` if it was authored before ``cutoff_date``.

    Args:
        repo: Repository the commit belongs to; ``owner.login`` and ``name`` are read.
        commit: Commit object exposing ``sha`` and ``commit.author.date`` / ``commit.message``.
        cutoff_date: Timezone-aware datetime; commits authored at or after it are skipped.

    Returns:
        ``(sha, RawData)`` for commits authored before the cutoff, otherwise ``None``.
        The diff is fetched only for commits that pass the cutoff check.
    """
    git_commit = commit.commit
    authored_at = git_commit.author.date
    # A naive datetime cannot be compared with the aware cutoff (TypeError).
    # Presumably older PyGithub releases return naive UTC datetimes - treat them as UTC.
    if authored_at.tzinfo is None:
        authored_at = authored_at.replace(tzinfo=timezone.utc)

    if authored_at >= cutoff_date:
        return None

    sha = commit.sha
    diff = await get_diff(repo, sha)
    full_path = f'{repo.owner.login}/{repo.name}'
    rawDataObj = RawData(full_path, authored_at.isoformat(), sha, git_commit.message, diff)
    return sha, rawDataObj

In [8]:
repos = [('spring-guides', 'gs-accessing-data-jpa'), ('Azure-Samples', 'java-native-telemetry'),
        ('aws-samples', 'amazon-ivs-player-web-sample'), 
        ('aws-samples', 'aws-marketplace-serverless-saas-integration')]
raw_data = {}

tasks = []
for repo in repos:
    repo_obj = g.get_organization(repo[0]).get_repo(repo[1])
    commits = list(repo_obj.get_commits())
    for commit in commits:
        # Task for processing each commit
        task = asyncio.create_task(process_commit(repo_obj, commit, cutoff_date))
        tasks.append(task)

# progress bar
for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing commits"):
    result = await future
    if result:
        sha, rawDataObj = result
        raw_data[sha] = rawDataObj

await close_session()

Processing commits: 100%|██████████| 647/647 [00:01<00:00, 329.20it/s]


In [9]:
# Persist the collected records; the context manager closes the file even if
# pickling raises.
with open('raw_data.pkl', 'wb') as p:
    pickle.dump(raw_data, p)

In [10]:
# One row per collected commit, columns taken from the RawData fields;
# the frame is written to CSV and shown as the cell output.
df = pd.DataFrame(list(map(vars, raw_data.values())))
df.to_csv('raw_data.csv', index=False)
df

Unnamed: 0,full_path,timestamp,sha,message,diff
0,spring-guides/gs-accessing-data-jpa,2017-10-17T20:41:23+00:00,7b0fc9e50be27c59bc87175f36446b0e64063fbc,Upgrade to Spring Boot 1.5.8.RELEASE,diff --git a/complete/build.gradle b/complete/...
1,spring-guides/gs-accessing-data-jpa,2019-09-16T15:46:53+00:00,b40ca97cc9b213ab593dd2c430cb5ad2b63b98aa,Fixed a bad path\n\nI found a spot where I had...,diff --git a/README.adoc b/README.adoc\nindex ...
2,spring-guides/gs-accessing-data-jpa,2021-09-28T15:30:12+00:00,a4905daf9584ea0731b559be9934b20aa9b5a085,Add pre-populated Initializr link\n\nto fill i...,diff --git a/README.adoc b/README.adoc\nindex ...
3,spring-guides/gs-accessing-data-jpa,2017-09-13T18:14:24+00:00,d7170d70ebdb6ed06f821198337003a9f833dbc5,Upgrade to Spring Boot 1.5.7.RELEASE,diff --git a/complete/build.gradle b/complete/...
4,spring-guides/gs-accessing-data-jpa,2017-08-12T15:21:14+00:00,ffe59c6df096cc4d90aa278f698ed741448e9820,Added See Also links (#21)\n\nAt the bottom of...,diff --git a/README.adoc b/README.adoc\nindex ...
...,...,...,...,...,...
636,aws-samples/aws-marketplace-serverless-saas-in...,2020-07-29T09:42:06+00:00,3006ddb66fbf1b67ac323e87ca3c3d7a3800c770,docs: fix typos and wrong file names,diff --git a/README.md b/README.md\nindex 5e4b...
637,aws-samples/aws-marketplace-serverless-saas-in...,2020-10-27T15:44:48+00:00,fb78a52df8d92f6e57f82d0ad22938954f25edb3,Typo,diff --git a/template.yaml b/template.yaml\nin...
638,aws-samples/aws-marketplace-serverless-saas-in...,2020-10-09T09:40:13+00:00,f0b9ba4b5eb2dd64144d2cd6f91962b656019b65,Merge pull request #1 from lulzneko/fix/explic...,diff --git a/template.yaml b/template.yaml\nin...
639,aws-samples/aws-marketplace-serverless-saas-in...,2020-07-20T14:53:10+00:00,408f3439211f6ff4053b65f26d4b34524cdedb38,Sample solution,diff --git a/.gitignore b/.gitignore\nnew file...
