In [None]:
%pip install PyGithub python-dotenv pandas tqdm aiohttp asyncio bs4

In [None]:
from os import getenv
from dotenv import load_dotenv
from github import Github, Commit, Repository
import pandas as pd
import aiohttp
import asyncio
from datetime import datetime, timezone
import pytz
from random import randint
from dataclasses import dataclass
import pickle
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [None]:
# Load settings from a .env file; override=True lets values from that file
# replace variables that are already set in the process environment.
load_dotenv(override=True)
GITHUB_TOKEN = getenv('GITHUB_TOKEN')

# Notebook-wide aiohttp session, closed again by close_session().
# Note that get_diff() opens its own session and does not use this one.
session = aiohttp.ClientSession()

# process_commit() keeps only commits authored strictly before this instant (UTC).
cutoff_date = datetime(year=2024, month=9, day=19, tzinfo=pytz.UTC)

# PyGithub client authenticated with the token above, 100 items per page.
g = Github(GITHUB_TOKEN, per_page=100)

In [None]:
def get_content(repo: "Repository.Repository", path: str):
    """Return the UTF-8 decoded text of the file at ``path`` in ``repo``.

    Args:
        repo: Object exposing ``get_contents(path)`` and ``full_name``
            (a PyGithub repository in this notebook). The annotation is a
            forward-reference string because the name ``Repository`` imported
            from ``github`` is the module that contains the class.
        path: Path of the file inside the repository.

    Returns:
        The file content decoded as UTF-8.

    Raises:
        Exception: Any error raised while fetching or decoding is reported on
            stdout and then re-raised unchanged.

    NOTE(review): presumably a ``path`` that names a directory makes
    ``get_contents`` return a list, which would surface here as an
    AttributeError - confirm against the PyGithub documentation.
    """
    try:
        return repo.get_contents(path).decoded_content.decode('utf-8')
    except Exception as e:
        print(f"Error getting {path}'s content from {repo.full_name}: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise

In [None]:
@dataclass
class RawData:
    """One collected commit, as stored in ``raw_data`` and exported to pickle/CSV.

    ``timestamp`` is annotated ``str`` because process_commit() stores the
    author date already rendered with ``isoformat()``.
    """
    # "<owner login>/<repository name>" of the repository the commit belongs to.
    full_path: str
    # Author date of the commit as an ISO-8601 string.
    timestamp: str
    # Commit SHA; also used as the key in the raw_data dictionary.
    sha: str
    # Commit message.
    message: str
    # Text returned by get_diff() for this commit.
    diff: str

In [None]:
async def close_session():
    """Close the aiohttp session stored in the module-level ``session`` variable."""
    pending_close = session.close()
    await pending_close

In [None]:
def _rate_limit_reset_time(response_headers):
    """Return the X-RateLimit-Reset header as an aware UTC datetime, or None if it is missing or unusable."""
    raw_reset = response_headers.get('X-RateLimit-Reset')
    try:
        return datetime.fromtimestamp(int(raw_reset), tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return None


async def get_diff(repo, sha: str, retries: int=6):
    """Download the ``.diff`` text of one commit, retrying on failures.

    Args:
        repo: Object whose ``full_name`` ("owner/name") identifies the repository.
        sha: SHA of the commit whose diff is requested.
        retries: Maximum number of HTTP attempts.

    Returns:
        The response body of the first attempt answered with HTTP 200.
        If every attempt fails, the literal string
        "Max retries exceeded. Please try again later." is returned instead,
        so callers that need to detect failure must compare against that text.

    Retry policy per attempt:
        * every attempt is preceded by a random 9-18 second pause;
        * 403 with a usable X-RateLimit-Reset header: wait until that time;
        * 429: wait 60 seconds;
        * any other status, a 403 without a usable reset header, a client
          error or a timeout: exponential backoff starting at 2 seconds.
    """
    diff_url = f'https://github.com/{repo.full_name}/commit/{sha}.diff'
    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3.diff'
    }

    backoff = 2

    # A dedicated session per call; the local name shadows the module-level `session`.
    async with aiohttp.ClientSession() as session:
        for attempt in range(retries):
            await asyncio.sleep(randint(9, 18))

            # Seconds to wait before the next attempt; None selects exponential backoff.
            wait_time = None
            try:
                async with session.get(diff_url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    remaining_requests = response.headers.get('X-RateLimit-Remaining')

                    if response.status == 200:
                        return await response.text()

                    if response.status == 403:  # Rate limit likely hit
                        reset_time_utc = _rate_limit_reset_time(response.headers)
                        if reset_time_utc is None:
                            # Without a reset time there is nothing to wait for; previously
                            # int(None) raised TypeError here and aborted the retry loop.
                            print(f"Attempt {attempt + 1} | Error 403 without a usable X-RateLimit-Reset header | Remaining: {remaining_requests}")
                        else:
                            sao_paulo_tz = pytz.timezone('America/Sao_Paulo')
                            reset_time_sao_paulo = reset_time_utc.astimezone(sao_paulo_tz)
                            print(f"Rate limit hit: {remaining_requests} remaining, resetting at {reset_time_sao_paulo.isoformat()}")
                            wait_time = max(1, (reset_time_utc - datetime.now(timezone.utc)).total_seconds() + 1)
                    elif response.status == 429:  # Too many requests, wait 60 seconds
                        print(f"Rate limit hit, waiting 60 seconds (attempt {attempt + 1})")
                        wait_time = 60
                    else:
                        print(f"Attempt {attempt + 1} | Error {response.status} | Remaining: {remaining_requests}")
            except aiohttp.ClientError as e:
                # ClientError is the base class of ClientConnectorError, so disconnects and
                # payload errors are retried as well instead of escaping the loop.
                print(f"Attempt {attempt + 1} | Connection error: {e}")
            except asyncio.TimeoutError:
                print(f"Attempt {attempt + 1} | Request timed out.")

            # Sleep only after the response has been released.
            if wait_time is None:
                wait_time = backoff
                backoff *= 2
            await asyncio.sleep(wait_time)

    return "Max retries exceeded. Please try again later."

In [None]:
async def process_commit(
    repo: "Repository.Repository",
    commit: "Commit.Commit",
    cutoff_date: datetime,
) -> "tuple[str, RawData] | None":
    """Build a RawData record for ``commit`` if it was authored before ``cutoff_date``.

    Args:
        repo: Repository the commit belongs to; ``owner.login`` and ``name``
            are read from it and it is passed on to get_diff().
        commit: Commit object exposing ``sha`` and ``commit.author.date`` /
            ``commit.message``.
        cutoff_date: Only commits with an author date strictly earlier than
            this are processed.

    Returns:
        ``(sha, RawData)`` for a commit older than the cutoff, otherwise None.
        The diff is whatever get_diff() returns for the commit.
    """
    authored_at = commit.commit.author.date
    if authored_at.tzinfo is None and cutoff_date.tzinfo is not None:
        # Comparing a naive datetime with an aware one raises TypeError.
        # NOTE(review): the naive value is assumed to be UTC - confirm for the
        # installed PyGithub version.
        authored_at = authored_at.replace(tzinfo=timezone.utc)

    if not authored_at < cutoff_date:
        return None

    diff = await get_diff(repo, commit.sha)
    message = commit.commit.message
    timestamp = authored_at.isoformat()
    sha = commit.sha
    full_path = f'{repo.owner.login}/{repo.name}'
    rawDataObj = RawData(full_path, timestamp, sha, message, diff)
    return sha, rawDataObj

In [None]:
# Commented-out alternative: a hard-coded list of (owner, repository) pairs.
# repos = [('spring-guides', 'gs-accessing-data-jpa'), ('Azure-Samples', 'java-native-telemetry'),
#         ('aws-samples', 'amazon-ivs-player-web-sample'),
#         ('aws-samples', 'aws-marketplace-serverless-saas-integration')]

# Read the sample list (the first line of the file is skipped), keep the first
# 14 rows, then discard rows that have no html_url.
repos = (
    pd.read_csv('code_samples.csv', skiprows=1)
    .head(14)
    .dropna(subset=['html_url'])
)

# Collected results, keyed by commit SHA.
raw_data = dict()

In [None]:
# Queue one process_commit() coroutine per commit of every listed repository.
# The coroutines are deliberately NOT wrapped in asyncio.create_task(): a task
# starts running as soon as the event loop gets control, which would launch
# every download at once and bypass the semaphore applied later by
# gather_with_concurrency(). Coroutine objects can be awaited only once, so
# re-run this cell before re-running the cells that consume `tasks`.
tasks = []
for _, repo_row in tqdm(repos.iterrows(), total=len(repos), desc="Processing repositories"):
    # html_url ends in ".../<owner>/<name>"; strip a trailing slash so the
    # second-to-last segment is always the owner.
    repo_owner = repo_row['html_url'].rstrip('/').split('/')[-2]
    # Github.get_repo() takes "owner/name", so user-owned repositories resolve
    # too and no separate organization lookup is needed.
    repo_obj = g.get_repo(f"{repo_owner}/{repo_row['name']}")
    for commit in repo_obj.get_commits():
        tasks.append(process_commit(repo_obj, commit, cutoff_date))

In [None]:
async def gather_with_concurrency(n, *coros):
    """Wrap awaitables so that at most ``n`` of them are awaited at the same time.

    Nothing is awaited here. The function returns a list with one new
    coroutine object per input, in input order. Awaiting a wrapper acquires a
    semaphore shared by all wrappers, awaits the wrapped awaitable and returns
    its result; if that raises an ``Exception`` the error is printed and the
    wrapper returns None.

    Args:
        n: Initial value of the shared semaphore.
        *coros: Awaitables to wrap.

    Returns:
        List of wrapper coroutines, not yet started.
    """
    limiter = asyncio.Semaphore(n)
    print(f"Semaphore: {limiter}")

    async def _throttled(job):
        # Hold one semaphore slot for as long as the wrapped job is running.
        async with limiter:
            try:
                outcome = await job
            except Exception as err:
                print(f"Task failed with error: {err}")
                outcome = None
            return outcome

    return list(map(_throttled, coros))

In [None]:
# Wrap every queued job so that the wrappers share a semaphore of size 8.
# gather_with_concurrency() only builds the wrapper coroutines; their results
# are collected in a later cell.
limited_tasks = await gather_with_concurrency(8, *tasks)

In [None]:
# progress bar
for future in tqdm(asyncio.as_completed(limited_tasks), total=len(tasks), desc="Processing commits"):
    result = await future
    if result:
        sha, rawDataObj = result
        raw_data[sha] = rawDataObj

await close_session()

In [None]:
# Persist the collected records. The context manager closes the file even if
# pickle.dump() raises part-way through.
with open('raw_data.pkl', 'wb') as p:
    pickle.dump(raw_data, p)

In [None]:
# One row per collected commit, one column per RawData attribute.
df = pd.DataFrame(data=[record.__dict__ for record in raw_data.values()])
# Write the table without the index column.
df.to_csv(path_or_buf='raw_data.csv', index=False)
df