In [25]:
import json
import os
import time
from datetime import datetime, timezone

import github3
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from github3.exceptions import ForbiddenError


In [26]:
# The GitHub token is read from the environment so the secret never lands in
# the notebook file or its saved outputs. Export GITHUB_TOKEN before starting
# Jupyter.
# NOTE(review): a personal access token used to be hardcoded on this line;
# it should be revoked on GitHub, since it has already been exposed.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
if not GITHUB_TOKEN:
    print("[config] GITHUB_TOKEN is not set; export it before running the cells below")

# ORG_LIST[i] is the owner of REPO_LIST[i]; the two lists are paired by position.
ORG_LIST = [
    'meta-llama', 'ollama', 'langchain-ai', 'langchain-ai',
    'microsoft', 'openai', 'elastic', 'milvus-io'
]
REPO_LIST = [
    'llama3', 'ollama', 'langchain', 'langgraph',
    'autogen', 'openai-cookbook', 'elasticsearch', 'pymilvus'
]
# zip() silently stops at the shorter list, so catch a mismatch right here.
assert len(ORG_LIST) == len(REPO_LIST), "ORG_LIST and REPO_LIST must have the same length"

In [27]:
# Reference timestamps (naive UTC) that bound the two collection windows.
# datetime.utcnow() is deprecated since Python 3.12; taking an aware "now" in
# UTC and dropping tzinfo yields the same naive value without the warning.
NOW = datetime.now(timezone.utc).replace(tzinfo=None)
TWO_MONTHS_AGO = NOW - relativedelta(months=2)
TWO_YEARS_AGO  = NOW - relativedelta(years=2)

print("Date two months ago:", TWO_MONTHS_AGO)
print("Date two years ago:", TWO_YEARS_AGO)

Date two months ago: 2025-02-24 05:57:00.821963
Date two years ago: 2023-04-24 05:57:00.821963


In [28]:
# Login to GitHub
gh = github3.login(token=GITHUB_TOKEN)
# github3.login() hands back None instead of raising when it is given no
# credentials; fail here with a clear message rather than with an
# AttributeError on the first API call.
if gh is None:
    raise RuntimeError("GitHub login failed: GITHUB_TOKEN is empty or missing")


In [29]:
# ── RATE-LIMIT HELPER ──────────────────────────────────────────────────
def wait_for_search_quota(client=None):
    """
    Block until the GitHub Search API has quota left.

    Reads the ``search`` bucket from ``client.rate_limit()`` and, when fewer
    than two calls remain, sleeps until five seconds past the reported reset
    time. Both dict payloads (``{'resources': {'search': {...}}}``) and
    attribute-style objects (``rl.search.remaining``) are accepted.

    Args:
        client: object exposing ``rate_limit()``. Defaults to the
            module-level ``gh`` session, so existing zero-argument calls
            behave as before.

    Returns:
        None
    """
    if client is None:
        client = gh

    rl = client.rate_limit()
    if isinstance(rl, dict) and 'resources' in rl:
        search_limits = rl['resources']['search']
    else:
        # not a dict payload: fall back to attribute access
        search_limits = rl.search

    if isinstance(search_limits, dict):
        remaining, reset_ts = search_limits['remaining'], search_limits['reset']
    else:
        remaining, reset_ts = search_limits.remaining, search_limits.reset

    if remaining >= 2:
        return

    # +5s of slack so we do not wake up just before the quota actually resets
    to_sleep = reset_ts - time.time() + 5
    if to_sleep > 0:
        # datetime.utcfromtimestamp() is deprecated since Python 3.12; this
        # produces the same naive-UTC value, so the log line is unchanged.
        reset_at = datetime.fromtimestamp(reset_ts, timezone.utc).replace(tzinfo=None)
        print(f"[rate-limit] sleeping {to_sleep:.0f}s until reset at {reset_at}")
        time.sleep(to_sleep)

# ── FETCH ONE WINDOW ──────────────────────────────────────────────────
def fetch_window(org, repo, start_dt, end_dt, out_file):
    """
    Export the issues of ``org/repo`` created between ``start_dt`` and
    ``end_dt`` to ``out_file`` as JSON Lines (one JSON object per line).

    The range is sliced into one-month segments and a
    ``created:YYYY-MM-DD..YYYY-MM-DD`` search is run for each segment.
    The queries are day-granular and GitHub treats both ends of the range as
    inclusive, so consecutive segments share their boundary day. Issues are
    therefore tracked by number and written only once; the same bookkeeping
    keeps a segment that is retried after a ForbiddenError from re-writing
    rows it already emitted.

    Args:
        org: repository owner (user or organisation login).
        repo: repository name.
        start_dt: datetime marking the start of the range.
        end_dt: datetime marking the end of the range.
        out_file: path of the output file; any existing content is replaced.

    Note:
        GitHub's search API caps each query at 1000 results, so a segment
        with more issues than that would be silently truncated.
    """
    seen_numbers = set()

    # 'w' truncates a previous export; one handle is kept for the whole run
    # instead of reopening the file for every issue.
    with open(out_file, 'w', encoding='utf-8') as out:
        cur = start_dt
        while cur < end_dt:
            window_end = min(cur + relativedelta(months=1), end_dt)
            start_s = cur.strftime('%Y-%m-%d')
            end_s   = window_end.strftime('%Y-%m-%d')
            query = f'type:issue repo:{org}/{repo} created:{start_s}..{end_s}'

            print(f"→ {org}/{repo} issues {start_s} → {end_s}")
            wait_for_search_quota()

            try:
                for issue in gh.search_issues(query, per_page=100):
                    js = issue.as_dict()
                    number = js['number']
                    if number in seen_numbers:
                        # boundary-day overlap or a retried segment
                        continue
                    entry = {
                        'issue_number': number,
                        'issue_title' : js['title'],
                        'issue_body'  : BeautifulSoup(js.get('body_html','') or '', 'html.parser').get_text() or "No body",
                        'created_at'  : js['created_at'][:10],
                        'closed_at'   : (js['closed_at'][:10] if js['closed_at'] else None),
                        'labels'      : [lbl['name'] for lbl in js['labels']],
                        'State'       : js['state'],
                        'Author'      : js['user']['login'],
                    }
                    out.write(json.dumps(entry) + '\n')
                    seen_numbers.add(number)
            except ForbiddenError:
                # keep what we already have on disk, then retry the SAME
                # segment (cur is deliberately not advanced)
                out.flush()
                print("  ▶ hit ForbiddenError, sleeping 60s …")
                time.sleep(60)
                continue

            # make each finished segment visible on disk before moving on
            out.flush()
            cur = window_end
            time.sleep(2)  # gentle pause between windows

# ── MAIN LOOP ─────────────────────────────────────────────────────────
if __name__ == "__main__":
    # One row per export: (filename suffix, window start, "saved" log prefix).
    # Every window ends at NOW.
    exports = (
        ("2months", TWO_MONTHS_AGO, "  ↳ saved 2-month issues to "),
        ("2years",  TWO_YEARS_AGO,  "  ↳ saved 2-year   issues to "),
    )

    for org, repo in zip(ORG_LIST, REPO_LIST):
        print(f"=== Processing {org}/{repo} ===")
        for suffix, window_start, saved_prefix in exports:
            target = f"{org}_{repo}_issues_{suffix}.json"
            fetch_window(org, repo, window_start, NOW, target)
            print(f"{saved_prefix}{target}")

    print("All done!")

=== Processing meta-llama/llama3 ===
→ meta-llama/llama3 issues 2025-02-24 → 2025-03-24
→ meta-llama/llama3 issues 2025-03-24 → 2025-04-24
  ↳ saved 2-month issues to meta-llama_llama3_issues_2months.json
→ meta-llama/llama3 issues 2023-04-24 → 2023-05-24
→ meta-llama/llama3 issues 2023-05-24 → 2023-06-24
→ meta-llama/llama3 issues 2023-06-24 → 2023-07-24
→ meta-llama/llama3 issues 2023-07-24 → 2023-08-24
→ meta-llama/llama3 issues 2023-08-24 → 2023-09-24
→ meta-llama/llama3 issues 2023-09-24 → 2023-10-24
→ meta-llama/llama3 issues 2023-10-24 → 2023-11-24
→ meta-llama/llama3 issues 2023-11-24 → 2023-12-24
→ meta-llama/llama3 issues 2023-12-24 → 2024-01-24
→ meta-llama/llama3 issues 2024-01-24 → 2024-02-24
→ meta-llama/llama3 issues 2024-02-24 → 2024-03-24
→ meta-llama/llama3 issues 2024-03-24 → 2024-04-24
→ meta-llama/llama3 issues 2024-04-24 → 2024-05-24
→ meta-llama/llama3 issues 2024-05-24 → 2024-06-24
→ meta-llama/llama3 issues 2024-06-24 → 2024-07-24
→ meta-llama/llama3 issues 202