# GitHub Repository Scraper
## Query GitHub REST API to find repositories and store metadata

This notebook searches for GitHub repositories using various criteria and saves their metadata for further processing.

In [1]:
from datetime import datetime
import pandas as pd
import requests
import json
import time
import sys
import os

In [None]:
# Configuration
# The token is read from the environment so no credential is stored in the
# notebook. When GITHUB_TOKEN is unset the value is an empty string, which is
# falsy, so code that checks `if GITHUB_TOKEN:` will skip authentication.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Overall cap on repositories to collect across all search queries.
MAX_REPOS = 1000
# Base file name (without extension) used when saving results.
OUTPUT_FILE = "repository_list"

In [3]:
def get_queries_from_user():
    """Ask the user on stdin which search queries to run.

    Shows a two-item menu and keeps prompting until a valid choice is made.

    Returns:
        list[str]: the stripped queries typed by the user (choice 1, ended by
            an empty line after at least one query).
        0: sentinel meaning "use the default queries" (choice 2, or choice 1
            with no query entered).
        None: the user interrupted the prompt with Ctrl+C.
    """
    menu_lines = (
        "Choose an option:",
        "1. Enter custom search queries",
        "2. Use default queries",
    )
    for menu_line in menu_lines:
        print(menu_line)

    while True:
        try:
            selection = input("\nEnter your choice (1/2): ").strip()

            if selection == "2":
                return 0  # Signal to use defaults

            if selection != "1":
                print("Invalid choice. Please enter 1 or 2.")
                continue

            print("Enter your search queries (one per line).")
            print("Examples:")
            print("  language:python stars:>100")
            print("  topic:machine-learning")
            print("  web framework in:description")
            print("Press Enter twice when finished.\n")

            # Read lines until the first empty one.
            collected = []
            entry = input("Enter query: ").strip()
            while entry != "":
                collected.append(entry)
                entry = input("Enter query: ").strip()

            if not collected:
                print("No queries entered. Using default queries.")
                return 0
            return collected

        except KeyboardInterrupt:
            print("\nOperation cancelled.")
            return None

In [4]:
# Setup session
# Headers attached to every request made through the shared session: the
# requested GitHub media type and a client identifier.
headers = {
    'Accept': 'application/vnd.github.v3+json',
    'User-Agent': 'GitHub-Repo-Scraper/1.0',
}

# Authenticate only when a token has been configured.
if GITHUB_TOKEN:
    headers['Authorization'] = f'token {GITHUB_TOKEN}'

# One Session is reused by all later cells so the headers above are applied
# to every call.
session = requests.Session()
session.headers.update(headers)

- Creates a requests.Session() to maintain connection and headers
- Sets API headers that tell GitHub:
    - We want version 3 of their API
    - Our application name for identification
    - Authorization token if provided

In [5]:
# Check GitHub API rate limits
def check_rate_limit():
    """Check the remaining 'core' API quota, waiting for a reset when it is low.

    Calls the /rate_limit endpoint through the shared ``session``. When fewer
    than 10 core requests remain, sleeps until the advertised reset time (plus
    a 10 second margin) and then checks the quota one more time, so that a
    successful wait is reported as "continue" rather than "pause".

    Returns:
        bool: True when the caller may continue (enough quota remains, or the
        quota could not be determined because the check itself failed);
        False when the quota is still low after waiting.
    """
    # At most two passes: the initial check, then one re-check after waiting.
    for attempt in range(2):
        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = session.get("https://api.github.com/rate_limit", timeout=30)
            if response.status_code != 200:
                # Best effort: an unreadable quota must not block the scrape.
                print(f"Error checking rate limit: HTTP {response.status_code}")
                return True
            core = response.json()['resources']['core']
            rate_limit_remaining = core['remaining']
            reset_epoch = core['reset']
        except Exception as e:
            print(f"Error checking rate limit: {e}")
            return True

        if rate_limit_remaining >= 10:
            return True

        if attempt == 0:
            # 'reset' is compared against the current epoch time in seconds.
            wait_time = reset_epoch - time.time()
            if wait_time > 0:
                print(f"Rate limit low. Waiting {wait_time/60:.1f} minutes")
                time.sleep(wait_time + 10)

    return False

Purpose: Prevents hitting GitHub's API limits
How it works:

- Calls GitHub's rate limit endpoint
- Checks how many requests remain
- If low (<10), calculates when limit resets
- Waits automatically if needed
- Returns False if we should pause, True if we can continue

In [6]:
# Extract relevant metadata from repository data
def extract_repo_metadata(repo_data):
    """Flatten one GitHub repository object into a plain metadata dict.

    Args:
        repo_data: Repository dict such as an entry of the ``items`` list in a
            search response. Only the keys read below are used.

    Returns:
        dict: 18 fields - id, name, full_name, html_url, clone_url,
        description, language, created_at, updated_at, size,
        stargazers_count, forks_count, open_issues_count, license, topics,
        owner_login, is_fork and is_archived.

    Raises:
        KeyError: if a required key (any key accessed with ``[...]`` below)
            is missing from ``repo_data``.
    """
    # A missing or null license becomes {} so that .get('key') yields None.
    license_info = repo_data.get('license') or {}

    return {
        'id': repo_data['id'],
        'name': repo_data['name'],
        'full_name': repo_data['full_name'],
        'html_url': repo_data['html_url'],
        'clone_url': repo_data['clone_url'],
        # `or ''` also covers an explicit null, which .get(key, '') would
        # pass through as None.
        'description': repo_data.get('description') or '',
        'language': repo_data.get('language'),
        'created_at': repo_data['created_at'],
        'updated_at': repo_data['updated_at'],
        'size': repo_data['size'],
        'stargazers_count': repo_data['stargazers_count'],
        'forks_count': repo_data['forks_count'],
        'open_issues_count': repo_data['open_issues_count'],
        'license': license_info.get('key'),
        'topics': repo_data.get('topics') or [],
        'owner_login': repo_data['owner']['login'],
        'is_fork': repo_data['fork'],
        'is_archived': repo_data.get('archived', False),
    }

Purpose: Cleans and organizes raw repository data
Extracts 18 key fields:

- Basic info: id, name, full_name, description
- URLs: html_url (browser link), clone_url (git clone)
- Technical: language, size (KB), topics (tags)
- Statistics: stargazers_count, forks_count, open_issues_count
- Dates: created_at, updated_at
- Status: is_fork, is_archived
- Ownership: owner_login, license

In [7]:
# Search repositories with given query
def search_repositories(query, max_repos=100):
    """Fetch repositories matching ``query`` from the GitHub search API.

    Results are requested sorted by stars (descending), 100 per page, and each
    item is converted with ``extract_repo_metadata``. Paging stops when
    ``max_repos`` items have been collected, a page comes back empty, the
    reported ``total_count`` has been reached, or an error occurs.

    Args:
        query: Search string sent as the ``q`` parameter.
        max_repos: Maximum number of repositories to return.

    Returns:
        list[dict]: At most ``max_repos`` metadata dicts. Empty when the
        initial rate-limit check says not to proceed. Partial results
        collected before an error are still returned.
    """
    if not check_rate_limit():
        return []

    url = "https://api.github.com/search/repositories"
    # Upper bound on consecutive rate-limit responses, so a persistent 403
    # (for example a rejected token) cannot retry forever.
    max_rate_limit_retries = 5
    rate_limit_retries = 0

    repositories = []
    page = 1

    while len(repositories) < max_repos:
        # per_page stays fixed at 100: changing it between pages would shift
        # the page offsets.
        params = {
            'q': query,
            'sort': 'stars',
            'order': 'desc',
            'per_page': 100,
            'page': page
        }

        try:
            print(f"Fetching page {page} for: {query}")
            # The timeout keeps a stalled connection from hanging the notebook.
            response = session.get(url, params=params, timeout=30)

            if response.status_code == 200:
                rate_limit_retries = 0
                data = response.json()
                items = data.get('items', [])

                if not items:
                    break

                repositories.extend(extract_repo_metadata(repo) for repo in items)

                print(f"Page {page}: {len(items)} repos (total: {len(repositories)})")

                if len(repositories) >= data.get('total_count', 0):
                    break

                page += 1
                time.sleep(1)

            elif response.status_code in (403, 429):
                # GitHub signals rate limiting with 403 or 429; retry the same
                # page after a pause, but only a bounded number of times.
                rate_limit_retries += 1
                if rate_limit_retries > max_rate_limit_retries:
                    print(f"API error: {response.status_code}")
                    break
                print("Rate limit exceeded. Waiting 60 seconds...")
                time.sleep(60)
            else:
                print(f"API error: {response.status_code}")
                break

        except Exception as e:
            print(f"Error: {e}")
            break

    return repositories[:max_repos]

Purpose: Main function that fetches repositories from GitHub
Step-by-step process:

- Checks rate limits first
- Uses pagination to get multiple pages (100 repos/page)
- Sends search query to GitHub API
- Processes each repository through extract_repo_metadata()
- Handles errors and rate limits gracefully
- Returns clean repository list

In [8]:
# Save repository list
def save_repo_list(repositories, output_file, output_dir='../../data/repo_details'):
    """Write the collected repositories to disk in three formats.

    Files created inside ``output_dir`` (the directory is created if needed):
      - ``<output_file>.json``: the full list of repository dicts.
      - ``<output_file>.csv``: the same data as a table, without the index.
      - ``<output_file>_download_list.json``: only ``full_name``,
        ``clone_url`` and ``language`` for each repository.

    Args:
        repositories: List of repository dicts; each must contain the keys
            ``full_name``, ``clone_url`` and ``language``.
        output_file: Base file name without extension.
        output_dir: Target directory. Defaults to the location previously
            hard-coded in this function.

    Raises:
        KeyError: if a repository dict lacks one of the three required keys.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_path = os.path.join(output_dir, output_file)
    json_path = f'{base_path}.json'
    csv_path = f'{base_path}.csv'
    download_list_path = f'{base_path}_download_list.json'

    # Save as JSON
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(repositories, f, indent=2, ensure_ascii=False)

    # Save as CSV for easy viewing
    pd.DataFrame(repositories).to_csv(csv_path, index=False)

    # Save simple list for downloader
    download_list = [{
        'full_name': repo['full_name'],
        'clone_url': repo['clone_url'],
        'language': repo['language']
    } for repo in repositories]

    with open(download_list_path, 'w', encoding='utf-8') as f:
        json.dump(download_list, f, indent=2, ensure_ascii=False)

    # Report the path that was actually written.
    print(f"Saved {len(repositories)} repositories to {json_path}")

Purpose: Saves collected data in multiple formats
Creates 3 files:

- JSON: Full dataset for programs to read
- CSV: Spreadsheet format to view
- Simplified JSON: Only essential fields for downloading repos later

In [9]:
queries = get_queries_from_user()

if queries == 0:
    # 0 is the sentinel get_queries_from_user() returns for "use the defaults".
    print("Using default queries.")
    queries = [
        "language:python stars:>100",
        "language:java stars:>100",
        # The trailing comma matters: without it, uncommenting an entry below
        # would silently concatenate two adjacent string literals.
        "language:cpp stars:>100",
        # "language:javascript stars:>100",
        # "language:go stars:>50",
        # "language:rust stars:>50",
        # "language:php stars:>50",
    ]
elif queries is None:
    # The prompt was cancelled; run no searches instead of failing on
    # enumerate(None) below.
    queries = []

all_repositories = []
seen_ids = set()  # ids collected so far, kept across queries

for i, query in enumerate(queries):
    print(f"\nSearch {i+1}/{len(queries)}: {query}")

    # The overall budget is split evenly between the queries.
    repos = search_repositories(query, max_repos=MAX_REPOS // len(queries))

    # Remove duplicates: keep the first occurrence of each repository id.
    for repo in repos:
        if repo['id'] not in seen_ids:
            seen_ids.add(repo['id'])
            all_repositories.append(repo)

    print(f"Total unique repositories: {len(all_repositories)}")
    # Pause between queries; there is nothing to wait for after the last one.
    if i < len(queries) - 1:
        time.sleep(5)

if all_repositories:
    save_repo_list(all_repositories, OUTPUT_FILE)
    print(f"\nScraping complete! {len(all_repositories)} repositories saved.")

    # Display summary
    df = pd.DataFrame(all_repositories)
    print("\nSummary:")
    print(f"Languages found: {df['language'].nunique()}")
    print(f"Total stars: {df['stargazers_count'].sum():,}")
    print(f"Total forks: {df['forks_count'].sum():,}")
    print("\nLanguage distribution:")
    print(df['language'].value_counts().head(10))
else:
    print("No repositories collected")

Choose an option:
1. Enter custom search queries
2. Use default queries
Using default queries.

Search 1/3: language:python stars:>100
Fetching page 1 for: language:python stars:>100
Page 1: 100 repos (total: 100)
Fetching page 2 for: language:python stars:>100
Page 2: 100 repos (total: 200)
Fetching page 3 for: language:python stars:>100
Page 3: 100 repos (total: 300)
Fetching page 4 for: language:python stars:>100
Page 4: 100 repos (total: 400)
Total unique repositories: 333

Search 2/3: language:java stars:>100
Fetching page 1 for: language:java stars:>100
Page 1: 100 repos (total: 100)
Fetching page 2 for: language:java stars:>100
Page 2: 100 repos (total: 200)
Fetching page 3 for: language:java stars:>100
Page 3: 100 repos (total: 300)
Fetching page 4 for: language:java stars:>100
Page 4: 100 repos (total: 400)
Total unique repositories: 666

Search 3/3: language:cpp stars:>100
Fetching page 1 for: language:cpp stars:>100
Page 1: 100 repos (total: 100)
Fetching page 2 for: languag