In [12]:
import getpass # To read the GitHub token without echoing it on screen
import json # To serialize API responses to JSON files
import logging # For logging execution messages
import os # To handle file paths
import time # To introduce delays between requests when getting rate-limited

import pandas as pd # For data manipulation, specifically to transform responses JSON data to DataFrame
import requests # To make HTTP requests

# Configure logging messages format: every record is rendered as "<LEVEL>: <message>"
logging.basicConfig(format='%(levelname)s: %(message)s')
# Set the root logger threshold to INFO so the logging.info(...) calls used throughout the notebook are emitted
logging.getLogger().setLevel(logging.INFO)

BASE_URL = "https://api.github.com/" # Base URL for the GitHub API; endpoint paths are appended to it directly, so the trailing slash is required

# Optional: set a GitHub API token

In [13]:
# Prompt for the token with getpass instead of input() so the secret is not echoed on screen.
# An empty (or whitespace-only) answer is normalised to None, which means "run unauthenticated".
GITHUB_TOKEN = getpass.getpass("Enter your GitHub token: ").strip() or None

if GITHUB_TOKEN is None:
    logging.info("GITHUB_TOKEN was not set. Some endpoints may not work, others may return limited data and the rate limit is smaller.")
else:
    logging.info("GITHUB_TOKEN is set.")


INFO: GITHUB_TOKEN is set.


# Utility functions

Here I define some utility functions.

In [14]:
# Function to add authentication header if GITHUB_TOKEN was provided in the cell above
def add_auth_header_if_set(headers=None):
    """Return the request headers, adding a Bearer token when one is configured.

    Args:
        headers: Optional dict of headers to extend. When given, this same
            dict object is updated in place and returned; when omitted, a
            new dict is created.

    Returns:
        dict: The headers, with an ``Authorization: Bearer <token>`` entry
        if the module-level ``GITHUB_TOKEN`` is truthy.
    """
    request_headers = {} if headers is None else headers
    if not GITHUB_TOKEN:
        # No token configured: leave the headers untouched
        return request_headers
    request_headers['Authorization'] = f'Bearer {GITHUB_TOKEN}'
    return request_headers

# Function to save JSON response data to a JSON file
def save_json_to_file(data, filename, output_dir='output'):
    """Save JSON data to a file.

    Args:
        data: JSON-serializable object (typically the parsed API response).
        filename: Name of the file to create inside ``output_dir``.
        output_dir: Directory to write into; created (with parents) if missing.

    Raises:
        TypeError: If ``data`` is not JSON-serializable. Serialization happens
            before the file is opened, so an existing file is left untouched.
    """
    # Serialize first: a failure here must not truncate a file from a previous run
    payload = json.dumps(data, indent=4)
    # exist_ok avoids the check-then-create race and is a no-op when the directory already exists
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)
    logging.info(f"Data saved to {file_path}")

# Function to save JSON response data to a CSV file using pandas function 'json_normalize'
def save_json_to_csv(data, filename, output_dir='output'):
    """Save JSON data to a CSV file.

    Nested objects are flattened with ``pandas.json_normalize``; the CSV is
    written without the DataFrame index.

    Args:
        data: Dict or list of dicts (typically the parsed API response).
        filename: Name of the CSV file to create inside ``output_dir``.
        output_dir: Directory to write into; created (with parents) if missing.
    """
    # Normalize before touching the filesystem so bad input fails early
    df = pd.json_normalize(data)
    # exist_ok avoids the check-then-create race and is a no-op when the directory already exists
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, filename)
    df.to_csv(file_path, index=False)
    logging.info(f"Data saved to {file_path}")


# Function used to filter nested fields from the response dictionary got from the GitHub API
def extract_nested_fields(item, fields):
    """Build a copy of ``item`` that keeps only the requested (possibly nested) fields.

    Args:
        item: Dictionary to pick values from.
        fields: List of field specs. A string keeps that key as-is; a dict
            such as ``{"commit": ["message", {"author": ["name"]}]}`` descends
            into the sub-dictionary stored under each key and applies the
            nested spec to it.

    Returns:
        dict: Only the requested keys that exist in ``item``. A nested spec is
        skipped when its key is missing or its value is not a dictionary.
        Specs that are neither strings nor dicts are ignored.
    """
    picked = {}
    for spec in fields:
        if isinstance(spec, str):
            # Plain key: copy the value over when present
            if spec in item:
                picked[spec] = item[spec]
            continue
        if not isinstance(spec, dict):
            continue
        # Nested spec: recurse into each sub-dictionary that is present
        for name, nested_spec in spec.items():
            if name not in item:
                continue
            child = item[name]
            if isinstance(child, dict):
                picked[name] = extract_nested_fields(child, nested_spec)
    return picked



# Rate limit check

In [15]:
def get_rate_limit():
    """Fetch the rate limit status for the current (authenticated or anonymous) caller.

    Returns:
        dict: The parsed JSON body on HTTP 200. On any other status the error
        is logged and an empty dict is returned, so callers can keep chaining
        ``.get(...)`` lookups without special-casing the failure.
    """
    endpoint_path = "rate_limit"
    # Pass the base headers in: calling add_auth_header_if_set() with no argument
    # would start from an empty dict and drop Accept / X-GitHub-Api-Version.
    headers = add_auth_header_if_set({
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    })
    response = requests.get(
        BASE_URL + endpoint_path,
        headers=headers
    )
    if response.status_code == 200:
        return response.json()
    logging.error(f"Failed to fetch rate limit: {response.status_code} - {response.text}")
    return {}
    

In [16]:
rate_limit_json = get_rate_limit()

print("Rate limit JSON response:")
print(rate_limit_json)

# `or {}` keeps the lookups below working when get_rate_limit() signals a failure with an empty, non-dict value
core_rate_limit = (rate_limit_json or {}).get('resources', {}).get('core', {})

logging.info(f"Rate limit information for the current {'authenticated' if GITHUB_TOKEN else 'unauthenticated'} user:")
logging.info(f"Limit: {core_rate_limit.get('limit', 'N/A')}")
logging.info(f"Remaining: {core_rate_limit.get('remaining', 'N/A')}")
logging.info(f"Used: {core_rate_limit.get('used', 'N/A')}")

reset_epoch = core_rate_limit.get('reset')
if reset_epoch is None:
    # Nothing to convert: feeding a placeholder such as 'N/A' to pd.to_datetime(unit='s') raises
    logging.info("Reset: N/A")
else:
    # 'reset' is converted as epoch seconds, in UTC
    reset_datetime = pd.to_datetime(reset_epoch, unit='s', utc=True)
    # Clamp at zero so a reset moment that has just passed is not shown as a negative duration
    time_to_reset = max(reset_datetime - pd.Timestamp.now(tz='UTC'), pd.Timedelta(0))
    time_to_reset_comps = time_to_reset.components
    logging.info(f"Reset: {reset_datetime} (in {time_to_reset_comps.minutes}m {time_to_reset_comps.seconds}s)")


INFO: Rate limit information for the current authenticated user:
INFO: Limit: 5000
INFO: Remaining: 4988
INFO: Used: 12
INFO: Reset: 2025-06-24 03:04:00+00:00 (in 29m 5s)


Rate limit JSON response:
{'resources': {'core': {'limit': 5000, 'used': 12, 'remaining': 4988, 'reset': 1750734240}, 'search': {'limit': 30, 'used': 0, 'remaining': 30, 'reset': 1750732554}, 'graphql': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1750736094}, 'integration_manifest': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1750736094}, 'source_import': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1750732554}, 'code_scanning_upload': {'limit': 5000, 'used': 12, 'remaining': 4988, 'reset': 1750734240}, 'code_scanning_autofix': {'limit': 10, 'used': 0, 'remaining': 10, 'reset': 1750732554}, 'actions_runner_registration': {'limit': 10000, 'used': 0, 'remaining': 10000, 'reset': 1750736094}, 'scim': {'limit': 15000, 'used': 0, 'remaining': 15000, 'reset': 1750736094}, 'dependency_snapshots': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1750732554}, 'dependency_sbom': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1750732554}, 'audit_log': {'lim

# Get repository list

Official documentation: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-organization-repositories

In [17]:
def get_org_repos(
        org, 
        filter_fields=False, fields=["name","description","homepage","stargazers_count","watchers_count","created_at","updated_at","pushed_at","size","git_url","html_url"]
    ):
    """Fetch repositories for a given organization.

    A single request is made, so only the page the API returns for that
    request is included in the result.

    Args:
        org: Organization login, e.g. ``"freeCodeCamp"``.
        filter_fields: When True, keep only the keys listed in ``fields``.
        fields: Top-level keys to keep when ``filter_fields`` is True. The
            default list is only read, never modified.

    Returns:
        list: Repository dicts on HTTP 200 (filtered if requested); an empty
        list on any other status, after logging the error.
    """
    endpoint_path = "orgs/{org}/repos"
    # Pass the base headers in: calling add_auth_header_if_set() with no argument
    # would start from an empty dict and drop Accept / X-GitHub-Api-Version.
    headers = add_auth_header_if_set({
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    })
    response = requests.get(
        BASE_URL + endpoint_path.format(org=org),
        headers=headers
    )
    if response.status_code == 200:
        repos = response.json()
        if filter_fields:
            # Keep only the requested keys of each repository
            return [{key: item[key] for key in fields if key in item} for item in repos]
        # Return every repository exactly as the API sent it
        return repos

    # Error bodies are not guaranteed to be a JSON object (e.g. an HTML page from a proxy)
    try:
        message = response.json().get('message', 'No message')
    except (ValueError, AttributeError):
        message = response.text or 'No message'
    logging.error(f"Failed to fetch repositories for '{org}': {response.status_code} - {message}")
    return []


In [18]:
# Fetch the organization's repositories, keeping only get_org_repos' default subset of fields
response_json = get_org_repos("freecodecamp", filter_fields=True)

save_json_to_file(response_json, "repos.json")  # Save the response to a JSON file in the default 'output' directory
save_json_to_csv(response_json, "repos.csv")  # Save the response to a CSV file in the default 'output' directory
df = pd.json_normalize(response_json)  # Convert JSON to DataFrame
df  # Last expression of the cell, so the DataFrame is rendered as the cell output

INFO: Data saved to output\repos.json
INFO: Data saved to output\repos.csv


Unnamed: 0,name,description,homepage,stargazers_count,watchers_count,created_at,updated_at,pushed_at,size,git_url,html_url
0,devdocs,API Documentation Browser,https://devdocs.io,36421,36421,2013-10-24T18:16:07Z,2025-06-24T02:33:46Z,2025-06-19T21:03:56Z,32055,git://github.com/freeCodeCamp/devdocs.git,https://github.com/freeCodeCamp/devdocs
1,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,https://contribute.freecodecamp.org,421448,421448,2014-12-24T17:49:19Z,2025-06-24T02:30:32Z,2025-06-23T21:13:24Z,503149,git://github.com/freeCodeCamp/freeCodeCamp.git,https://github.com/freeCodeCamp/freeCodeCamp
2,assets,A collection of https://www.freeCodeCamp.org l...,,103,103,2014-12-30T02:41:08Z,2025-04-22T16:43:19Z,2020-02-20T09:37:41Z,101576,git://github.com/freeCodeCamp/assets.git,https://github.com/freeCodeCamp/assets
3,ZiplineStatusChecker,The Status Checker Zipline starting MEAN.js ap...,,7,7,2015-03-22T02:15:21Z,2025-02-27T07:38:05Z,2018-06-21T09:07:20Z,1801,git://github.com/freeCodeCamp/ZiplineStatusChe...,https://github.com/freeCodeCamp/ZiplineStatusC...
4,fcc-vagrant,Vagrant development environment providing free...,,10,10,2015-06-05T03:45:02Z,2025-02-27T07:38:08Z,2016-01-02T08:12:10Z,197,git://github.com/freeCodeCamp/fcc-vagrant.git,https://github.com/freeCodeCamp/fcc-vagrant
5,JulyGameDev,Read more about this here: http://blog.freecod...,,19,19,2015-07-16T23:23:51Z,2025-02-27T07:38:11Z,2015-07-20T14:29:06Z,1700,git://github.com/freeCodeCamp/JulyGameDev.git,https://github.com/freeCodeCamp/JulyGameDev
6,1Aug2015GameDev,Cameron working on the stream after the Saturd...,,14,14,2015-08-01T20:45:14Z,2025-02-27T07:38:11Z,2015-08-03T06:20:36Z,1124,git://github.com/freeCodeCamp/1Aug2015GameDev.git,https://github.com/freeCodeCamp/1Aug2015GameDev
7,massification,An emailing service built on Amazon SES and Node,,42,42,2015-10-22T21:05:36Z,2025-02-27T07:38:15Z,2019-10-28T07:47:22Z,7,git://github.com/freeCodeCamp/massification.git,https://github.com/freeCodeCamp/massification
8,wiki,freeCodeCamp's deprecated wiki articles,http://www.freecodecamp.com/wiki,281,281,2015-11-01T04:14:19Z,2025-02-28T03:02:47Z,2021-04-01T16:29:50Z,8751,git://github.com/freeCodeCamp/wiki.git,https://github.com/freeCodeCamp/wiki
9,fcc-expressworks,,,12,12,2015-11-09T01:32:50Z,2025-02-27T07:38:16Z,2017-01-16T06:25:43Z,3,git://github.com/freeCodeCamp/fcc-expressworks...,https://github.com/freeCodeCamp/fcc-expressworks


# Get repository content

Official documentation: https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content

In [19]:
def get_repo_contents(owner, repo, path="", filter_fields=False, fields=["name","path","type","size","html_url","download_url"]):
    """Fetch contents of a repository.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        path: Path inside the repository; the empty string is the root.
        filter_fields: When True, keep only the keys listed in ``fields``.
        fields: Top-level keys to keep when ``filter_fields`` is True. The
            default list is only read, never modified.

    Returns:
        The parsed JSON body on HTTP 200. When the body is a list (a
        directory listing) and filtering is on, each entry is filtered; when
        the body is a single object, that one dict is filtered. On any other
        status the error is logged and an empty list is returned.
    """
    endpoint_path = "repos/{owner}/{repo}/contents/{path}"
    headers = {
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
    }
    headers = add_auth_header_if_set(headers)
    response = requests.get(
        BASE_URL + endpoint_path.format(owner=owner, repo=repo, path=path),
        headers=headers
    )
    if response.status_code == 200:
        contents = response.json()
        if not filter_fields:
            return contents
        if isinstance(contents, dict):
            # Single object (not a listing): iterating it would walk its keys
            # (strings) and fail, so filter the object itself.
            return {key: contents[key] for key in fields if key in contents}
        return [{key: item[key] for key in fields if key in item} for item in contents]

    # Error bodies are not guaranteed to be a JSON object (e.g. an HTML page from a proxy)
    try:
        message = response.json().get('message', 'No message')
    except (ValueError, AttributeError):
        message = response.text or 'No message'
    logging.error(f"Failed to fetch contents for '{owner}/{repo}': {response.status_code} - {message}")
    return []


In [20]:
# response_json = get_repo_contents("freeCodeCamp", "freeCodeCamp", "api/src", filter_fields=True)  # Example usage, replace with your owner and repo

# save_json_to_file(response_json, "repo_contents.json")
# save_json_to_csv(response_json, "repo_contents.csv")
# df = pd.json_normalize(response_json)

# Get commits list

Official documentation: https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#list-commits

In [21]:
def get_commits(
        owner, repo, 
        path=None, author=None,since=None, until=None, 
        per_page=None, page=None, 
        filter_fields=False, fields=["html_url", {"commit": ["message", {"author": ["name", "email", "date"]}, {"verification": ["verified"]}]}]
    ):
    """Fetch one page of commits for a repository.

    Falsy optional arguments (``None``, ``""``, ``0``) are not sent, so the
    API applies its own defaults for them.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        path: Only commits touching this path.
        author: Only commits by this author (string sent as-is to the API).
        since: Lower date bound, a string parseable as a date (ISO 8601).
        until: Upper date bound, a string parseable as a date (ISO 8601).
        per_page: Page size; values above 100 are reduced to 100 with a warning.
        page: 1-based page number.
        filter_fields: When True, reduce each commit to ``fields`` using
            ``extract_nested_fields``.
        fields: Field spec understood by ``extract_nested_fields``. The
            default list is only read, never modified.

    Returns:
        list: The commits of the requested page (filtered if requested).

    Raises:
        ValueError: If an argument fails validation, or if the API answers
            with a non-200 status (message format:
            ``"Failed to fetch commits: <status> - <message>"``).
    """
    endpoint_path = "repos/{owner}/{repo}/commits"

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    headers = add_auth_header_if_set(headers)
    
    query_params = {}
    if path:
        if not isinstance(path, str):
            raise ValueError("Path must be a string.")
        query_params['path'] = path
    if author:
        if not isinstance(author, str):
            raise ValueError("Author must be a string.")
        query_params['author'] = author
    if since:
        if not isinstance(since, str) or pd.to_datetime(since, errors='coerce') is pd.NaT:
            raise ValueError("Since must be a string in ISO 8601 format (e.g., '2023-01-01T00:00:00Z').") # GitHub API expects dates in ISO 8601 format
        query_params['since'] = since
    if until:
        if not isinstance(until, str) or pd.to_datetime(until, errors='coerce') is pd.NaT:
            raise ValueError("Until must be a string in ISO 8601 format (e.g., '2023-01-01T00:00:00Z').") # GitHub API expects dates in ISO 8601 format
        query_params['until'] = until
    if per_page:
        if not isinstance(per_page, int) or per_page <= 0:
            raise ValueError("Per page must be a positive integer.")
        if per_page > 100:
            logging.warning("The maximum allowed per_page is 100, value was updated to 100.") # GitHub API allows a maximum of 100 items per page
            per_page = 100
        query_params['per_page'] = per_page
    if page:
        if not isinstance(page, int) or page <= 0:
            raise ValueError("Page must be a positive integer.")
        query_params['page'] = page
    
    # Let requests build the query string: it URL-encodes the values, whereas
    # joining "key=value" by hand corrupts values containing '+', '&', '#' or spaces.
    response = requests.get(
        BASE_URL + endpoint_path.format(owner=owner, repo=repo),
        headers=headers,
        params=query_params
    )
    
    if response.status_code == 200:
        commits = response.json()
        if filter_fields:
            # Extract only specified fields using the extract_nested_fields function defined at the beginning of the notebook
            return [extract_nested_fields(item, fields) for item in commits]
        return commits

    # Error bodies are not guaranteed to be a JSON object (e.g. an HTML page from a proxy)
    try:
        message = response.json().get('message', 'No message')
    except (ValueError, AttributeError):
        message = response.text or 'No message'
    raise ValueError(f"Failed to fetch commits: {response.status_code} - {message}")


In [22]:
# response_json = get_commits("freeCodeCamp", "freeCodeCamp", filter_fields=True)
# Fetch a single page (10 items) of commits touching api/src by one author within the date range,
# reduced to get_commits' default field selection
response_json = get_commits(
    per_page=10,
    page=1,
    owner = "freeCodeCamp", 
    repo = "freeCodeCamp",
    path = "api/src",
    author = "ojeytonwilliams@gmail.com",
    since = "2020-01-01T00:00:00Z",
    until = "2025-12-31T23:59:59Z",
    filter_fields=True
    )

# Persist the page in both formats (default 'output' directory), then flatten it for display
save_json_to_file(response_json, "commits.json")
save_json_to_csv(response_json, "commits.csv")
df = pd.json_normalize(response_json)
print(len(df))
df


INFO: Data saved to output\commits.json
INFO: Data saved to output\commits.csv


10


Unnamed: 0,html_url,commit.message,commit.author.name,commit.author.email,commit.author.date,commit.verification.verified
0,https://github.com/freeCodeCamp/freeCodeCamp/c...,fix(api): duplicate reply on error in /daily-c...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-06-23T13:15:26Z,True
1,https://github.com/freeCodeCamp/freeCodeCamp/c...,chore(api): add user's id to errors sent to Se...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-06-09T20:21:02Z,True
2,https://github.com/freeCodeCamp/freeCodeCamp/c...,chore(api): migrate to fastify v5 (#57576),Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-06-02T02:37:57Z,True
3,https://github.com/freeCodeCamp/freeCodeCamp/c...,fix: handle when userinfo has no email address...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-30T17:02:55Z,True
4,https://github.com/freeCodeCamp/freeCodeCamp/c...,test: inform devs when db connection not estab...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-28T15:52:11Z,True
5,https://github.com/freeCodeCamp/freeCodeCamp/c...,refactor: remove unused format rules from resp...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-27T16:16:32Z,True
6,https://github.com/freeCodeCamp/freeCodeCamp/c...,test(api): stop reporting request logs during ...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-27T16:05:25Z,True
7,https://github.com/freeCodeCamp/freeCodeCamp/c...,fix(api): handle expected Auth0 errors (#60499),Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-27T04:21:03Z,True
8,https://github.com/freeCodeCamp/freeCodeCamp/c...,fix(api): handle string challengeType (#60491),Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-23T12:56:18Z,True
9,https://github.com/freeCodeCamp/freeCodeCamp/c...,fix(api): handle users without email addresses...,Oliver Eyton-Williams,ojeytonwilliams@gmail.com,2025-05-22T09:28:56Z,True


In [23]:
def get_all_commits(
        owner, repo, 
        path=None, author=None,since=None, until=None, 
        per_page=100, max_pages=30, 
        filter_fields=False, fields=["html_url", {"commit": ["message", {"author": ["name", "email", "date"]}, {"verification": ["verified"]}]}],
        rate_limit_wait_seconds=60, max_rate_limit_retries=3
    ):
    """Fetch all commits for a repository with pagination.

    Pages are requested through ``get_commits`` starting at page 1 until a
    page comes back empty or ``max_pages`` pages have been fetched. This is
    best-effort: errors are logged and whatever was collected so far is
    returned instead of raising.

    Args:
        owner, repo, path, author, since, until, per_page, filter_fields,
            fields: Forwarded unchanged to ``get_commits``.
        max_pages: Upper bound on the number of pages requested.
        rate_limit_wait_seconds: Pause before retrying a page after an
            "API rate limit exceeded" error.
        max_rate_limit_retries: Number of consecutive rate-limit errors after
            which fetching stops.

    Returns:
        list: Commits from every successfully fetched page, in page order.
        May be incomplete if fetching stopped on an error.
    """
    all_commits = []
    page = 1

    consecutive_rate_limit_exceeded = 0  # Counter for consecutive rate limit exceeded errors

    while page <= max_pages:
        try:
            commits = get_commits(
                owner=owner,
                repo=repo,
                path=path,
                author=author,
                since=since,
                until=until,
                per_page=per_page,
                page=page,
                filter_fields=filter_fields,
                fields=fields
            )
        except Exception as e:
            logging.error(f"Error fetching commits on page {page}: {e}")
            error_text = str(e)

            # If returns "Not found" error, it might be due to an invalid owner, repo, path, author or date range
            if "Not Found" in error_text:
                logging.error("Please check your string parameters and date ranges, the API failed to found data with the given parameters. Stopping further requests.")
                break # Stop further requests if the error is "Not Found"

            if "API rate limit exceeded" in error_text:
                consecutive_rate_limit_exceeded += 1 # Increment the counter for consecutive rate limit exceeded errors
                if consecutive_rate_limit_exceeded >= max_rate_limit_retries:
                    logging.error(f"Rate limit exceeded {max_rate_limit_retries} times. Stopping further requests.")
                    logging.warning("Returning all commits fetched so far, the information may be incomplete.")
                    break

                logging.info(f"Rate limit exceeded. Waiting for {rate_limit_wait_seconds} seconds before retrying...")
                time.sleep(rate_limit_wait_seconds)
                logging.info("Retrying...") # Retry the same page (page is not incremented)
                continue

            # Any other error (validation failure, server error, network problem):
            # retrying the same page immediately would spin forever, so stop here.
            logging.warning("Returning all commits fetched so far, the information may be incomplete.")
            break
        else:
            logging.info(f"Page {page} fetched with {len(commits)} commits, total so far: {len(all_commits) + len(commits)}")

            if not commits:
                break  # No more commits to fetch

            all_commits.extend(commits)
            consecutive_rate_limit_exceeded = 0  # Reset counter if successful
            page += 1

    return all_commits

In [24]:
# Collect up to 10 pages of 100 commits each within the date range.
# The empty path is falsy, so get_commits sends no path filter.
all_commits_json = get_all_commits(
    per_page=100,
    max_pages=10,
    owner = "freeCodeCamp", 
    repo = "freeCodeCamp",
    path = "",
    since = "2020-01-01T00:00:00Z",
    until = "2025-12-31T23:59:59Z",
    filter_fields=True
    )

INFO: Page 1 fetched with 100 commits, total so far: 100
INFO: Page 2 fetched with 100 commits, total so far: 200
INFO: Page 3 fetched with 100 commits, total so far: 300
INFO: Page 4 fetched with 100 commits, total so far: 400
INFO: Page 5 fetched with 100 commits, total so far: 500
INFO: Page 6 fetched with 100 commits, total so far: 600
INFO: Page 7 fetched with 100 commits, total so far: 700
INFO: Page 8 fetched with 100 commits, total so far: 800
INFO: Page 9 fetched with 100 commits, total so far: 900
INFO: Page 10 fetched with 100 commits, total so far: 1000


In [25]:
# Persist the collected commits in both formats (default 'output' directory)
save_json_to_file(all_commits_json, "all_commits.json")
save_json_to_csv(all_commits_json, "all_commits.csv")


INFO: Data saved to output\all_commits.json
INFO: Data saved to output\all_commits.csv
