# Introduction 

GitHub API - Data Extraction Notebook

Author: Gabriela Rivera Plascencia

Purpose: Extract public GitHub repository data, commits, and file contents using the REST API.

In [10]:
## Import Libraries

import requests
import time
import pandas as pd

## Authentication

In [None]:
# Replace with your actual GitHub token
TOKEN = "Enter your full GitHub token:"


In [12]:
# Authentication headers
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
}

## Search public repositories

In [13]:
def search_repositories(query="data", language="python", sort="stars", order="desc", per_page=30, page=1):
    url = "https://api.github.com/search/repositories"
    params = {
        "q": f"{query}+language:{language}",
        "sort": sort,
        "order": order,
        "per_page": per_page,
        "page": page
    }
    response = requests.get(url, headers=HEADERS, params=params)
    if response.status_code == 200:
        return response.json()["items"]
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        return []


## Get commits

In [14]:
def get_commits(owner, repo, per_page=100, max_pages=5):
    commits = []
    for page in range(1, max_pages + 1):
        url = f"https://api.github.com/repos/{owner}/{repo}/commits"
        params = {"per_page": per_page, "page": page}
        response = requests.get(url, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            if not data:
                break
            commits.extend(data)
        elif response.status_code == 403:
            print("Rate limit reached.")
            break
        else:
            print(f"Error: {response.status_code}")
            print(response.json())
            break
    return commits


## Get file content

In [15]:
def get_file_content(owner, repo, path):
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        return {}


## Function testing

In [16]:
# Search for test repositories

repos = search_repositories()
for repo in repos[:3]:
    print(f"Repo: {repo['full_name']} - ⭐ {repo['stargazers_count']}")

# Obtener commits del primer repositorio
if repos:
    owner, name = repos[0]["owner"]["login"], repos[0]["name"]
    commits = get_commits(owner, name)
    print(f"\nTotal commits obtenidos de {owner}/{name}: {len(commits)}")
    if commits:
        print(f"Primer commit SHA: {commits[0]['sha']}")

# Leer un archivo de ejemplo (README.md)
file_content = get_file_content(owner, name, "README.md")
if file_content:
    print("\nArchivo README.md encontrado:")
    print(file_content.get("Objetive"))


Repo: fighting41love/funNLP - ⭐ 74220
Repo: lk-geimfari/mimesis - ⭐ 4589
Repo: kaitai-io/kaitai_struct - ⭐ 4215

Total commits obtenidos de fighting41love/funNLP: 159
Primer commit SHA: 29f4ac896f11058e87e10968569f999c69679b6f

Archivo README.md encontrado:
None


## Convert to DataFrame

In [17]:
df = pd.DataFrame([
    {
        "sha": c["sha"],
        "author": c["commit"]["author"]["name"],
        "message": c["commit"]["message"],
        "date": c["commit"]["author"]["date"]
    }
    for c in commits
])

df.head()

Unnamed: 0,sha,author,message,date
0,29f4ac896f11058e87e10968569f999c69679b6f,Yang,Merge pull request #97 from ZhaoQiiii/master\n...,2023-08-24T08:47:15Z
1,8b8704b715072a9be720ab636209a3c49b97a7e5,ZhaoQiiii,Update README.md,2023-08-23T09:41:50Z
2,62e8cb5ad697aec34912263428259d8c77161cc7,fighting41love,update multiple LLM frameworks,2023-06-03T14:31:05Z
3,7b4be10a1d5bc198d4f62590bf8a196ae6074c37,fighting41love,add multiple doc-chat frameworks,2023-05-29T06:53:09Z
4,b6a9ae2585e00f86f17699070b245eabd9e8b931,fighting41love,update talbe structure,2023-05-27T12:53:41Z
