# Notebook para la extracción de datos a utilizar

## 1. Configuramos la conexión con la API de GitHub

In [None]:
#Total users
import requests

def print_total_users_for_location(location="Spain", personal_access_token=None):
    """Print and return how many GitHub users match a ``location:`` search.

    Args:
        location: Value for the ``location:`` search qualifier. Values that
            contain whitespace are wrapped in double quotes so the whole
            phrase stays attached to the qualifier.
        personal_access_token: GitHub personal access token sent as a
            Bearer credential.

    Returns:
        The ``userCount`` reported by the GraphQL search.

    Raises:
        ValueError: If no token is supplied.
        requests.HTTPError: If GitHub answers with a non-success status.
        RuntimeError: If the response carries no ``data.search`` object
            (GraphQL reports query errors with HTTP 200).
    """
    if not personal_access_token:
        raise ValueError("PAT is required.")

    url = "https://api.github.com/graphql"
    headers = {
        "Authorization": f"Bearer {personal_access_token}",
        "Content-Type":  "application/json",
        "Accept":        "application/vnd.github.v3+json"
    }

    users_query = """
    query ($query: String!, $first: Int!) {
      search(query: $query, type: USER, first: $first) {
        userCount
      }
    }
    """

    # Without quotes, "United States" would be parsed as `location:United`
    # followed by the free-text keyword `States`.
    needs_quotes = any(ch.isspace() for ch in location) and not location.startswith('"')
    location_term = f'"{location}"' if needs_quotes else location

    variables = {
        "query": f"location:{location_term} type:user",
        "first": 1
    }

    # A timeout keeps a stalled connection from hanging the notebook cell.
    resp = requests.post(
        url,
        headers=headers,
        json={"query": users_query, "variables": variables},
        timeout=30,
    )
    resp.raise_for_status()

    payload = resp.json()
    search = (payload.get("data") or {}).get("search")
    if search is None:
        raise RuntimeError(f"GitHub GraphQL error: {payload.get('errors')}")
    total_user_count = search["userCount"]

    print(f"Total number of users: {total_user_count}")
    return total_user_count

if __name__ == "__main__":
    # Paste a GitHub personal access token here before running the cell.
    TOKEN = ""
    print_total_users_for_location(location="France", personal_access_token=TOKEN)


Total number of users: 127477


## 2. Configuraciones adicionales para interactuar con la API

In [None]:
import requests
import datetime
import json
from collections import defaultdict

# GraphQL document that asks the user search only for its total match count
# (`userCount`); no result nodes are selected.
USER_COUNT_QUERY = """
query($query:String!,$first:Int!){
  search(query:$query, type:USER, first:$first) {
    userCount
  }
}
"""

# GraphQL document that returns one page of user search results: the `login`
# of each `User` node plus the cursor information (`hasNextPage`, `endCursor`)
# needed to request the following page through `$after`.
USERS_PAGINATED_QUERY = """
query($query:String!,$first:Int!,$after:String) {
  search(query:$query, type:USER, first:$first, after:$after) {
    edges { node { ... on User { login } } }
    pageInfo { hasNextPage endCursor }
  }
}
"""

def get_user_count(query, url, headers):
    """Return how many users match ``query`` according to the GraphQL search.

    Args:
        query: GitHub search string (qualifiers such as ``location:``).
        url: GraphQL endpoint to POST to.
        headers: HTTP headers, including the authorization header.

    Returns:
        The ``userCount`` value of the search result.

    Raises:
        requests.HTTPError: If the endpoint answers with a non-success status.
        RuntimeError: If the response has no ``data.search`` object
            (GraphQL reports query errors with HTTP 200).
    """
    variables = {"query": query, "first": 1}
    # A timeout keeps a stalled connection from blocking the caller forever.
    resp = requests.post(url, headers=headers,
                         json={"query": USER_COUNT_QUERY, "variables": variables},
                         timeout=30)
    resp.raise_for_status()

    payload = resp.json()
    search = (payload.get("data") or {}).get("search")
    if search is None:
        raise RuntimeError(f"GitHub GraphQL error: {payload.get('errors')}")
    return search["userCount"]

def paginated_search(query, max_slice, url, headers):
    """Collect up to ``max_slice`` user logins for ``query``, page by page.

    Args:
        query: GitHub search string.
        max_slice: Maximum number of logins to return.
        url: GraphQL endpoint to POST to.
        headers: HTTP headers, including the authorization header.

    Returns:
        A list of logins in the order the search returned them. It may be
        shorter than ``max_slice`` when the search runs out of pages.

    Raises:
        requests.HTTPError: If the endpoint answers with a non-success status.
        RuntimeError: If a response has no ``data.search`` object.
    """
    users = []
    after = None
    while len(users) < max_slice:
        # Never ask for more nodes than are still needed (100 is the page
        # size the original code used as its upper bound).
        page_size = min(100, max_slice - len(users))
        variables = {"query": query, "first": page_size, "after": after}
        resp = requests.post(url, headers=headers,
                             json={"query": USERS_PAGINATED_QUERY, "variables": variables},
                             timeout=30)
        resp.raise_for_status()

        payload = resp.json()
        data = (payload.get("data") or {}).get("search")
        if data is None:
            raise RuntimeError(f"GitHub GraphQL error: {payload.get('errors')}")

        for edge in data["edges"]:
            # The `... on User` fragment yields an empty node for results
            # that are not User objects; skip those instead of crashing.
            login = (edge.get("node") or {}).get("login")
            if login:
                users.append(login)
            if len(users) >= max_slice:
                break

        if not data["pageInfo"]["hasNextPage"]:
            break
        after = data["pageInfo"]["endCursor"]
    return users

def fetch_users(location, max_users, token):
    """Collect up to ``max_users`` distinct logins of users in ``location``.

    The account-creation window (2008-01-01 to today) is bisected until a
    sub-range reports at most 1000 matches, and each such sub-range is then
    paginated. Ranges are visited oldest first.

    Args:
        location: Value for the ``location:`` search qualifier. Values that
            contain whitespace are wrapped in double quotes.
        max_users: Maximum number of logins to return.
        token: GitHub personal access token.

    Returns:
        A list of unique logins in discovery order.
    """
    url = "https://api.github.com/graphql"
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type":  "application/json",
        "Accept":        "application/vnd.github.v3+json"
    }

    # Without quotes, "United States" would be parsed as `location:United`
    # followed by the free-text keyword `States`.
    needs_quotes = any(ch.isspace() for ch in location) and not location.startswith('"')
    location_term = f'"{location}"' if needs_quotes else location

    start_date = datetime.date(2008, 1, 1)
    end_date   = datetime.date.today()

    # Used as a stack: the earlier half is pushed last so it is popped first,
    # which keeps the oldest-first order without O(n) list.pop(0)/insert(0).
    pending = [(start_date, end_date)]
    all_users = []
    seen = set()  # O(1) duplicate check alongside the ordered result list

    while pending and len(all_users) < max_users:
        s, e = pending.pop()
        created_q = f"created:{s.isoformat()}..{e.isoformat()}"
        query = f"location:{location_term} type:user {created_q}"

        count = get_user_count(query, url, headers)
        if count == 0:
            continue

        # A single-day range cannot be bisected further. Re-queuing it would
        # loop forever, so take whatever the search exposes for that day.
        if count <= 1000 or s >= e:
            remaining = max_users - len(all_users)
            slice_users = paginated_search(query, min(count, remaining), url, headers)
            for u in slice_users:
                if u in seen:
                    continue
                seen.add(u)
                all_users.append(u)
                if len(all_users) >= max_users:
                    break
        else:
            days = (e - s).days
            mid  = s + datetime.timedelta(days=days // 2)
            pending.append((mid + datetime.timedelta(days=1), e))
            pending.append((s, mid))

    return all_users

def save_users_to_json(users, output_file):
    """Write ``users`` to ``output_file`` as ``{"users": [...]}`` and report it.

    Args:
        users: List of logins to persist.
        output_file: Destination path; an existing file is overwritten.
    """
    serialized = json.dumps({"users": users}, indent=2)
    with open(output_file, 'w') as handle:
        handle.write(serialized)
    total = len(users)
    print(f"Saved {total} users to {output_file}")

if __name__ == "__main__":
    # Run parameters: edit these before executing the cell.
    OUTPUT_FILE = "github_users.json"  # rename after extraction, e.g. Spain_data.json
    MAX_USERS = 1000                   # maximum number of users to collect
    LOCATION = "France"                # country to search
    TOKEN = ""                         # paste a personal access token here

    print(f"Fetching {MAX_USERS} users in '{LOCATION}'...")
    users = fetch_users(LOCATION, MAX_USERS, TOKEN)
    print(f"→ Retrieved {len(users)} users.")
    save_users_to_json(users, OUTPUT_FILE)

Fetching 1000 users in 'France'...
→ Retrieved 1000 users.
Saved 1000 users to github_users.json


## 3. Extraemos la data de los usuarios y sus commits diarios en el mes de abril

In [None]:
import json
import time
import sys
from collections import defaultdict
from itertools import cycle
from pathlib import Path
from typing import Dict, List

import datetime as dt
import requests

# Reporting window as calendar dates (YYYY-MM-DD). Both ends are expanded to
# full-day UTC timestamps further down.
START_DATE   = "2025-04-01"
END_DATE     = "2025-04-30"
# Personal access tokens handed to TokenCycler; rotated when one is rate-limited.
TOKENS       = [ "" # insert token(s) here

]
# Input file read by load_users(): a JSON object with a "users" list.
USERS_JSON   = "github_users.json"
# Output file; also read back as the resume checkpoint on later runs.
OUTPUT_JSON  = "bd_commits_by_user.json"

# Fail fast, before any request is made, if either date is not a valid ISO date.
try:
    dt.date.fromisoformat(START_DATE)
    dt.date.fromisoformat(END_DATE)
except ValueError:
    sys.exit("ERROR: START_DATE y END_DATE deben tener formato YYYY-MM-DD")

# Timestamps passed as the `from` / `to` variables of COMMIT_QUERY: start of
# the first day and last second of the final day, both marked as UTC ("Z").
START_ISO = f"{START_DATE}T00:00:00Z"
END_ISO   = f"{END_DATE}T23:59:59Z"

# GraphQL document that fetches, for one login and one time window, the
# `totalCommitContributions` figure plus the per-day `contributionCount`
# values of the contribution calendar.
# NOTE(review): per GitHub's schema docs, `contributionCount` appears to count
# every contribution type for the day, not only commits. That would explain
# users with 0 total commits but several active days. Confirm before treating
# the per-day numbers as commits.
COMMIT_QUERY = """
query($username:String!,$from:DateTime!,$to:DateTime!) {
  user(login:$username) {
    contributionsCollection(from:$from, to:$to) {
      totalCommitContributions
      contributionCalendar {
        weeks {
          contributionDays {
            date
            contributionCount
          }
        }
      }
    }
  }
}
"""

# GitHub GraphQL endpoint that gql_request() posts to.
GQL_ENDPOINT = "https://api.github.com/graphql"


class RateLimitError(Exception):
    """Raised when the token in use is exhausted (5,000 requests per hour)."""


class TokenCycler:
    """Round-robin holder for GitHub personal access tokens.

    ``current`` is the token in use. ``next()`` advances to the following
    token, wrapping around to the first one after the last.
    """

    def __init__(self, tokens: List[str]):
        """Start the rotation on the first usable token.

        Args:
            tokens: Candidate tokens. Blank entries (such as an unfilled ``""``
                placeholder) are ignored and surrounding whitespace is removed.

        Raises:
            ValueError: If no usable token remains.
        """
        # A blank token can never authenticate; rejecting it here fails fast
        # instead of producing one HTTP error per request later on.
        usable = [t.strip() for t in (tokens or []) if t and t.strip()]
        if not usable:
            raise ValueError("Debes definir al menos un token en TOKENS")
        self._tokens = cycle(usable)
        self.current = next(self._tokens)

    def next(self) -> str:
        """Advance to the next token in the rotation and return it."""
        self.current = next(self._tokens)
        return self.current


def gql_request(query: str, variables: Dict, token: str) -> Dict:
    """POST one GraphQL request to GitHub and return its ``data`` object.

    Args:
        query: GraphQL document.
        variables: Variables for the document.
        token: Personal access token sent as a Bearer credential.

    Returns:
        The ``data`` member of the response, or an empty dict when it is
        missing or null. GraphQL errors unrelated to rate limiting are not
        raised; the (possibly partial) data is returned as-is.

    Raises:
        RateLimitError: If the HTTP response or a GraphQL error entry signals
            a rate limit.
        requests.HTTPError: For any other non-success HTTP status.
    """
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/vnd.github.v3+json",
    }
    # A timeout keeps one stalled connection from blocking the whole run.
    r = requests.post(
        GQL_ENDPOINT,
        json={"query": query, "variables": variables},
        headers=headers,
        timeout=30,
    )
    # Rate limiting can arrive as 403 or 429, and the secondary-limit message
    # is worded differently from "rate limit exceeded".
    if r.status_code in (403, 429) and "rate limit" in r.text.lower():
        raise RateLimitError()
    r.raise_for_status()
    payload = r.json()

    for err in payload.get("errors") or []:
        if err.get("type") == "RATE_LIMITED" or "rate limit" in err.get("message", "").lower():
            raise RateLimitError()

    # `data` can be present but null when the whole query failed; callers
    # expect a dict either way.
    return payload.get("data") or {}


def load_users(path: str) -> List[str]:
    """Read the login list stored under the ``"users"`` key of a JSON file.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The non-empty list found under ``"users"``.

    Exits the interpreter (``sys.exit``) with an error message when the key is
    missing or the list is empty.
    """
    with open(path, "r") as handle:
        payload = json.load(handle)

    logins = payload.get("users", [])
    if logins:
        print(f"→ {len(logins):,} usuarios cargados desde {path}")
        return logins

    sys.exit(f"ERROR: {path} no contiene la lista 'users'.")


def _save_progress(comm_data: Dict[str, Dict], resume_json: Path) -> None:
    """Write the results gathered so far to the resume/output file."""
    with open(resume_json, "w") as f:
        json.dump(comm_data, f, indent=2)


def fetch_commits_by_user(
    users: List[str],
    start_iso: str,
    end_iso: str,
    token_cycler: TokenCycler,
    resume_json: Path,
) -> Dict[str, Dict]:
    """Fetch contribution figures for each user, resuming from earlier runs.

    Args:
        users: Logins to process.
        start_iso: Start of the window, passed as the query's ``from`` variable.
        end_iso: End of the window, passed as the query's ``to`` variable.
        token_cycler: Source of the access token; rotated on rate limits.
        resume_json: Checkpoint file. If it exists, its entries are loaded and
            those users are skipped. Progress is written back every 100 users
            and after the last one.

    Returns:
        ``{login: {"total_commits": int, "daily_commits": {date: count}}}``
        covering previously stored and newly fetched users. Users for whom the
        API returned no data, or whose request failed, are omitted.

    Note:
        ``daily_commits`` is built from the contribution calendar's
        ``contributionCount`` and keeps only days with a non-zero count.
        NOTE(review): that counter appears to cover all contribution types,
        not only commits. Confirm against the GitHub schema docs.
    """
    # Back-off used once every token has been rejected for the same request.
    all_tokens_exhausted_wait = 60

    if resume_json.exists():
        with open(resume_json, "r") as f:
            comm_data = json.load(f)
        processed = set(comm_data.keys())
        print(f"→ Reanudando: {len(processed):,} usuarios ya procesados")
    else:
        comm_data = {}
        processed = set()

    todo = [u for u in users if u not in processed]
    print(f"→ Quedan {len(todo):,} usuarios por procesar\n")

    for idx, login in enumerate(todo, 1):
        # Tokens that were rate-limited while trying to fetch this user.
        exhausted = set()
        while True:
            try:
                vars_ = {"username": login, "from": start_iso, "to": end_iso}
                data = gql_request(COMMIT_QUERY, vars_, token_cycler.current)
                u = data.get("user")

                if not u or not u.get("contributionsCollection"):
                    print(f"   ○ {login}: sin datos")
                    break

                collection = u["contributionsCollection"]
                total = collection["totalCommitContributions"]
                daily = defaultdict(int)
                for week in collection["contributionCalendar"]["weeks"]:
                    for day in week["contributionDays"]:
                        c = day["contributionCount"]
                        if c:
                            daily[day["date"]] += c

                comm_data[login] = {
                    "total_commits": total,
                    "daily_commits": daily,
                }
                print(f"   • {login}: {total} commits ({len(daily)} días)")
                break

            except RateLimitError:
                old = token_cycler.current
                exhausted.add(old)
                new = token_cycler.next()
                print(f"🔄 Token {old[:10]}… agotado → usando {new[:10]}…")
                if new in exhausted:
                    # The rotation came back to a token that already failed for
                    # this request, so every token is exhausted. Persist what
                    # we have and wait, instead of retrying once per second.
                    _save_progress(comm_data, resume_json)
                    print(
                        f"⏳ Todos los tokens agotados; esperando "
                        f"{all_tokens_exhausted_wait}s antes de reintentar…"
                    )
                    time.sleep(all_tokens_exhausted_wait)
                    exhausted.clear()
                else:
                    time.sleep(1)

            except Exception as e:
                # Best effort: report the failure and move on to the next user.
                print(f"⚠️  {login}: error → {e}")
                break

        if idx % 100 == 0 or idx == len(todo):
            _save_progress(comm_data, resume_json)
            print(f"💾 guardado parcial ({idx}/{len(todo)})\n")

    return comm_data


def main() -> None:
    """Run the extraction with the module-level settings and print a summary."""
    token_cycler = TokenCycler(TOKENS)
    users = load_users(USERS_JSON)
    results = fetch_commits_by_user(
        users, START_ISO, END_ISO, token_cycler, Path(OUTPUT_JSON)
    )

    commit_sum = 0
    for entry in results.values():
        commit_sum += entry["total_commits"]
    user_total = len(results)

    summary_lines = [
        "\n────────── Resumen final ──────────",
        f"Usuarios con commits: {user_total:,}",
        f"Commits totales:      {commit_sum:,}",
        f"Archivo guardado en:  {OUTPUT_JSON}",
    ]
    print("\n".join(summary_lines))


# Run the extraction only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


→ 1,000 usuarios cargados desde github_users.json
→ Quedan 1,000 usuarios por procesar

   • fabpot: 111 commits (21 días)
   • benoitc: 0 commits (3 días)
   • jd: 10 commits (11 días)
   • byroot: 146 commits (26 días)
   • n1k0: 16 commits (18 días)
   • tdd: 1 commits (1 días)
   • vdemeester: 136 commits (21 días)
   • cgrand: 18 commits (7 días)
   • glaforge: 4 commits (7 días)
   • lindenb: 11 commits (9 días)
   • jponge: 29 commits (13 días)
   • luislavena: 29 commits (19 días)
   • thoas: 3 commits (18 días)
   • cveneziani: 25 commits (22 días)
   • sunny: 43 commits (27 días)
   • yannickcr: 0 commits (18 días)
   • patcito: 0 commits (14 días)
   • Ovid: 33 commits (6 días)
   • dmathieu: 121 commits (22 días)
   • nono: 10 commits (6 días)
   • regisb: 31 commits (16 días)
   • Cpasjuste: 1 commits (1 días)
   • thbar: 7 commits (14 días)
   • abique: 3 commits (6 días)
   • fdv: 0 commits (0 días)
   • ysbaddaden: 36 commits (23 días)
   • noirbizarre: 31 commits (27 d