# Extracción de Datos

## Importación de Bibliotecas

In [2]:
import os
from pathlib import Path

import pandas as pd
from statsbombpy import sb

# StatsBomb credentials.
# They are read from the environment so that no secret is stored in the
# notebook: export SB_USERNAME and SB_PASSWORD before starting the kernel.
# NOTE(review): a password was previously committed in this cell; it should
# be rotated.
creds = {
    "user": os.environ.get("SB_USERNAME"),
    "passwd": os.environ.get("SB_PASSWORD"),
}
if not all(creds.values()):
    # Surface the misconfiguration here instead of deep inside an API call.
    print("WARNING: SB_USERNAME and/or SB_PASSWORD are not set; StatsBomb credentials are incomplete.")

## Competencias

In [3]:
# Download the competition/season catalogue available to these credentials.
competencias = sb.competitions(creds=creds)

# Create the destination folder first: to_parquet does not create missing
# parent directories, so on a fresh checkout the write would fail.
competencias_path = Path("Datos/Competencias/competencias.parquet")
competencias_path.parent.mkdir(parents=True, exist_ok=True)
competencias.to_parquet(competencias_path)

# Last expression of the cell: render the table as the cell output.
competencias

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,73,317,Mexico,Liga MX,male,False,False,2024/2025,2025-08-30T16:14:20.970616,2025-08-30T16:14:20.970616,2025-08-30T16:14:20.970616,2025-08-30T16:14:20.970616
1,73,281,Mexico,Liga MX,male,False,False,2023/2024,2024-12-20T23:40:31.103974,2024-12-20T23:40:31.103974,2024-12-20T23:40:31.103974,2024-12-20T23:40:31.103974
2,73,235,Mexico,Liga MX,male,False,False,2022/2023,2024-09-28T11:05:11.667984,2024-09-28T11:05:11.667984,2024-09-28T11:05:11.667984,2024-09-28T11:05:11.667984
3,73,108,Mexico,Liga MX,male,False,False,2021/2022,2024-12-10T08:59:57.612449,2024-12-10T08:59:57.612449,2024-12-10T08:59:57.612449,2024-12-10T08:59:57.612449


In [4]:
# Keep only the two identifiers needed to request the matches of each season.
idsCompetencias = competencias.loc[:, ["competition_id", "season_id"]]
idsCompetencias

Unnamed: 0,competition_id,season_id
0,73,317
1,73,281
2,73,235
3,73,108


## Partidos

In [None]:
def ensure_season_folder(base_dir: Path, season_id) -> Path:
    """Return ``base_dir / "season_<season_id>"``, creating it if necessary.

    Missing parent directories are created as well, and an already existing
    folder is not an error.
    """
    season_dir = base_dir.joinpath(f"season_{season_id}")
    season_dir.mkdir(parents=True, exist_ok=True)
    return season_dir


In [None]:
# --- main loop: download and persist the match list of every season ---
BASE_DIR = Path("Datos", "Eventos")  # root folder (relative): Datos/Eventos
partidos_list = []

for competition_id, season_id in idsCompetencias.to_numpy():
    # 1) fetch the matches and tag every row with the ids used in the request
    partidos = sb.matches(competition_id=competition_id, season_id=season_id, creds=creds)
    partidos["competition_id"] = competition_id
    partidos["season_id"] = season_id

    # 2) destination folder: Datos/Eventos/competition_<id>/season_<id>
    competition_dir = BASE_DIR / f"competition_{competition_id}"
    output_folder = ensure_season_folder(competition_dir, season_id)

    # 3) write the matches table inside that folder and report where it went
    file_path = output_folder / f"matches_competition_{competition_id}_season_{season_id}.parquet"
    partidos.to_parquet(file_path, index=False)
    print("Saved", file_path)

    # 4) remember one (season_id, competition_id, match_id) tuple per match
    partidos_list.extend(
        (season_id, competition_id, match_id)
        for match_id in partidos["match_id"].tolist()
    )

Saved Datos\Eventos\competition_73\season_317\matches_competition_73_season_317.parquet
Saved Datos\Eventos\competition_73\season_281\matches_competition_73_season_281.parquet
Saved Datos\Eventos\competition_73\season_235\matches_competition_73_season_235.parquet
Saved Datos\Eventos\competition_73\season_108\matches_competition_73_season_108.parquet


## Eventos

In [1]:
import pandas as pd

def limpiar_eventos(ev: pd.DataFrame) -> pd.DataFrame:
    """Return ``ev`` restricted to the columns of interest.

    Columns from the wanted list that are not present in ``ev`` are silently
    skipped, so the result may have fewer columns than the list. Columns of
    ``ev`` that are not in the list are dropped. The order of the returned
    columns follows the wanted list, and each column appears at most once.

    Parameters
    ----------
    ev : pd.DataFrame
        Events table to trim.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected columns only.
    """
    columnas = [
        "player_id",
        "season_id",
        "competition_id",
        "match_id",
        "player_name",
        "team_name",
        "timestamp",
        "minute",
        "second",
        "type",
        "type_name",
        "under_pressure",
        "counterpress",
        "shot_end_location",
        "pass_end_location",
        "carry_end_location",
        "goalkeeper_end_location",
        "pass_height_name",
        "pass_body_part_name",
        "location",
        "location_x",
        "location_y",
        "shot_technique_name",
        "shot_body_part_name",
        "shot_first_time",
        "dribble_outcome_name",
        "possession_team",
    ]

    # Guard against accidental repeats in the list above: selecting the same
    # label twice would yield a frame with duplicated column names.
    # dict.fromkeys keeps the first occurrence and preserves order.
    columnas = list(dict.fromkeys(columnas))

    # Keep only the columns that actually exist in this frame.
    cols_presentes = [c for c in columnas if c in ev.columns]
    return ev[cols_presentes]


In [None]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
# import statsbombpy as sb  # asumiendo que ya lo tienes importado
# creds = {...}             # tus credenciales si aplican

BASE_EVENTS = Path("Datos") / "Eventos"

# partidos_list is expected to hold (season_id, competition_id, match_id) tuples.
# If it is not available, it can be rebuilt from the saved matches DataFrames.
# partidos_list = [(season_id, competition_id, match_id), ...]

def guardar_eventos_por_temporada(partidos_list, skip_existing=True):
    """Download the events of every match and store one parquet file per match.

    Each file is written to ``BASE_EVENTS/season_<season_id>/events_<match_id>.parquet``
    after being tagged with ``season_id``, ``competition_id`` and ``match_id``
    and trimmed with ``limpiar_eventos``.

    Parameters
    ----------
    partidos_list : iterable of (season_id, competition_id, match_id)
        Matches to download.
    skip_existing : bool, default True
        When True, matches whose output file already exists are not
        downloaded again, which makes an interrupted run resumable.

    Notes
    -----
    The file is written to a temporary ``*.tmp`` sibling and renamed only
    after the write has finished. Without this, interrupting the run in the
    middle of a write would leave a truncated file that ``skip_existing``
    would treat as complete on the next run.
    """
    for season_id, competition_id, match_id in tqdm(partidos_list, desc="Descargando eventos"):
        out_dir = ensure_season_folder(BASE_EVENTS, season_id)  # Datos/Eventos/season_<id>
        out_path = out_dir / f"events_{match_id}.parquet"

        if skip_existing and out_path.exists():
            continue

        # Copy the downloaded frame before adding columns. The warnings this
        # loop printed point at the column insertions below and appear to be
        # pandas' fragmented-frame warning, for which a copy is the
        # recommended remedy.
        ev = sb.events(match_id=match_id, creds=creds).copy()

        # Attach the identifiers used to request this match.
        ev["season_id"] = season_id
        ev["competition_id"] = competition_id
        ev["match_id"] = match_id

        ev = limpiar_eventos(ev)

        # Write to a temp file, then rename, so out_path only ever holds a
        # fully written file.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        try:
            ev.to_parquet(tmp_path, index=False)
            tmp_path.replace(out_path)
        finally:
            # No-op after a successful rename; removes the partial file otherwise.
            tmp_path.unlink(missing_ok=True)

# Ejecución:
# guardar_eventos_por_temporada(partidos_list)


In [11]:
# Download the events of every listed match; matches already saved on disk are skipped.
guardar_eventos_por_temporada(partidos_list, skip_existing=True)

  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] = season_id
  ev["competition_id"] = competition_id
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] = season_id
  ev["competition_id"] = competition_id
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] = season_id
  ev["competition_id"] = competition_id
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] = season_id
  ev["competition_id"] = competition_id
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] = season_id
  ev["competition_id"] = competition_id
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  events = pd.concat([*events.values()], axis=0, ignore_index=True, sort=True)
  ev["season_id"] 

KeyboardInterrupt: 