In [1]:
import pandas as pd
import os

In [2]:
# Source workbook with the raw match list
file_path = "Partidos_Liga 1 Peru_2025.xlsx"

# Load the workbook into a DataFrame
df = pd.read_excel(file_path)

# Coerce both score columns to numbers; values that cannot be parsed become NaN
score_columns = ["home_score", "away_score"]
for score_col in score_columns:
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")

# Keep only the rows where both scores are numeric
df_clean = df.dropna(subset=score_columns, how="any")

# Write the cleaned rows to a new workbook (without the index column)
df_clean.to_excel("Partidos_Liga 1 Peru_2025_limpio.xlsx", index=False)

print("Archivo limpio guardado como 'Partidos_Liga 1 Peru_2025_limpio.xlsx'")


Archivo limpio guardado como 'Partidos_Liga 1 Peru_2025_limpio.xlsx'


In [3]:
import ScraperFC as sfc
from pathlib import Path
import glob, traceback
from IPython.display import FileLink

# Single ScraperFC Sofascore client; the per-match scraping loop calls its scrape_* methods.
sofascore = sfc.Sofascore()


In [4]:
def rename_duplicate_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Make the column labels of ``df`` unique by suffixing repeated names.

    The first occurrence of every label is kept as is. Each later occurrence
    is renamed to ``"{name}_{k}"`` with ``k`` counting up from 1. If that
    candidate is already used by another column (for example the frame
    already holds both ``"a"`` twice and ``"a_1"``), ``k`` keeps increasing
    until a free label is found, so the result never contains duplicates.

    Args:
        df: Frame whose columns should be de-duplicated. It is modified
            in place.

    Returns:
        The same ``df`` object, with its ``columns`` replaced.
    """
    taken = set(df.columns)   # every label that is in use, original or generated
    seen = set()              # labels whose first occurrence was already emitted
    next_suffix = {}          # per label: the next suffix number to try
    new_cols = []

    for name in df.columns:
        if name not in seen:
            seen.add(name)
            new_cols.append(name)
            continue

        k = next_suffix.get(name, 1)
        candidate = f"{name}_{k}"
        # Skip suffixes that would collide with an existing or generated label.
        while candidate in taken:
            k += 1
            candidate = f"{name}_{k}"

        taken.add(candidate)
        next_suffix[name] = k + 1
        new_cols.append(candidate)

    # Callers rely on the in-place update (the return value is often ignored).
    df.columns = pd.Index(new_cols, dtype="object")
    return df


In [5]:
# Paths / glob patterns that may hold the cleaned match list
input_candidates = [
    "Partidos_Liga 1 Peru_2025_limpio.xlsx",
]

# Expand every pattern into the files that actually exist
paths = [found for pat in input_candidates for found in glob.glob(pat)]

if not paths:
    # Report the patterns that were really searched so the message is actionable
    raise FileNotFoundError(
        f"No se encontró ningún archivo para los patrones {input_candidates} "
        "(prueba poner la ruta exacta)."
    )

# When several files match, use the last one in sorted order
input_file = sorted(set(paths))[-1]
print("Usando archivo:", input_file)

df_matches = pd.read_excel(input_file)

# The scraping step needs at least one way to identify each match
if not (("match_url" in df_matches.columns) or ("match_id" in df_matches.columns)):
    raise KeyError("El Excel debe tener 'match_url' o 'match_id'.")


Usando archivo: Partidos_Liga 1 Peru_2024_limpio.xlsx


In [6]:
# League / season labels: keep a value that already exists in the notebook
# namespace, otherwise fall back to the defaults given here.
# NOTE(review): the default season is '2024' while the input workbook is named 2025 — confirm.
globals().setdefault("liga", "Liga 1 Peru")
globals().setdefault("year", "2024")

# Folder that receives one workbook per match (created when missing)
out_dir = Path("matches_details")
out_dir.mkdir(exist_ok=True, parents=True)

# Bookkeeping for the scraping run: row count for progress output, and the run log
total = len(df_matches)
logs = []


In [None]:
# ---- PER-MATCH SCRAPING ----

def _pick_match_ref(row, keys=("match_url", "match_id")):
    """Return the first usable match identifier found in ``row``, else None.

    A value is skipped when the column is absent, the cell is NaN/None, or it
    is a blank string. NaN has to be checked explicitly: it is truthy, so a
    plain ``a or b`` never falls back to ``b`` when ``a`` is NaN.
    """
    for key in keys:
        value = row.get(key)
        if value is None or pd.isna(value):
            continue
        if isinstance(value, str) and not value.strip():
            continue
        return value
    return None


def _scrape_or_empty(label, scrape_fn, match_ref, failures):
    """Call ``scrape_fn(match_ref)``; on any error record it and return an empty frame.

    The scrape stays best-effort (one failing section must not lose the whole
    match), but the error is appended to ``failures`` instead of being dropped.
    """
    try:
        return scrape_fn(match_ref)
    except Exception as exc:
        failures.append(f"{label}: {exc!r}")
        return pd.DataFrame()


def _scrape_heatmaps_df(match_id, failures):
    """Build the heatmaps sheet (player, player_id, heatmap) for one match.

    Only entries that are dicts with a non-empty "heatmap" value are kept.
    On any error the failure is recorded in ``failures`` and an empty frame
    with the expected columns is returned.
    """
    empty = pd.DataFrame(columns=["player", "player_id", "heatmap"])
    try:
        hm_dict = sofascore.scrape_heatmaps(match_id)
        records = [
            {
                "player": pname,
                "player_id": info.get("id"),
                "heatmap": info.get("heatmap"),
            }
            for pname, info in hm_dict.items()
            if isinstance(info, dict) and info.get("heatmap")
        ]
        if records:
            return rename_duplicate_columns(pd.DataFrame(records))
    except Exception as exc:
        failures.append(f"Heatmaps: {exc!r}")
    return empty


# (sheet name, scraper method) pairs, in the order the sheets are written
_SHEET_SCRAPERS = [
    ("Team Stats", sofascore.scrape_team_match_stats),
    ("Player Stats", sofascore.scrape_player_match_stats),
    ("Average Positions", sofascore.scrape_player_average_positions),
    ("Shotmap", sofascore.scrape_match_shots),
    ("Match Momentum", sofascore.scrape_match_momentum),
]

# ``pos`` is the 1-based progress counter; ``i`` is the row's index label.
for pos, (i, row) in enumerate(df_matches.iterrows(), start=1):
    match_ref = _pick_match_ref(row)
    if match_ref is None:
        logs.append((i, None, "ERROR", "Falta match_url y match_id"))
        print(f"[{pos}/{total}] ❌ Fila sin identificador.")
        continue

    # Normalize to the bare id: take what follows "#id:" when present,
    # otherwise the reference itself as text.
    match_id = str(match_ref).split("#id:")[-1]

    try:
        failures = []  # sections that raised while scraping this match

        # ---------- scrape_* calls (each one best-effort) ----------
        sheets = {
            label: _scrape_or_empty(label, scrape_fn, match_ref, failures)
            for label, scrape_fn in _SHEET_SCRAPERS
        }

        # ---- De-duplicate column names (in place) ----
        for frame in sheets.values():
            if isinstance(frame, pd.DataFrame) and not frame.empty:
                rename_duplicate_columns(frame)

        # ---- Heatmaps (requested with match_id) ----
        sheets["Heatmaps"] = _scrape_heatmaps_df(match_id, failures)

        # ---- Save one workbook per match, one sheet per section ----
        out_xlsx = out_dir / f"Sofascore_{match_id}.xlsx"
        with pd.ExcelWriter(out_xlsx, engine="openpyxl") as w:
            for sheet_name, frame in sheets.items():
                frame.to_excel(w, sheet_name=sheet_name, index=False)

        # The message stays "" when every section succeeded.
        logs.append((match_id, str(out_xlsx), "OK", "; ".join(failures)))
        print(f"[{pos}/{total}] ✅ Partido {match_id} procesado y guardado.")
        if failures:
            print(f"    ⚠️ Secciones vacías por error: {'; '.join(failures)}")

    except Exception as e:
        logs.append((match_id, None, "ERROR", traceback.format_exc()))
        print(f"[{pos}/{total}] ❌ Error en {match_id}: {e}")


Running
[1/306] ✅ Partido 12831491 procesado y guardado.
[2/306] ✅ Partido 12831479 procesado y guardado.
[3/306] ✅ Partido 12831475 procesado y guardado.




[4/306] ✅ Partido 12831489 procesado y guardado.
[5/306] ✅ Partido 12831496 procesado y guardado.


Traceback (most recent call last):
  File "c:\Users\Alvaro\Proyectos\Proyecto Gronestats\GroneStatz\venv\Lib\site-packages\botasaurus\browser_decorator.py", line 201, in run_task
    result = func(driver, data)
             ^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alvaro\Proyectos\Proyecto Gronestats\GroneStatz\venv\Lib\site-packages\ScraperFC\utils\botasaurus_getters.py", line 43, in botasaurus_browser_get_json
    result = json.loads(page_source)
             ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Alvaro\AppData\Local\Programs\Python\Python311\Lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Alvaro\AppData\Local\Programs\Python\Python311\Lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Alvaro\AppData\Local\Programs\Python\Python311\Lib\json\decoder.py", line 355, in raw_decode
    raise

Task failed for input: https://api.sofascore.com/api/v1/event/12831493/player/1086295/heatmap


In [7]:
import zipfile
from pathlib import Path
from IPython.display import FileLink

# Folder to compress and the archive it is written to
out_dir = Path("matches_details")
zip_filename = "matches_details.zip"

# Archive member names are made relative to the folder's parent, so they keep
# the folder name but none of the local machine's path.
archive_base = out_dir.parent

# Build the ZIP archive from every file found under the folder
with zipfile.ZipFile(zip_filename, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subdirs, filenames in os.walk(out_dir):
        folder_path = Path(folder)
        for filename in filenames:
            member = folder_path / filename
            archive.write(member, member.relative_to(archive_base))

print(f"✅ Carpeta '{out_dir}' comprimida en {zip_filename}")

# Show a download link in Jupyter (must remain the cell's last expression)
FileLink(zip_filename)

✅ Carpeta 'matches_details' comprimida en matches_details.zip
