In [None]:
import pandas as pd
import os, glob, traceback, requests
from pathlib import Path
import ScraperFC as sfc

# -------------------- CONFIG --------------------
# Explicit workbook path; it is appended to the glob candidates in the INPUT section.
file_path = "Partidos_detalles_faltantes_2025.xlsx"
# One ScraperFC Sofascore client, reused by every scrape call in the main loop.
sofascore = sfc.Sofascore()

# -------------------- HELPERS --------------------
def rename_duplicate_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Make the column labels of ``df`` unique by suffixing repeated names.

    The first occurrence of a label is kept as-is; each later occurrence is
    renamed to ``"<name>_<k>"`` with ``k`` counting up from 1. A suffix is
    skipped when the resulting label is already in use (for example columns
    ``a, a, a_1`` become ``a, a_2, a_1``), so the result never contains
    duplicates.

    Args:
        df: Frame whose columns are relabelled. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    taken = set(df.columns)  # every label that must not be produced again
    seen = set()             # labels whose first occurrence was already kept
    next_suffix = {}         # per-label counter so repeats get _1, _2, ...
    renamed = []
    for name in df.columns:
        if name not in seen:
            seen.add(name)
            renamed.append(name)
            continue
        k = next_suffix.get(name, 1)
        candidate = f"{name}_{k}"
        # Skip suffixes that would collide with an existing or generated label.
        while candidate in taken:
            k += 1
            candidate = f"{name}_{k}"
        next_suffix[name] = k + 1
        taken.add(candidate)
        renamed.append(candidate)
    # Object dtype and the original axis name are kept, as before.
    df.columns = pd.Index(renamed, dtype="object", name=df.columns.name)
    return df

def safe_get_json(url: str) -> dict:
    """GET ``url`` and return its JSON body as a dict, or ``{}`` on any failure.

    Best-effort helper: network errors, non-200 responses, empty bodies and
    undecodable JSON all yield an empty dict instead of raising.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded JSON object. ``{}`` is returned when the request fails or
        when the payload is not a JSON object (e.g. ``null`` or a list), so
        callers can always use ``"key" in data`` and ``data["key"]`` safely.
    """
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
        if r.status_code == 200 and r.text.strip():
            payload = r.json()
            # A null/list/scalar body would break the callers' key lookups.
            return payload if isinstance(payload, dict) else {}
        return {}
    except Exception:
        # Deliberately broad: callers treat "no data" and "request failed" alike.
        return {}

# ---- safe wrappers (solo para las que no da ScraperFC) ----
def safe_scrape_team_stats(match_id):
    """Fetch the ``/statistics`` endpoint for a match and flatten it to rows.

    Walks ``statistics`` -> ``groups`` -> ``statisticsItems`` and emits one row
    per item with the columns ``period``, ``team``, ``name`` and ``value``.

    Args:
        match_id: Event identifier interpolated into the endpoint URL.

    Returns:
        A DataFrame of the flattened items, or an empty DataFrame when the
        response has no ``"statistics"`` key.
    """
    payload = safe_get_json(
        f"https://api.sofascore.com/api/v1/event/{match_id}/statistics"
    )
    if "statistics" not in payload:
        return pd.DataFrame()

    # NOTE(review): assumes every item exposes "team" and "value" keys; keys
    # that are absent come out as None -- confirm against a live response.
    records = [
        {
            "period": period_block.get("period"),
            "team": entry.get("team"),
            "name": entry.get("name"),
            "value": entry.get("value"),
        }
        for period_block in payload["statistics"]
        for group in period_block.get("groups", [])
        for entry in group.get("statisticsItems", [])
    ]
    return pd.DataFrame(records)

def safe_scrape_player_stats(match_id):
    """Fetch the ``/player-statistics`` endpoint and flatten its ``players`` list.

    Args:
        match_id: Event identifier interpolated into the endpoint URL.

    Returns:
        ``pd.json_normalize`` of the ``"players"`` entry, or an empty DataFrame
        when that key is missing from the response.
    """
    payload = safe_get_json(
        f"https://api.sofascore.com/api/v1/event/{match_id}/player-statistics"
    )
    if "players" not in payload:
        return pd.DataFrame()
    return pd.json_normalize(payload["players"])

def safe_scrape_avg_positions(match_id):
    """Fetch the ``/average-positions`` endpoint for a match.

    Args:
        match_id: Event identifier interpolated into the endpoint URL.

    Returns:
        A DataFrame built from the ``"averagePositions"`` entry, or an empty
        DataFrame when that key is missing from the response.
    """
    payload = safe_get_json(
        f"https://api.sofascore.com/api/v1/event/{match_id}/average-positions"
    )
    if "averagePositions" not in payload:
        return pd.DataFrame()
    return pd.DataFrame(payload["averagePositions"])

def safe_scrape_shotmap(match_id):
    """Fetch the ``/shotmap`` endpoint for a match.

    Args:
        match_id: Event identifier interpolated into the endpoint URL.

    Returns:
        A DataFrame built from the ``"shotmap"`` entry, or an empty DataFrame
        when that key is missing from the response.
    """
    payload = safe_get_json(
        f"https://api.sofascore.com/api/v1/event/{match_id}/shotmap"
    )
    if "shotmap" not in payload:
        return pd.DataFrame()
    return pd.DataFrame(payload["shotmap"])

def safe_scrape_momentum(match_id):
    """Fetch the ``/graph`` endpoint for a match.

    Args:
        match_id: Event identifier interpolated into the endpoint URL.

    Returns:
        A DataFrame built from the ``"graphPoints"`` entry, or an empty
        DataFrame when that key is missing from the response.
    """
    payload = safe_get_json(
        f"https://api.sofascore.com/api/v1/event/{match_id}/graph"
    )
    if "graphPoints" not in payload:
        return pd.DataFrame()
    return pd.DataFrame(payload["graphPoints"])

# -------------------- INPUT --------------------
# Glob patterns / literal paths that may hold the match workbook.
input_candidates = [
    "Partidos_*.xlsx",
    "/mnt/data/Partidos_*.xlsx",
    "/mnt/data/Partidos_Liga 1 Peru_2025.xlsx",
    file_path,
]
paths = [hit for pattern in input_candidates for hit in glob.glob(pattern)]
if len(paths) == 0:
    raise FileNotFoundError("No se encontró 'Partidos_*.xlsx' (prueba poner la ruta exacta).")
# Among all hits, the lexicographically greatest path is the one used.
input_file = max(set(paths))
print("Usando archivo:", input_file)

# Master table: one row per match to scrape.
df_matches = pd.read_excel(input_file)
has_url = "match_url" in df_matches.columns
has_id = "match_id" in df_matches.columns
if not has_url and not has_id:
    raise KeyError("El Excel debe tener 'match_url' o 'match_id'.")

# -------------------- OUTPUT --------------------
out_dir = Path("matches_details")
out_dir.mkdir(parents=True, exist_ok=True)

# Matches already processed: the token after the first "_" in each
# existing "Sofascore_*.xlsx" file name.
procesados = {
    workbook.stem.split("_")[1] for workbook in out_dir.glob("Sofascore_*.xlsx")
}
print(f"👉 {len(procesados)} partidos detectados ya procesados. Se saltarán.\n")

logs = []  # per-match result tuples appended by the main loop
total = len(df_matches)

# -------------------- LOOP --------------------
def _is_missing(value) -> bool:
    """Return True when a cell is None, NaN/NaT or a blank string."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values have no single truth value; treat them as present.
        return False


def _resolve_match_ref(row):
    """Return the row's ``match_url``, else its ``match_id``, else None.

    NaN is truthy in Python, so a plain ``url or id`` never falls back to the
    id when the url cell is empty; each cell is therefore tested explicitly.
    """
    for column in ("match_url", "match_id"):
        value = row.get(column)
        if _is_missing(value):
            continue
        # Unwrap numpy scalars to builtin Python types.
        if hasattr(value, "item"):
            value = value.item()
        # Id columns containing blanks are read as floats (123.0); restore the
        # integer so str() yields "123" for file names and the skip check.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return value
    return None


def _scrape_with_fallback(label, method_name, fallback, match_ref, match_id):
    """Run one ScraperFC scraper, falling back to the raw-endpoint wrapper.

    The fallback is used when the ScraperFC method is missing, raises, or
    returns an empty frame. Errors raised by the fallback itself propagate
    to the caller.
    """
    print(f"   → {label}")
    try:
        frame = getattr(sofascore, method_name)(match_ref)
        if not frame.empty:
            return frame
    except Exception as exc:
        # Not a bare ``except``: Ctrl+C and SystemExit must still stop the run.
        print(f"      · ScraperFC falló ({type(exc).__name__}); usando respaldo")
    return fallback(match_id)


for i, row in df_matches.iterrows():
    match_ref = _resolve_match_ref(row)
    if match_ref is None:
        logs.append((i, None, "ERROR", "Falta match_url y match_id"))
        print(f"[{i+1}/{total}] ❌ Fila sin identificador.")
        continue

    # URLs carry the id after "#id:"; a bare id passes through unchanged.
    match_id = str(match_ref).split("#id:")[-1]

    if match_id in procesados:
        print(f"[{i+1}/{total}] ⏭ Partido {match_id} ya procesado. Se omite.")
        continue

    try:
        print(f"[{i+1}/{total}] Procesando partido {match_id} ...")

        df1 = _scrape_with_fallback(
            "team_stats", "scrape_team_match_stats",
            safe_scrape_team_stats, match_ref, match_id,
        )
        df2 = _scrape_with_fallback(
            "player_stats", "scrape_player_match_stats",
            safe_scrape_player_stats, match_ref, match_id,
        )
        df3 = _scrape_with_fallback(
            "avg_positions", "scrape_player_average_positions",
            safe_scrape_avg_positions, match_ref, match_id,
        )
        df4 = _scrape_with_fallback(
            "shotmap", "scrape_match_shots",
            safe_scrape_shotmap, match_ref, match_id,
        )
        df5 = _scrape_with_fallback(
            "momentum", "scrape_match_momentum",
            safe_scrape_momentum, match_ref, match_id,
        )

        # --- Heatmaps, without any touches filter ---
        print("   → heatmaps (ScraperFC sin filtro)")
        df6 = pd.DataFrame()
        try:
            if not df2.empty:
                # Normalise player-stat column names (str() guards non-string labels).
                df2.columns = [
                    str(c).lower().strip().replace(" ", "_").replace(".", "_")
                    for c in df2.columns
                ]
                # Scrape every available heatmap for the match.
                all_heatmaps = sofascore.scrape_heatmaps(match_ref)
                df6 = pd.DataFrame([
                    {
                        "player": pname,
                        "player_id": pdata.get("id"),
                        "heatmap": pdata.get("heatmap", []),
                    }
                    for pname, pdata in all_heatmaps.items()
                ])
                print(f"      · Jugadores con heatmap descargado: {len(df6)}")
        except Exception as e:
            print(f"      · ERROR heatmaps: {e}")
            df6 = pd.DataFrame(columns=["player", "player_id", "heatmap"])

        # --- Save one workbook per match, one sheet per dataset ---
        out_xlsx = out_dir / f"Sofascore_{match_id}.xlsx"
        with pd.ExcelWriter(out_xlsx, engine="openpyxl") as w:
            df1.to_excel(w, sheet_name="Team Stats", index=False)
            df2.to_excel(w, sheet_name="Player Stats", index=False)
            df3.to_excel(w, sheet_name="Average Positions", index=False)
            df4.to_excel(w, sheet_name="Shotmap", index=False)
            df5.to_excel(w, sheet_name="Match Momentum", index=False)
            df6.to_excel(w, sheet_name="Heatmaps", index=False)

        logs.append((match_id, str(out_xlsx), "OK", ""))
        print(f"[{i+1}/{total}] ✅ Partido {match_id} procesado y guardado.\n")

    except Exception as e:
        # Per-match boundary: record the traceback and move on to the next row.
        logs.append((match_id, None, "ERROR", traceback.format_exc()))
        print(f"[{i+1}/{total}] ❌ Error en {match_id}: {e}\n")



Usando archivo: Partidos_detalles_faltantes_2025.xlsx
👉 41 partidos detectados ya procesados. Se saltarán.

[1/143] ⏭ Partido 13939386 ya procesado. Se omite.
[2/143] ⏭ Partido 13940654 ya procesado. Se omite.
[3/143] ⏭ Partido 13565841 ya procesado. Se omite.
[4/143] ⏭ Partido 13668058 ya procesado. Se omite.
[5/143] ⏭ Partido 13668481 ya procesado. Se omite.
[6/143] ⏭ Partido 13679408 ya procesado. Se omite.
[7/143] ⏭ Partido 13679410 ya procesado. Se omite.
[8/143] ⏭ Partido 13679409 ya procesado. Se omite.
[9/143] ⏭ Partido 13679411 ya procesado. Se omite.
[10/143] ⏭ Partido 13679403 ya procesado. Se omite.
[11/143] ⏭ Partido 13679405 ya procesado. Se omite.
[12/143] ⏭ Partido 13679395 ya procesado. Se omite.
[13/143] ⏭ Partido 13679402 ya procesado. Se omite.
[14/143] ⏭ Partido 13679398 ya procesado. Se omite.
[15/143] ⏭ Partido 13679396 ya procesado. Se omite.
[16/143] ⏭ Partido 13679427 ya procesado. Se omite.
[17/143] ⏭ Partido 13679387 ya procesado. Se omite.
[18/143] ⏭ Partid