# Pass Network EDA — AUTOGEN 📊⚽

Notebook autosuficiente para explorar **redes de pases** a partir de un CSV tipo WhoScored y los artefactos generados por `scripts/build_graph.py`.

**Requisitos previos (en tu entorno `futgnn`):**
- `pandas`, `networkx`, `matplotlib`, `torch`, `torch_geometric`
- Haber generado `data/processed/passes_GLOBAL.gpickle` y `data/processed/passes_GLOBAL.pt` con `scripts/build_graph.py` (o con el *batch* incluido, `batch_build_and_report.py`).

> Puedes editar los parámetros (archivo CSV, umbrales de inferencia) en la siguiente celda.

In [None]:
# === Utils + Parámetros ===
from pathlib import Path
import pandas as pd, pickle, networkx as nx
import torch

# Localiza la raíz del repo (carpetas 'data' y 'src')
def find_repo_root(start: Path | None = None) -> Path:
    p = (start or Path.cwd()).resolve()
    for cand in [p, *p.parents]:
        if (cand / "data").exists() and (cand / "src").exists():
            return cand
    return p

def read_gpickle_robust(path: Path):
    """Load a pickled graph from ``path``.

    Uses ``networkx.readwrite.gpickle.read_gpickle`` when that helper can be
    imported and otherwise unpickles the file directly with :mod:`pickle`.
    Only a failed import triggers the fallback; errors raised while actually
    reading the file propagate to the caller instead of being retried.

    Args:
        path: Location of the ``.gpickle`` file.

    Returns:
        The object stored in the file.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load artifacts
    # that were produced locally / come from a trusted source.
    try:
        from networkx.readwrite.gpickle import read_gpickle
    except ImportError:
        # Helper not importable in this environment (e.g. a NetworkX release
        # that no longer ships it): use plain pickle instead.
        read_gpickle = None

    if read_gpickle is not None:
        return read_gpickle(path)
    with open(path, "rb") as f:
        return pickle.load(f)

def map_from(df, key, value):
    """Build a ``{df[key]: df[value]}`` lookup dictionary.

    Rows with a missing value in either column are ignored, and when a key
    occurs several times only its first row is used.

    Args:
        df: Table to read from.
        key: Name of the column providing the dictionary keys.
        value: Name of the column providing the dictionary values.

    Returns:
        The lookup dictionary, or an empty dict when either column is absent.
    """
    if key not in df or value not in df:
        return {}
    complete_pairs = df[[key, value]].dropna()
    first_per_key = complete_pairs.drop_duplicates(key)
    return first_per_key.set_index(key)[value].to_dict()

# --- Editable parameters ---
# Event CSV to analyse; change the name here to use a different file
# from data/raw.
CSV_NAME = "Atletico Madrid 1-0 Real Madrid-Champions League-2025-04-07.csv"
# NOTE(review): presumably the inference thresholds mentioned in the notebook
# intro; neither value is used by the cells visible in this file.
NEAR_DIST = 10.0
LOOKAHEAD = 15

# Every input path is resolved relative to the detected repository root.
ROOT = find_repo_root()
RAW = ROOT.joinpath("data", "raw", CSV_NAME)
PROC_G = ROOT.joinpath("data", "processed", "passes_GLOBAL.gpickle")
PROC_PT = ROOT.joinpath("data", "processed", "passes_GLOBAL.pt")

# Report where each input is expected and whether it is present on disk.
print("ROOT:", ROOT)
for _label, _path in (
    ("RAW exists?    ", RAW),
    ("PROC_G exists? ", PROC_G),
    ("PROC_PT exists?", PROC_PT),
):
    print(_label, _path.exists(), _path)

In [None]:
# === Load events and artifacts ===
# If the GLOBAL artifacts do not exist they can be generated with the batch
# script (file: batch_build_and_report.py).
# NOTE: on_bad_lines="skip" silently drops malformed CSV rows.
events = pd.read_csv(RAW, encoding="utf-8-sig", on_bad_lines="skip")

# NetworkX graph
G = read_gpickle_robust(PROC_G)

# PyG object (optional, used for node features).
# The .pt file presumably stores a pickled torch_geometric object rather than
# plain tensors, so it needs full unpickling. weights_only is passed
# explicitly because PyTorch >= 2.6 defaults it to True and would refuse to
# load such a file. Only do this with trusted, locally generated artifacts.
data = torch.load(PROC_PT, weights_only=False)
print((G.number_of_nodes(), G.number_of_edges()), data)

In [None]:
# === Minimal lookups and average (x, y) position per player ===
# One playerId -> attribute dictionary per attribute column.
name_map, shirt_map, team_map = (
    map_from(events, "playerId", column)
    for column in ("shortName", "shirtNo", "teamId")
)

if {"playerId", "x", "y", "type"}.issubset(events.columns):
    # Mean x/y over the rows whose type is "pass" (case-insensitive),
    # ignoring rows with a missing player or coordinate.
    _is_pass = events["type"].astype(str).str.lower() == "pass"
    pos_df = events.loc[_is_pass, ["playerId", "x", "y"]].dropna()
    pos_xy = pos_df.groupby("playerId")[["x", "y"]].mean().to_dict(orient="index")
else:
    # Without the required columns no positions can be derived.
    pos_xy = {}

print("Players with names:", len(name_map))
print("Players with avg pos:", len(pos_xy))

In [None]:
# === Quick summaries ===
import numpy as np


def _edge_row(src, dst, attrs):
    """Flatten one graph edge into a record with per-weight averages."""
    count = attrs.get("weight", 1)
    denom = max(1, count)  # never divide by less than 1
    return {
        "from_id": src,
        "to_id": dst,
        "weight": count,
        "success_rate": attrs.get("success_count", 0) / denom,
        "dist_avg": attrs.get("dist_sum", 0.0) / denom,
    }


def _node_row(node):
    """Describe one graph node: identity, degrees and mean position."""
    xy = pos_xy.get(node, {})
    return {
        "player_id": node,
        "name": name_map.get(node, str(node)),  # fall back to the raw id
        "shirt": shirt_map.get(node),
        "team": team_map.get(node),
        "deg_out": G.out_degree(node),
        "deg_in": G.in_degree(node),
        "x_mean": xy.get("x", np.nan),
        "y_mean": xy.get("y", np.nan),
    }


# Edge table, heaviest connections first.
edges = [_edge_row(*edge) for edge in G.edges(data=True)]
edges_df = pd.DataFrame(edges).sort_values(by="weight", ascending=False)

# Node table, grouped by team and ordered by descending out-degree.
nodes = [_node_row(n) for n in G.nodes()]
nodes_df = pd.DataFrame(nodes).sort_values(
    by=["team", "deg_out"], ascending=[True, False]
)

edges_df.head(10), nodes_df.head(10)

In [None]:
# === Step 3: Centralities (degree/strength, betweenness, PageRank) ===
import numpy as np, networkx as nx
from IPython.display import display

deg_out = dict(G.out_degree())
deg_in  = dict(G.in_degree())

# Strength = sum of edge weights leaving / entering each node
# (an edge without a 'weight' attribute counts as 1).
out_strength = {n: 0.0 for n in G.nodes()}
in_strength  = {n: 0.0 for n in G.nodes()}
for u, v, d in G.edges(data=True):
    w = float(d.get('weight', 1))
    out_strength[u] += w
    in_strength[v]  += w

# Unweighted betweenness: every edge has the same cost.
bet_unw = nx.betweenness_centrality(G, normalized=True)

# Weighted betweenness works on path lengths, so use 1/weight as the edge
# length: heavier edges become shorter. Work on a copy so the extra
# attribute does not leak into G.
H = G.copy()
for u, v, d in H.edges(data=True):
    w = float(d.get('weight', 1))
    d['length'] = 1.0 / max(w, 1e-9)  # floor avoids division by zero
bet_w = nx.betweenness_centrality(H, normalized=True, weight='length')

# PageRank stays best-effort (NaN column on failure), but only for the
# expected failure modes, and the reason is reported instead of being
# swallowed silently.
try:
    pr_w = nx.pagerank(G, alpha=0.85, weight='weight', max_iter=100)
except (nx.PowerIterationFailedConvergence, ImportError) as exc:
    print(f"PageRank unavailable ({type(exc).__name__}: {exc}); filling with NaN.")
    pr_w = {n: np.nan for n in G.nodes()}

# Attach every metric to the node table, aligned on player_id.
nodes_df = nodes_df.set_index('player_id')
nodes_df['str_out']       = nodes_df.index.map(out_strength.get)
nodes_df['str_in']        = nodes_df.index.map(in_strength.get)
nodes_df['betweenness']   = nodes_df.index.map(bet_unw.get)
nodes_df['betweenness_w'] = nodes_df.index.map(bet_w.get)
nodes_df['pagerank_w']    = nodes_df.index.map(pr_w.get)
nodes_df = nodes_df.reset_index()

def top(df, col, k=15):
    """Return the ``k`` rows of ``df`` with the largest values in ``col``.

    Only the identifying columns (name, player_id, team, shirt) that are
    present in ``df`` are kept, followed by ``col`` itself.
    """
    wanted = ('name', 'player_id', 'team', 'shirt', col)
    present = [label for label in wanted if label in df.columns]
    ranked = df.sort_values(by=col, ascending=False)
    return ranked[present].head(k)

# Show one ranking table per metric: (heading, column to rank by).
for _heading, _metric in (
    ("Top betweenness (no ponderado):", 'betweenness'),
    ("Top betweenness (ponderado 1/weight):", 'betweenness_w'),
    ("Top PageRank (ponderado):", 'pagerank_w'),
    ("Top out-strength (suma de pesos):", 'str_out'),
):
    print(_heading)
    display(top(nodes_df, _metric))

In [None]:
# === Visualization: pass network laid out by average position ===
import matplotlib.pyplot as plt
import networkx as nx

# Nodes with a known average position are placed there ...
pos = {
    n: (float(pos_xy[n]["x"]), float(pos_xy[n]["y"]))
    for n in G.nodes()
    if pos_xy.get(n) is not None
}
# ... and the remaining ones get coordinates from a seeded spring layout
# computed on the subgraph they induce.
missing = [n for n in G.nodes() if n not in pos]
if missing:
    spring_pos = nx.spring_layout(G.subgraph(missing), seed=1)
    pos.update((n, spring_pos[n]) for n in missing)

plt.figure(figsize=(8, 10))
# Line width grows with the edge weight and is capped at 4.
weights = [min(4, 0.2 * attrs.get("weight", 1)) for _, _, attrs in G.edges(data=True)]
# Label each node with the player name, falling back to the raw id.
labels = {n: name_map.get(n, str(n)) for n in G.nodes()}
nx.draw_networkx_edges(G, pos, width=weights, arrows=True, arrowsize=10)
nx.draw_networkx_nodes(G, pos, node_size=120)
nx.draw_networkx_labels(G, pos, labels=labels, font_size=7)
plt.axis("off")
plt.title("Red de pases (layout por posición promedio)")
plt.show()

In [None]:
# === Pass matrix (source × target) ===
import numpy as np
import matplotlib.pyplot as plt

# Order rows by total weight sent and columns by total weight received,
# largest first.
out_strength_order = edges_df.groupby("from_id")["weight"].sum().sort_values(ascending=False).index.tolist()
in_strength_order  = edges_df.groupby("to_id")["weight"].sum().sort_values(ascending=False).index.tolist()

# aggfunc="sum": pivot_table averages by default, which would under-count a
# (from, to) pair that appears on more than one row of edges_df. With unique
# pairs the result is the same as before.
mat = edges_df.pivot_table(
    index="from_id", columns="to_id", values="weight", aggfunc="sum", fill_value=0
)
mat = mat.reindex(index=out_strength_order, columns=in_strength_order)

# Tick labels: player name when known, otherwise the raw id.
row_labels = [name_map.get(i, str(i)) for i in mat.index]
col_labels = [name_map.get(i, str(i)) for i in mat.columns]

plt.figure(figsize=(9, 8))
plt.imshow(mat.values, aspect="auto")
plt.colorbar(label="# pases")
plt.xticks(range(len(col_labels)), col_labels, rotation=90, fontsize=6)
plt.yticks(range(len(row_labels)), row_labels, fontsize=6)
plt.title("Matriz de pases (origen × destino)")
plt.tight_layout()
plt.show()

In [None]:
# === Export: tables to data/processed, figures to reports/figures ===
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np


def _finish_figure(title, target):
    """Title the current figure, tighten its layout, save it and close it."""
    plt.title(title)
    plt.tight_layout()
    plt.savefig(target, dpi=200)
    plt.close()


OUT_DIR = ROOT.joinpath("data", "processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Save the node and edge tables as CSV (without the DataFrame index).
nodes_csv = OUT_DIR / "nodes_GLOBAL.csv"
edges_csv = OUT_DIR / "edges_GLOBAL.csv"
for _table, _target in ((nodes_df, nodes_csv), (edges_df, edges_csv)):
    _table.to_csv(_target, index=False)
print("CSV guardados:", nodes_csv, edges_csv)

# Save the figures
FIG_DIR = ROOT.joinpath("reports", "figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Network: same drawing as the interactive cell, written to disk instead of
# shown. Reuses `pos` computed by the visualization cell.
plt.figure(figsize=(8, 10))
weights = [min(4, 0.2 * attrs.get("weight", 1)) for _, _, attrs in G.edges(data=True)]
labels = {n: name_map.get(n, str(n)) for n in G.nodes()}
nx.draw_networkx_edges(G, pos, width=weights, arrows=True, arrowsize=10)
nx.draw_networkx_nodes(G, pos, node_size=120)
nx.draw_networkx_labels(G, pos, labels=labels, font_size=7)
plt.axis("off")
_finish_figure("Red de pases (layout por posición promedio)", FIG_DIR / "pass_network_GLOBAL.png")

# Matrix: reuses `mat`, `row_labels` and `col_labels` from the matrix cell.
plt.figure(figsize=(9, 8))
plt.imshow(mat.values, aspect="auto")
plt.colorbar(label="# pases")
plt.xticks(range(len(col_labels)), col_labels, rotation=90, fontsize=6)
plt.yticks(range(len(row_labels)), row_labels, fontsize=6)
_finish_figure("Matriz de pases (origen × destino)", FIG_DIR / "pass_matrix_GLOBAL.png")

print("Figuras guardadas en:", FIG_DIR)