In [68]:
# =========================
# 0) Imports y configuración
# =========================
from statsbombpy import sb
import pandas as pd
import numpy as np
from collections import Counter
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from numpy.linalg import norm
from random import sample
import math
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Let pandas show up to 200 columns when displaying wide frames.
pd.set_option("display.max_columns", 200)

In [69]:
# ==== BLOQUE 1: Lectura de eventos desde carpetas season_* ====
import os
from pathlib import Path
import pandas as pd

BASE_EVENTS = Path("Datos") / "Eventos"   # change this if your data lives elsewhere

# Event types used to build the Player2Vec vocabulary
ACTION_TYPES = {'Pass','Carry','Dribble','Shot','Ball Recovery','Interception','Pressure'}

def listar_temp_y_archivos(base_dir=BASE_EVENTS):
    """Return ``[(season_id, [parquet paths...]), ...]`` ordered by numeric season id.

    Parameters
    ----------
    base_dir : str or pathlib.Path
        Directory that contains ``season_<id>`` sub-folders, each holding
        ``events_*.parquet`` files.

    Returns
    -------
    list of (int, list of pathlib.Path)
        One entry per season folder that contains at least one matching file.
        Seasons are sorted numerically (``season_2`` before ``season_10``);
        the files inside each season are sorted by path.

    Raises
    ------
    FileNotFoundError
        If ``base_dir`` does not exist.
    """
    import warnings

    base_dir = Path(base_dir)  # also accept plain strings
    if not base_dir.exists():
        raise FileNotFoundError(f"No existe {base_dir.resolve()}")

    items = []
    for d in sorted(base_dir.glob("season_*")):
        if not d.is_dir():
            continue
        try:
            season_id = int(d.name.split("_")[-1])
        except ValueError:
            # A folder such as "season_backup" must not abort the whole listing.
            warnings.warn(f"Carpeta ignorada (sufijo de temporada no numérico): {d.name}")
            continue
        files = sorted(d.glob("events_*.parquet"))
        if files:
            items.append((season_id, files))

    # Folder names sort lexicographically ("season_10" < "season_2"), so order
    # explicitly by the parsed id. list.sort is stable, which keeps the path
    # order for folders that map to the same id.
    items.sort(key=lambda item: item[0])
    return items


In [70]:
def cargar_eventos(base_dir=BASE_EVENTS, seasons=None):
    """
    Read every events parquet file found under ``base_dir`` and concatenate them.

    Parameters
    ----------
    base_dir : path-like
        Forwarded to ``listar_temp_y_archivos`` to discover the season folders.
    seasons : iterable of season ids, optional
        Season ids to keep (``None`` = all). Any iterable is accepted,
        including a generator; it is consumed exactly once.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh integer index. The columns
        player_id, type, match_id, minute, second, location_x and location_y
        are guaranteed to exist (filled with ``pd.NA`` when a file lacks them),
        and ``season_id`` is set from the folder the file was read from.

    Raises
    ------
    FileNotFoundError
        If no parquet file remains after discovery and the ``seasons`` filter.

    Notes
    -----
    Prints a short audit (row count, most frequent event types, a preview and
    the number of rows usable for tokenisation) as a side effect.
    """
    season_files = listar_temp_y_archivos(base_dir)
    if seasons is not None:
        # Build the set once: rebuilding it per element exhausts a generator
        # after the first comparison and repeats the same work for every season.
        wanted = set(seasons)
        season_files = [(sid, fls) for sid, fls in season_files if sid in wanted]
    if not season_files:
        raise FileNotFoundError(f"No encontré archivos en {base_dir}/season_*/events_*.parquet")

    required_cols = ['player_id','type','match_id','minute','second','location_x','location_y']
    chunks = []
    for sid, files in season_files:
        for p in files:
            df = pd.read_parquet(p)
            # make sure the columns used downstream always exist
            for col in required_cols:
                if col not in df.columns:
                    df[col] = pd.NA
            # folder-derived id; replaces any season_id stored in the file
            df['season_id'] = sid
            chunks.append(df)

    events = pd.concat(chunks, ignore_index=True)

    # coerce id/time/coordinate columns to numeric (unparseable values -> NaN)
    numeric_cols = ['player_id','match_id','minute','second','location_x','location_y','season_id','competition_id']
    for c in numeric_cols:
        if c in events.columns:
            events[c] = pd.to_numeric(events[c], errors='coerce')

    # quick audit
    print(f"Total filas: {len(events):,}")
    if 'type' in events.columns:
        print("\nTop tipos de 'type':")
        print(events['type'].value_counts().head(20))
    keep = ['match_id','timestamp','minute','second','type','player_id','location_x','location_y','under_pressure','counterpress']
    keep = [c for c in keep if c in events.columns]
    print("\n=== PREVIEW COLUMNAS DISPONIBLES ===")
    print("\t".join(keep))
    print(events[keep].head(10))

    # how many rows are usable for tokenisation; a boolean mask avoids
    # materialising a copy of the action-type rows just to count them
    is_action = events['type'].isin(ACTION_TYPES)
    n_action = int(is_action.sum())
    n_pid = int(events.loc[is_action, 'player_id'].notna().sum())
    print(f"\nFilas con ACTION_TYPES: {n_action:,}")
    print(f"Con player_id no nulos: {n_pid:,}")

    return events

In [71]:
# --- example: read EVERYTHING found under Datos/Eventos ---
events = cargar_eventos(BASE_EVENTS)

Total filas: 3,522,149

Top tipos de 'type':
type
Pass               970427
Ball Receipt*      892480
Carry              764547
Pressure           329042
Ball Recovery      102196
Duel                77305
Clearance           45662
Block               39085
Goal Keeper         34943
Dribble             32813
Miscontrol          31167
Foul Committed      29844
Shot                29264
Foul Won            28181
Dispossessed        26273
Interception        23313
Dribbled Past       19086
Substitution         9609
Injury Stoppage      7488
Half End             4388
Name: count, dtype: Int64

=== PREVIEW COLUMNAS DISPONIBLES ===
match_id	timestamp	minute	second	type	player_id	location_x	location_y	under_pressure	counterpress
   match_id     timestamp  minute  second         type  player_id  location_x  \
0   3799351  00:00:00.000       0       0  Starting XI        NaN         NaN   
1   3799351  00:00:00.000       0       0  Starting XI        NaN         NaN   
2   3799351  00:00:00.000

In [72]:
# ==== BLOQUE 2: Tokenización y documentos por jugador (XY) ====
import pandas as pd
import numpy as np
from collections import Counter

# Event types that characterise a player's style
ACTION_TYPES = {'Pass','Carry','Dribble','Shot','Ball Recovery','Interception','Pressure'}

# Translation tables for the pass-height and body-part modifiers
_ALTURA_PASE = {'Ground Pass':'Raso','Low Pass':'Bajo','High Pass':'Alto'}
_PARTE_CUERPO = {'Right Foot':'PieDer','Left Foot':'PieIzq','Head':'Cabeza'}

# Pitch zones (StatsBomb: x in [0,120], y in [0,80])
def _tercio(x):
    """Map an x coordinate to a pitch third: 'Def', 'Med' or 'Ata'."""
    if x < 40:
        return 'Def'
    if x < 80:
        return 'Med'
    return 'Ata'

def _carril(y):
    """Map a y coordinate to a lane: 'Izq', 'Cen' or 'Der'."""
    if y < 26.67:
        return 'Izq'
    if y < 53.33:
        return 'Cen'
    return 'Der'

def zona_token_from_xy(row):
    """Return the 3x3 zone token for ``row`` or 'Zona_NA' when a coordinate is missing."""
    px, py = row.get('location_x'), row.get('location_y')
    if not (pd.isna(px) or pd.isna(py)):
        return "Z_" + _tercio(float(px)) + "_" + _carril(float(py))
    return 'Zona_NA'

def _is_true(x):
    """True only for a genuine boolean True (Python or NumPy); numbers and NaN give False."""
    if not isinstance(x, (bool, np.bool_)):
        return False
    return bool(x)

def _get(row, col):
    """Return ``row[col]`` when the key exists and the value is not null, else None."""
    if col not in row:
        return None
    value = row[col]
    return value if pd.notna(value) else None

def event_to_token_plus_xy(row):
    """
    Build the 'word' for one event by joining, with underscores:
      - the event type (Pass/Carry/Dribble/Shot/Ball Recovery/Interception/Pressure)
      - the 3x3 zone derived from location_x/location_y
      - whichever modifiers are present (under_pressure, counterpress,
        pass/shot/dribble attributes)

    Returns None for event types outside ACTION_TYPES.
    """
    kind = row['type']
    if kind not in ACTION_TYPES:
        return None

    parts = [kind, zona_token_from_xy(row)]

    # context flags shared by every event type
    for flag_col, label in (('under_pressure', 'BajoPresion'), ('counterpress', 'ContraPresion')):
        if _is_true(row.get(flag_col)):
            parts.append(label)

    if kind == 'Pass':
        height = _get(row, 'pass_height_name')
        body = _get(row, 'pass_body_part_name')
        outcome = _get(row, 'pass_outcome_name')
        if height:
            parts.append(_ALTURA_PASE.get(height, height.replace(' ','_')))
        if body:
            parts.append(_PARTE_CUERPO.get(body, body.replace(' ','_')))
        if outcome:
            parts.append('Out_' + outcome.replace(' ',''))
    elif kind == 'Shot':
        technique = _get(row, 'shot_technique_name')
        body = _get(row, 'shot_body_part_name')
        if _is_true(_get(row, 'shot_first_time')):
            parts.append('PrimerToque')
        if technique:
            parts.append(technique.replace(' ',''))
        if body:
            parts.append(_PARTE_CUERPO.get(body, body.replace(' ','_')))
    elif kind == 'Dribble':
        outcome = _get(row, 'dribble_outcome_name')
        if outcome:
            parts.append(outcome)

    return "_".join(parts)

def build_player_docs_basic_xy(df: pd.DataFrame, min_actions=30, tok_fn=event_to_token_plus_xy):
    """
    Build one document (list of tokens) per player_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table. Must contain 'player_id' and 'type'; 'player_name' and the
        sort columns are optional.
    min_actions : int
        Players with fewer tokens than this are dropped.
    tok_fn : callable
        Receives one event as a ``{column: value}`` dict and returns its token,
        or a falsy value to skip the event.

    Returns
    -------
    pandas.Series
        Token lists indexed by a MultiIndex (player_id, player_name). When the
        frame has no player name for an id, the label ``player_<id>`` is used.
        If no token is generated at all, an empty object Series is returned and
        an [INFO] line is printed.

    Notes
    -----
    Events are ordered by whichever of (match_id, minute, second) exist, and
    tokens are grouped by player_id ONLY, because player_name may be NaN.
    NOTE(review): 'period' is not part of the sort key; if minute values overlap
    between periods the within-match order may interleave. Confirm against the
    data before relying on exact sequences.
    """
    def _no_tokens():
        print("[INFO] No se generaron tokens (revisa 'type', 'player_id' y location_x/y).")
        return pd.Series(dtype=object)

    # Checked once up front instead of on every row of the loop.
    if 'player_id' not in df.columns or 'type' not in df.columns:
        return _no_tokens()

    # Drop rows without a player before the Python-level loop, so they are not
    # turned into dicts only to be discarded.
    df = df[df['player_id'].notna()]

    sort_cols = [c for c in ('match_id','minute','second') if c in df.columns]
    if sort_cols:  # sort_values([]) is not meaningful; keep input order instead
        # mergesort is stable, so ties keep their original order when a single
        # key is used
        df = df.sort_values(sort_cols, kind='mergesort')

    cols = list(df.columns)
    has_name = 'player_name' in cols

    rows = []
    for values in df.itertuples(index=False, name=None):
        event = dict(zip(cols, values))
        tok = tok_fn(event)
        if tok:
            rows.append((event['player_id'], event['player_name'] if has_name else None, tok))

    if not rows:
        return _no_tokens()

    tok_df = pd.DataFrame(rows, columns=['player_id','player_name','token'])

    # Group by player_id; use the first non-null name as the label
    agg = tok_df.groupby('player_id').agg(
        tokens=('token', list),
        name=('player_name', lambda s: next((x for x in s if pd.notna(x)), None)),
    )

    # safe fallback name when none was available (pd.isna also covers None)
    agg['player_name'] = [
        (f"player_{int(pid)}" if pd.isna(nm) else str(nm))
        for pid, nm in zip(agg.index, agg['name'])
    ]
    agg = agg.drop(columns=['name'])

    # keep only players with enough actions
    agg = agg[agg['tokens'].apply(len) >= min_actions]

    docs = pd.Series(
        agg['tokens'].values,
        index=pd.MultiIndex.from_arrays([agg.index, agg['player_name']], names=['player_id','player_name'])
    )
    return docs

# --- build the per-player documents ---
player_docs = build_player_docs_basic_xy(events, min_actions=30)

# --- diagnóstico rápido del corpus ---
def quick_diag(docs, topn=20):
    """Print a corpus summary: number of players, tokens-per-player stats and the ``topn`` most frequent tokens."""
    documents = list(docs.values)
    freq = Counter(tok for doc in documents for tok in doc)
    print(f"# jugadores: {len(docs)}")
    sizes = list(map(len, documents))
    if sizes:
        print(f"tokens/jugador -> min:{min(sizes)} p50:{np.median(sizes):.0f} max:{max(sizes)} mean:{np.mean(sizes):.1f}")
    print(f"# vocab: {len(freq)} | Top {topn}:")
    for token, count in freq.most_common(topn):
        print(f"{token:55s} {count}")

# Summarise the corpus built above (20 most frequent tokens).
quick_diag(player_docs, topn=20)


# jugadores: 942
tokens/jugador -> min:30 p50:1256 max:17340 mean:2389.2
# vocab: 180 | Top 20:
Pass_Z_Med_Izq                                          139237
Pass_Z_Med_Der                                          137414
Pass_Z_Med_Cen                                          117056
Pass_Z_Def_Cen                                          110933
Carry_Z_Med_Izq                                         89193
Carry_Z_Med_Der                                         87688
Pass_Z_Ata_Der                                          82789
Carry_Z_Med_Cen                                         75521
Pass_Z_Ata_Izq                                          73777
Pass_Z_Def_Izq                                          66884
Pass_Z_Def_Der                                          64383
Carry_Z_Def_Cen                                         62226
Carry_Z_Med_Izq_BajoPresion                             46608
Carry_Z_Ata_Der                                         45468
Carry_Z_Med_Der_BajoPresion     

In [73]:
# ==== BLOQUE 3: Entrenamiento de embeddings ====
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np

def train_embeddings(player_docs, w2v_dim=128, d2v_dim=128, window=8, min_count=1, workers=4, epochs=12,
                     max_words=10_000, seed=1):
    """
    Train a skip-gram Word2Vec and a PV-DM Doc2Vec model on the player documents.

    Parameters
    ----------
    player_docs : pandas.Series
        Token lists indexed by (player_id, player_name).
    w2v_dim, d2v_dim : int
        Vector sizes of the Word2Vec and Doc2Vec models.
    window, min_count, workers, epochs
        Forwarded unchanged to both gensim models.
    max_words : int
        Maximum number of tokens per training text. gensim's optimised training
        routines truncate any single text to 10,000 words, so longer player
        documents are split into consecutive chunks of at most this size.
        Values above 10,000 bring the truncation back.
    seed : int
        Seed passed to both models (1 is gensim's own default). Fully
        reproducible runs additionally require workers=1, per the gensim docs.

    Returns
    -------
    (Word2Vec, Doc2Vec)
        Doc2Vec tags are ``str(player_id)``; every chunk of a player shares the
        same tag, so there is still exactly one document vector per player.

    Raises
    ------
    ValueError
        If there are no documents or ``max_words`` < 1.
    """
    if max_words < 1:
        raise ValueError("max_words debe ser >= 1")

    corpus = [list(doc) for doc in player_docs.values]
    if not corpus:
        raise ValueError("Corpus vacío: no hay documentos de jugadores.")

    def _chunks(doc):
        # An empty document still yields one (empty) chunk so its tag is registered.
        if not doc:
            yield doc
            return
        for start in range(0, len(doc), max_words):
            yield doc[start:start + max_words]

    total_tokens = sum(len(d) for d in corpus)
    print(f"Entrenando Word2Vec con {len(corpus)} documentos y {total_tokens:,} tokens...")

    # Without chunking, everything past the 10,000th token of a long player
    # document would be silently ignored during training.
    sentences = [chunk for doc in corpus for chunk in _chunks(doc)]

    w2v = Word2Vec(
        sentences=sentences,
        vector_size=w2v_dim,
        window=window,
        min_count=min_count,
        sg=1,             # Skip-gram
        negative=10,
        workers=workers,
        epochs=epochs,
        seed=seed
    )

    tagged = [
        TaggedDocument(words=chunk, tags=[str(pid)])
        for (pid, _), doc in zip(player_docs.index, corpus)
        for chunk in _chunks(doc)
    ]
    print(f"Entrenando Doc2Vec ({len(corpus)} jugadores)...")

    d2v = Doc2Vec(
        documents=tagged,
        vector_size=d2v_dim,
        window=window,
        min_count=min_count,
        dm=1,            # PV-DM
        negative=10,
        workers=workers,
        epochs=epochs,
        seed=seed
    )

    print("✅ Entrenamiento completado.")
    return w2v, d2v

# Run: train both models on the player documents
w2v, d2v = train_embeddings(player_docs, w2v_dim=128, d2v_dim=128, window=8, min_count=1, workers=4, epochs=12)

Entrenando Word2Vec con 942 documentos y 2,250,649 tokens...
Entrenando Doc2Vec (942 jugadores)...
✅ Entrenamiento completado.


In [74]:
# ==== BLOQUE 4: Vecinos Doc2Vec (nativo) ====
import pandas as pd

# Quick lookup table of the players that have a document
players_index = pd.DataFrame(
    list(player_docs.index),
    columns=['player_id','player_name'],
)

def find_players(name_substr, top=20):
    """Return up to ``top`` rows of ``players_index`` whose player_name contains ``name_substr``.

    The match is a literal, case-insensitive substring test: characters such
    as '.', '(' or '+' are matched as-is rather than interpreted as a regular
    expression. Rows with a missing name never match.
    """
    mask = players_index['player_name'].str.contains(
        str(name_substr), case=False, na=False, regex=False
    )
    return players_index[mask].head(top)

# id -> name map used to label results
pid2name = dict((float(pid), pname) for pid, pname in player_docs.index)

def doc2vec_neighbors(target_pid, d2v, k=10):
    """Return the ``k`` players most similar to ``target_pid`` using Doc2Vec's native index.

    Parameters
    ----------
    target_pid : int, float or str
        Player id. The key is tried as ``str(target_pid)`` first and then in
        its float and integer spellings, so 43557, 43557.0 and "43557.0" all
        resolve to the same stored tag.
    d2v : Doc2Vec
        Trained model whose document tags are stringified player ids.
    k : int
        Number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        Columns player_id, player_name, similaridad, in the order returned by
        ``most_similar``.

    Raises
    ------
    ValueError
        If none of the key spellings exists in the model.
    """
    candidates = [str(target_pid)]
    try:
        as_float = float(target_pid)
    except (TypeError, ValueError):
        as_float = None
    if as_float is not None:
        candidates.append(str(as_float))
        if as_float.is_integer():
            candidates.append(str(int(as_float)))

    key = next((c for c in candidates if c in d2v.dv), None)
    if key is None:
        raise ValueError(
            f"Doc2Vec no tiene la clave {candidates[0]}. Revisa que el player_id exista en player_docs."
        )

    sims = d2v.dv.most_similar(key, topn=k+20)  # ask for extra in case the player itself shows up

    rows = []
    for tag, sim in sims:
        if len(rows) >= k:
            break
        if tag == key:  # skip the player itself
            continue
        try:
            pid = float(tag)
        except (TypeError, ValueError):
            pid = tag  # non-numeric tag: keep it verbatim

        name = pid2name.get(pid)
        if name is None:
            # Build the fallback label only when needed: int() fails on
            # non-numeric tags and must not break the lookup.
            try:
                name = f"player_{int(pid)}"
            except (TypeError, ValueError, OverflowError):
                name = f"player_{pid}"
        rows.append((pid, name, float(sim)))
    return pd.DataFrame(rows, columns=['player_id','player_name','similaridad'])


In [75]:
# === Usage ===
# 1) Look up your player by part of the name:
find_players("435")  # example

# 2) Take the player_id and ask for neighbours:

Unnamed: 0,player_id,player_name
415,31435.0,player_31435
591,43541.0,player_43541
592,43547.0,player_43547
593,43552.0,player_43552
594,43553.0,player_43553
595,43554.0,player_43554
596,43555.0,player_43555
597,43557.0,player_43557


In [76]:
# Ten nearest Doc2Vec neighbours of player_id 43557.0.
doc2vec_neighbors(43557.0, d2v, k=10)

Unnamed: 0,player_id,player_name,similaridad
0,15872.0,player_15872,0.774386
1,359908.0,player_359908,0.768627
2,99579.0,player_99579,0.751329
3,276099.0,player_276099,0.750072
4,26774.0,player_26774,0.747934
5,26362.0,player_26362,0.747777
6,30701.0,player_30701,0.742793
7,386196.0,player_386196,0.740569
8,28208.0,player_28208,0.737505
9,75306.0,player_75306,0.732017


Help on function lineups in module statsbombpy.sb:

lineups(match_id, fmt='dataframe', creds: dict = {'user': None, 'passwd': None})

