# Entrenamiento del Modelo Live Win Probability (LWP)

**Objetivo:** Entrenar un modelo de Machine Learning que prediga la probabilidad de victoria en vivo basado en el estado actual del partido.

**Datos:** 4 temporadas de Liga MX con datos 360 de StatsBomb (este notebook entrena con los eventos de una muestra de los primeros 50 partidos)

**Salidas:** 
- `P(Victoria Local)`
- `P(Empate)`
- `P(Victoria Visitante)`

## 1. Setup y Configuración

In [None]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
import statsbombpy.sb as sb

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

print(f"Setup complete - {datetime.now()}")

## 2. Inicializar Spark Session con GPU

In [None]:
# Build (or reuse) the Spark session against the standalone master, with
# Arrow-based pandas conversion and the NVIDIA RAPIDS SQL plugin configured.
spark = (
    SparkSession.builder
    .appName("LWP-Model-Training-GPU")
    .master("spark://spark-master:7077")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.rapids.sql.enabled", "true")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")
print(f"Spark Master: {spark.sparkContext.master}")
print(f"Spark UI: http://localhost:4040")
print("\nSpark Configuration:")
# Echo only the settings whose key mentions RAPIDS or GPU.
for conf_key, conf_value in spark.sparkContext.getConf().getAll():
    key_lower = conf_key.lower()
    if 'rapids' in key_lower or 'gpu' in key_lower:
        print(f"  {conf_key}: {conf_value}")

## 3. Cargar Datos Históricos de StatsBomb

In [None]:
# StatsBomb competition identifier used for Liga MX in this notebook.
# NOTE(review): confirm this id against the competitions table printed below.
COMPETITION_ID = 40  # Liga MX

print("Cargando competiciones...")
competitions = sb.competitions()
liga_mx = competitions[competitions['competition_id'] == COMPETITION_ID]
if liga_mx.empty:
    # Fail here with a clear message instead of later inside pd.concat.
    raise ValueError(f"No seasons found for competition_id={COMPETITION_ID}")

# Order seasons chronologically by name so that "last 4" does not depend on
# whatever row order the API happens to return.
liga_mx = liga_mx.sort_values('season_name')
print(f"\nTemporadas disponibles de Liga MX:")
print(liga_mx[['season_id', 'season_name']])

# Get the last 4 seasons (most recent by season name)
season_ids = liga_mx['season_id'].tail(4).tolist()
print(f"\nTemporadas seleccionadas: {season_ids}")

In [None]:
# Fetch the match list of every selected season, then stack them into a
# single pandas table with a fresh index.
all_matches = []

for season_id in season_ids:
    print(f"\nCargando partidos de temporada {season_id}...")
    season_matches = sb.matches(
        competition_id=COMPETITION_ID,
        season_id=season_id,
    )
    all_matches.append(season_matches)
    print(f"  - {len(season_matches)} partidos cargados")

matches_df = pd.concat(all_matches, ignore_index=True)
print(f"\nTotal de partidos: {len(matches_df)}")
print(f"Columnas: {matches_df.columns.tolist()}")
# Last expression of the cell: rendered by the notebook as a preview.
matches_df.head()

## 4. Feature Engineering - Extracción de Características

In [None]:
def extract_features_from_events(match_id, events_df, matches=None):
    """
    Build periodic match-state snapshots for one match.

    A snapshot is produced every 5 minutes from minute 0 to minute 90. Each
    snapshot aggregates every event whose ``minute`` is at or before the
    snapshot minute; snapshot minutes with no events yet are skipped.

    Args:
        match_id: Identifier of the match; looked up in ``matches``.
        events_df: pandas DataFrame with the match events. Must contain the
            columns ``minute``, ``team`` and ``type``; ``shot_outcome`` is
            used when present.
        matches: Optional pandas DataFrame with the columns ``match_id``,
            ``home_team``, ``away_team``, ``home_score`` and ``away_score``.
            Defaults to the notebook-level ``matches_df``.

    Returns:
        A list of dicts, one per snapshot, with the keys ``match_id``,
        ``minute``, ``home_score``, ``away_score``, ``score_diff``,
        ``home_shots``, ``away_shots``, ``shots_diff``, ``home_passes``,
        ``away_passes``, ``passes_diff``, ``home_possession``,
        ``time_remaining`` and ``result``. ``result`` is ``'home_win'``,
        ``'draw'`` or ``'away_win'`` according to the final score stored in
        ``matches``.

    Raises:
        IndexError: If ``match_id`` is not present in ``matches``.
    """
    # Fall back to the notebook-level matches table when none is passed in.
    if matches is None:
        matches = matches_df

    # Get match info
    match_rows = matches[matches['match_id'] == match_id]
    if match_rows.empty:
        # Same exception type the bare .iloc[0] would raise, with context.
        raise IndexError(f"match_id {match_id} not found in matches table")
    match_info = match_rows.iloc[0]
    home_team = match_info['home_team']
    away_team = match_info['away_team']
    home_score = match_info['home_score']
    away_score = match_info['away_score']

    # Determine final result (the training label, shared by all snapshots)
    if home_score > away_score:
        result = 'home_win'
    elif home_score < away_score:
        result = 'away_win'
    else:
        result = 'draw'

    # Per-event boolean masks, computed once and reused by every snapshot.
    event_minute = events_df['minute']
    event_type = events_df['type']
    by_home = events_df['team'] == home_team
    by_away = events_df['team'] == away_team
    is_shot = event_type == 'Shot'
    is_pass = event_type == 'Pass'

    # A goal is a shot whose outcome is 'Goal' ...
    if 'shot_outcome' in events_df.columns:
        is_goal = is_shot & (events_df['shot_outcome'] == 'Goal')
    else:
        is_goal = is_shot & False
    # ... or an own goal, credited to the team carrying the
    # 'Own Goal For' event; otherwise the running score drifts away from
    # the final result used as label.
    is_goal = is_goal | (event_type == 'Own Goal For')

    features = []

    # Create snapshots every 5 minutes
    for minute in range(0, 95, 5):
        elapsed = event_minute <= minute

        # Nothing has happened yet at this minute: no snapshot.
        if not elapsed.any():
            continue

        home_now = elapsed & by_home
        away_now = elapsed & by_away

        # Calculate current score
        current_home_score = int((home_now & is_goal).sum())
        current_away_score = int((away_now & is_goal).sum())

        # Calculate stats
        home_shots = int((home_now & is_shot).sum())
        away_shots = int((away_now & is_shot).sum())
        home_passes = int((home_now & is_pass).sum())
        away_passes = int((away_now & is_pass).sum())

        # Calculate possession (simplified): share of all events so far
        # that belong to the home team; 0.5 when neither team has events.
        home_event_count = int(home_now.sum())
        total_events = home_event_count + int(away_now.sum())
        home_possession = home_event_count / total_events if total_events > 0 else 0.5

        features.append({
            'match_id': match_id,
            'minute': minute,
            'home_score': current_home_score,
            'away_score': current_away_score,
            'score_diff': current_home_score - current_away_score,
            'home_shots': home_shots,
            'away_shots': away_shots,
            'shots_diff': home_shots - away_shots,
            'home_passes': home_passes,
            'away_passes': away_passes,
            'passes_diff': home_passes - away_passes,
            'home_possession': home_possession,
            'time_remaining': 90 - minute,
            'result': result,
        })

    return features

In [None]:
# Build the training table: download the events of each sampled match and
# turn them into periodic snapshots. A failing match is reported and skipped.
print("Extrayendo características de los partidos...")
print("Nota: Este proceso puede tomar varios minutos debido a las llamadas a la API.\n")

all_features = []
match_ids = matches_df['match_id'].head(50).tolist()  # Use first 50 matches for training
total_matches = len(match_ids)

for position, match_id in enumerate(match_ids, start=1):
    try:
        print(f"[{position}/{total_matches}] Procesando match {match_id}...", end=" ")
        match_events = sb.events(match_id=match_id, split=False, flatten_attrs=True)
        snapshots = extract_features_from_events(match_id, match_events)
        all_features.extend(snapshots)
        print(f"✓ {len(snapshots)} snapshots")
    except Exception as err:
        # Best-effort: report the failure and move on to the next match.
        print(f"✗ Error: {err}")

# Convert to pandas DataFrame
features_pd = pd.DataFrame(all_features)
print(f"\nTotal de snapshots (filas): {len(features_pd)}")
print(f"Distribución de resultados:")
result_counts = features_pd['result'].value_counts()
print(result_counts)
# Last expression of the cell: rendered by the notebook as a preview.
features_pd.head(10)

## 5. Preparar Datos para Entrenamiento en Spark

In [None]:
# Hand the pandas snapshot table over to Spark, then show its schema, its
# row count and the first rows.
spark_df = spark.createDataFrame(features_pd)

print("Spark DataFrame creado:")
spark_df.printSchema()
row_count = spark_df.count()
print(f"\nNúmero de filas: {row_count}")
spark_df.show(10)

In [None]:
# Numeric columns fed to the model, in the order they appear in the vector.
feature_cols = [
    'minute', 'home_score', 'away_score', 'score_diff',
    'home_shots', 'away_shots', 'shots_diff',
    'home_passes', 'away_passes', 'passes_diff',
    'home_possession', 'time_remaining'
]

# Packs the numeric columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Encodes the string outcome in "result" as a numeric "label" column.
# With default settings StringIndexer numbers labels by descending frequency.
label_indexer = StringIndexer(inputCol="result", outputCol="label")

# Apply both steps: assemble first, then fit and apply the indexer.
data_with_features = assembler.transform(spark_df)
fitted_indexer = label_indexer.fit(data_with_features)
data_with_labels = fitted_indexer.transform(data_with_features)

print("Datos preparados para entrenamiento:")
preview = data_with_labels.select('features', 'label', 'result')
preview.show(10, truncate=False)

In [None]:
# Split data into training and test sets BY MATCH, not by row.
# All snapshots of one match share the same final result and are highly
# correlated, so a row-level random split places the same match in both
# sets and inflates the evaluation metrics.
match_ids_sdf = data_with_labels.select('match_id').distinct()
train_matches, test_matches = match_ids_sdf.randomSplit([0.8, 0.2], seed=42)

# left_semi keeps only rows (and columns) of data_with_labels whose
# match_id appears in the corresponding id set.
train_data = data_with_labels.join(train_matches, on='match_id', how='left_semi')
test_data = data_with_labels.join(test_matches, on='match_id', how='left_semi')

print(f"Training set: {train_data.count()} filas")
print(f"Test set: {test_data.count()} filas")

## 6. Entrenar Modelo de Clasificación

In [None]:
# Random Forest configuration: 100 trees, depth capped at 10, fixed seed.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=10,
    numTrees=100,
    seed=42,
)

# NOTE(review): the messages mention GPU; whether fit() actually runs on GPU
# depends on the cluster setup - confirm in the Spark UI.
print("Entrenando modelo Random Forest con GPU...")
print("Revisa Spark UI en http://localhost:4040 para métricas de rendimiento\n")

# Wall-clock time around fit() only.
start_time = datetime.now()
rf_model = rf.fit(train_data)
elapsed = datetime.now() - start_time
training_time = elapsed.total_seconds()

print(f"✓ Modelo entrenado en {training_time:.2f} segundos")
print(f"✓ Número de árboles: {rf_model.getNumTrees}")
print(f"✓ Feature importances disponibles")

## 7. Evaluar Modelo

In [None]:
# Score the held-out rows; the model output is previewed through the
# 'prediction' and 'probability' columns.
predictions = rf_model.transform(test_data)

print("Predicciones del modelo:")
preview_cols = ['minute', 'score_diff', 'result', 'label', 'prediction', 'probability']
predictions.select(*preview_cols).show(20, truncate=False)

In [None]:
# Two evaluators over the same label/prediction columns, differing only in
# the metric they compute.
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
evaluator_f1 = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

accuracy = evaluator_accuracy.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

# Metrics report framed by 50-character rules.
separator = "=" * 50
print(separator)
print("MÉTRICAS DEL MODELO")
print(separator)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"Training Time: {training_time:.2f} seconds")
print(separator)

In [None]:
# Pair each feature name with its importance from the trained forest and
# rank them from most to least important.
importance_values = rf_model.featureImportances.toArray()
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': importance_values,
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nImportancia de características:")
print(feature_importance)

## 8. Guardar Modelo Entrenado

In [None]:
# Save model to disk (overwrites any model already stored at MODEL_PATH)
MODEL_PATH = "/work/models/lwp_model"

print(f"Guardando modelo en {MODEL_PATH}...")
rf_model.write().overwrite().save(MODEL_PATH)
print("✓ Modelo guardado exitosamente")

# Recover and print the label mapping: label_mapping[i] is the result string
# encoded as numeric label i. StringIndexer orders labels by frequency by
# default, so the mapping is read from the fitted indexer instead of being
# assumed to be home_win/draw/away_win.
label_mapping = label_indexer.fit(data_with_features).labels
label_descriptions = {
    'home_win': 'Victoria Local',
    'draw': 'Empate',
    'away_win': 'Victoria Visitante',
}
print(f"\nMapeo de etiquetas: {label_mapping}")
for label_index, label_name in enumerate(label_mapping):
    description = label_descriptions.get(label_name, label_name)
    print(f"  {label_index} = {description} ({label_name})")

## 9. Test de Inferencia

In [None]:
# Test inference with sample data. Each tuple follows the order of
# feature_cols: minute, home_score, away_score, score_diff, home_shots,
# away_shots, shots_diff, home_passes, away_passes, passes_diff,
# home_possession, time_remaining.
test_scenario = spark.createDataFrame([
    # Scenario 1: Home team winning 2-0 at minute 70
    (70, 2, 0, 2, 8, 3, 5, 250, 180, 70, 0.58, 20),
    # Scenario 2: Tied 1-1 at minute 45
    (45, 1, 1, 0, 5, 5, 0, 200, 200, 0, 0.50, 45),
    # Scenario 3: Away team leading 0-1 at minute 80
    # (passes_diff is home - away: 280 - 220 = 60)
    (80, 0, 1, -1, 6, 8, -2, 280, 220, 60, 0.56, 10),
], feature_cols)

test_features = assembler.transform(test_scenario)
test_predictions = rf_model.transform(test_features)

print("Test de inferencia con escenarios de ejemplo:")
test_predictions.select(
    'minute', 'score_diff', 'home_possession', 'time_remaining',
    'prediction', 'probability'
).show(truncate=False)

# The position of each class inside the probability vector follows the
# label index assigned by the StringIndexer, which depends on label
# frequency; read it from label_mapping instead of assuming a fixed order.
probability_names = {
    'home_win': 'Victoria Local',
    'draw': 'Empate',
    'away_win': 'Victoria Visitante',
}
print("\nInterpretación de probabilidades:")
for label_index, label_name in enumerate(label_mapping):
    print(f"probability[{label_index}] = P({probability_names.get(label_name, label_name)})")

## 10. Resumen y Próximos Pasos

In [None]:
# Final report. The number of matches is taken from the snapshot table, since
# only a sample of matches_df is processed and failed matches are skipped.
processed_matches = features_pd['match_id'].nunique()

print("="*60)
print("RESUMEN DEL ENTRENAMIENTO")
print("="*60)
print(f"✓ Modelo: Random Forest Classifier")
print(f"✓ Datos: {processed_matches} partidos procesados de {len(matches_df)} disponibles")
print(f"✓ Features: {len(feature_cols)} características")
print(f"✓ Training samples: {train_data.count()}")
print(f"✓ Test samples: {test_data.count()}")
print(f"✓ Accuracy: {accuracy:.4f}")
print(f"✓ F1 Score: {f1_score:.4f}")
print(f"✓ Training time: {training_time:.2f} seconds")
print(f"✓ Modelo guardado en: {MODEL_PATH}")
print("="*60)
print("\nPRÓXIMOS PASOS:")
print("1. Ejecutar notebook 03_Streaming_Estadisticas.ipynb")
print("2. Ejecutar notebook 04_Streaming_Inferencia_LWP.ipynb")
print("3. Capturar métricas desde Spark UI (localhost:4040)")
print("="*60)

In [None]:
# Stop Spark session
# The stop call below is left commented out, so running this cell does not
# end the session; the printed messages tell the user how to stop it manually.
# spark.stop()
print("\nNota: Spark session sigue activa para exploración adicional.")
print("Ejecuta 'spark.stop()' cuando termines.")