# Modelos secuenciales (RNN / LSTM / GRU)

Objetivo: entrenar modelos secuenciales que usen la historia reciente de cada selección (ventaja local/visitante, goles, resultados previos) + variables estáticas para predecir el **resultado del partido**: {0: local, 1: empate, 2: visita}.

Basado en: "Modelos secuenciales" (SimpleRNN/LSTM, validación temporal, seq-to-seq).

Formato esperado de entrada:
1. results_clean.csv
2. goalscorers_clean.csv
3. former_names_clean.csv
4. shootouts_clean.csv

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, SimpleRNN, LSTM, GRU, Input, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import files

Se carga el archivo .zip y se descomprime

In [2]:
uploaded = files.upload()
!unzip datasets_clean.zip

Saving datasets_clean.zip to datasets_clean.zip
Archive:  datasets_clean.zip
 extracting: results_clean.csv       
 extracting: goalscorers_clean.csv   
 extracting: former_names_clean.csv  
 extracting: shootouts_clean.csv     


Cargar los archivos csv cleans

In [4]:
df_results = pd.read_csv("results_clean.csv", parse_dates=['date'])
df_goalscorers = pd.read_csv("goalscorers_clean.csv")
df_former = pd.read_csv("former_names_clean.csv")
df_shootouts = pd.read_csv("shootouts_clean.csv")

print(df_results.shape, df_goalscorers.shape, df_former.shape, df_shootouts.shape)


(48532, 12) (44354, 8) (34, 4) (653, 5)


Crear la columna objetivo 'outcome' (0 local, 1 empate, 2 visita)

In [5]:
def outcome_from_scores(h,g):
    if h > g: return 0
    if h == g: return 1
    return 2

df_results['outcome'] = df_results.apply(lambda r: outcome_from_scores(r['home_score'], r['away_score']), axis=1)
df_results['year'] = df_results['date'].dt.year

print("Distribución de clases:")
print(df_results['outcome'].value_counts(normalize=True))

Distribución de clases:
outcome
0    0.490336
2    0.282288
1    0.227376
Name: proportion, dtype: float64


Features estáticos simples: home advantage flag (ya implícito), tournament, neutral. Normalizar nombres de equipos

In [6]:
df_results['home_team'] = df_results['home_team'].str.title()
df_results['away_team'] = df_results['away_team'].str.title()

# ejemplo de features estáticas a usar
df_results['is_neutral'] = df_results['neutral'].astype(int) if 'neutral' in df_results.columns else 0
# turn tournament/country into categorical codes (si existen)
if 'tournament' in df_results.columns:
    df_results['tournament_code'] = df_results['tournament'].astype('category').cat.codes
else:
    df_results['tournament_code'] = 0

df_results[['home_team','away_team','date','home_score','away_score','outcome']].head()

Unnamed: 0,home_team,away_team,date,home_score,away_score,outcome
0,Scotland,England,1872-11-30,0,0,1
1,England,Scotland,1873-03-08,4,2,0
2,Scotland,England,1874-03-07,2,1,0
3,England,Scotland,1875-03-06,2,2,1
4,Scotland,England,1876-03-04,3,0,0


Funciones para construir historial de los ultimos n partidos por equipo

idea: para cada partido, construiremos dos secuencias: historial_local (últ N partidos del home team) y historial_visitante (últ N partidos del away team). Cada historial puede incluir: resultado(0/1/2), goles_for, goals_against


In [7]:
def build_team_histories(df_results, look_back=5, features=['result','goals_for','goals_against']):
    # preparar dataframe con eventos por equipo y fecha
    home = df_results[['date','home_team','home_score','away_score']].copy()
    home.columns = ['date','team','goals_for','goals_against']
    home['is_home'] = 1
    away = df_results[['date','away_team','away_score','home_score']].copy()
    away.columns = ['date','team','goals_for','goals_against']
    away['is_home'] = 0
    events = pd.concat([home, away], ignore_index=True).sort_values(['team','date'])
    # calcular resultado por equipo
    def res(gf,ga):
        if gf>ga: return 1  # win
        if gf==ga: return 0  # draw
        return -1           # loss
    events['result'] = events.apply(lambda r: res(r['goals_for'], r['goals_against']), axis=1)
    # for each team and date, build rolling windows -- we'll return events so we can index later
    return events

# ejemplo de uso
events = build_team_histories(df_results, look_back=5)
events.groupby('team').size().sort_values(ascending=False).head()

Unnamed: 0_level_0,0
team,Unnamed: 1_level_1
Sweden,1093
England,1082
Argentina,1059
Brazil,1051
Germany,1023


Para cada partido, extraer las últimas N filas del equipo antes de la fecha

In [8]:
from collections import deque

def get_last_n_for_team(events_df, team, date, n=5):
    team_events = events_df[(events_df['team']==team) & (events_df['date']<date)].sort_values('date', ascending=False)
    rows = team_events.head(n)
    # si faltan, rellenar con ceros
    if rows.shape[0] < n:
        pad = pd.DataFrame({
            'date':[date]* (n - rows.shape[0]),
            'team':[team]* (n - rows.shape[0]),
            'goals_for':[0]*(n - rows.shape[0]),
            'goals_against':[0]*(n - rows.shape[0]),
            'is_home':[0]*(n - rows.shape[0]),
            'result':[0]*(n - rows.shape[0])
        })
        rows = pd.concat([rows, pad], ignore_index=True)
    return rows.sort_values('date').reset_index(drop=True)

# prueba rápida con el primer partido
sample = df_results.iloc[0]
print(sample['date'], sample['home_team'], sample['away_team'])
print(get_last_n_for_team(events, sample['home_team'], sample['date'], n=5).tail())


1872-11-30 00:00:00 Scotland England
        date      team  goals_for  goals_against  is_home  result
0 1872-11-30  Scotland          0              0        0       0
1 1872-11-30  Scotland          0              0        0       0
2 1872-11-30  Scotland          0              0        0       0
3 1872-11-30  Scotland          0              0        0       0
4 1872-11-30  Scotland          0              0        0       0


Armado final de X (secuencias + estáticas) y (outcome)

In [9]:
look_back = 6  # por ejemplo: 6 partidos previos
rows = []
for idx, row in df_results.iterrows():
    date = row['date']
    home = row['home_team']
    away = row['away_team']
    # secuencias
    seq_home = get_last_n_for_team(events, home, date, n=look_back)
    seq_away = get_last_n_for_team(events, away, date, n=look_back)
    # features: transformar a arrays (por ejemplo: result, goals_for, goals_against)
    home_seq = np.vstack([seq_home['result'].values, seq_home['goals_for'].values, seq_home['goals_against'].values]).T  # shape (look_back, 3)
    away_seq = np.vstack([seq_away['result'].values, seq_away['goals_for'].values, seq_away['goals_against'].values]).T
    # estáticos simples
    is_neutral = int(row.get('neutral', 0)) if 'neutral' in row.index else 0
    tournament = int(row.get('tournament_code', 0))
    # etiqueta
    y = int(row['outcome'])
    rows.append({
        'home_seq': home_seq,
        'away_seq': away_seq,
        'is_neutral': is_neutral,
        'tournament': tournament,
        'y': y,
        'date': date
    })

# convertir a arrays (esto puede ocupar memoria; si hay problema hay que muestrear)
print("Partidos procesados:", len(rows))


Partidos procesados: 48532


Convertir lista en matrices numpy para Keras y concatenamos home_seq y away_seq en una sola secuencia multicanal

In [10]:
n = len(rows)
n_features_team = 3
X_seq = np.zeros((n, look_back, n_features_team*2), dtype=float)
X_static = np.zeros((n, 2), dtype=float)  # is_neutral, tournament (puedes ampliar)
y = np.zeros(n, dtype=int)
dates = []

for i, r in enumerate(rows):
    hs = r['home_seq']  # (look_back, 3)
    as_ = r['away_seq']
    X_seq[i, :, :n_features_team] = hs
    X_seq[i, :, n_features_team:] = as_
    X_static[i, 0] = r['is_neutral']
    X_static[i, 1] = r['tournament']
    y[i] = r['y']
    dates.append(r['date'])

print("X_seq", X_seq.shape, "X_static", X_static.shape, "y", y.shape)


X_seq (48532, 6, 6) X_static (48532, 2) y (48532,)


Escalar las columnas numéricas de las secuencias (por característica) aplana temporalmente para escalar por feature

In [11]:
X_seq_reshaped = X_seq.reshape(-1, X_seq.shape[-1])
scaler_seq = StandardScaler()
X_seq_scaled = scaler_seq.fit_transform(X_seq_reshaped).reshape(X_seq.shape)

# escalar X_static
scaler_static = StandardScaler()
X_static_scaled = scaler_static.fit_transform(X_static)

# codificar etiquetas en one-hot para Keras si se desea
from tensorflow.keras.utils import to_categorical
y_cat = to_categorical(y, num_classes=3)


Setup de validación temporal

In [12]:
tscv = TimeSeriesSplit(n_splits=5)  # ajustable

def eval_model(model, X_seq_train, X_static_train, y_train, X_seq_val, X_static_val, y_val):
    # combinar si el modelo espera concatenado
    # ejemplo: si modelo usa solo X_seq, pasa solo X_seq; si espera concat, ajustar
    y_pred_prob = model.predict([X_seq_val, X_static_val]) if isinstance(model.input, list) else model.predict(X_seq_val)
    y_pred = np.argmax(y_pred_prob, axis=1)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='macro')
    return acc, f1, y_pred

Modelo secuencial: entrada secuencias + features estáticas

In [13]:
tf.keras.backend.clear_session()

# define model that consumes sequence and static input
seq_input = tf.keras.layers.Input(shape=(look_back, n_features_team*2), name='seq_input')
x = SimpleRNN(64, activation='relu')(seq_input)
x = Dropout(0.3)(x)

static_input = tf.keras.layers.Input(shape=(X_static_scaled.shape[1],), name='static_input')
s = Dense(16, activation='relu')(static_input)

concat = tf.keras.layers.concatenate([x, s])
out = Dense(32, activation='relu')(concat)
out = Dense(3, activation='softmax')(out)

model_rnn = tf.keras.models.Model(inputs=[seq_input, static_input], outputs=out)
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_rnn.summary()


Entrenamiento rápido

In [14]:
split = int(n*0.8)
X_seq_train, X_seq_val = X_seq_scaled[:split], X_seq_scaled[split:]
X_static_train, X_static_val = X_static_scaled[:split], X_static_scaled[split:]
y_train_cat, y_val_cat = y_cat[:split], y_cat[split:]

es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model_rnn.fit([X_seq_train, X_static_train], y_train_cat,
                        validation_data=([X_seq_val, X_static_val], y_val_cat),
                        epochs=100, batch_size=128, callbacks=[es], verbose=1)


Epoch 1/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.4981 - loss: 1.0275 - val_accuracy: 0.5406 - val_loss: 0.9761
Epoch 2/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5343 - loss: 0.9808 - val_accuracy: 0.5379 - val_loss: 0.9767
Epoch 3/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5332 - loss: 0.9812 - val_accuracy: 0.5395 - val_loss: 0.9717
Epoch 4/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5347 - loss: 0.9774 - val_accuracy: 0.5414 - val_loss: 0.9695
Epoch 5/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5403 - loss: 0.9693 - val_accuracy: 0.5402 - val_loss: 0.9712
Epoch 6/100
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5369 - loss: 0.9719 - val_accuracy: 0.5364 - val_loss: 0.9759
Epoch 7/100
[1m304/30

Evaluación

In [15]:
y_val = np.argmax(y_val_cat, axis=1)
y_pred_prob = model_rnn.predict([X_seq_val, X_static_val])
y_pred = np.argmax(y_pred_prob, axis=1)

print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1 macro:", f1_score(y_val, y_pred, average='macro'))
print(classification_report(y_val, y_pred, digits=4))
cm = confusion_matrix(y_val, y_pred)
print("Confusion matrix:\n", cm)

[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.5413619037807768
F1 macro: 0.3824799600148374
              precision    recall  f1-score   support

           0     0.5531    0.8603    0.6733      4646
           1     0.0000    0.0000    0.0000      2235
           2     0.5071    0.4452    0.4741      2826

    accuracy                         0.5414      9707
   macro avg     0.3534    0.4352    0.3825      9707
weighted avg     0.4124    0.5414    0.4603      9707

Confusion matrix:
 [[3997    0  649]
 [1661    0  574]
 [1568    0 1258]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Modelo LSTM

In [16]:
tf.keras.backend.clear_session()
seq_input = tf.keras.layers.Input(shape=(look_back, n_features_team*2))
x = LSTM(64, activation='tanh')(seq_input)
x = Dropout(0.3)(x)
static_input = tf.keras.layers.Input(shape=(X_static_scaled.shape[1],))
s = Dense(16, activation='relu')(static_input)
concat = tf.keras.layers.concatenate([x, s])
out = Dense(32, activation='relu')(concat)
out = Dense(3, activation='softmax')(out)
model_lstm = tf.keras.models.Model(inputs=[seq_input, static_input], outputs=out)
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_lstm.summary()

# entrenamiento de ejemplo (prueba con menos epochs para iterar rápido)
es = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
model_lstm.fit([X_seq_train, X_static_train], y_train_cat,
               validation_data=([X_seq_val, X_static_val], y_val_cat),
               epochs=50, batch_size=128, callbacks=[es], verbose=1)

Epoch 1/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.5124 - loss: 1.0042 - val_accuracy: 0.5411 - val_loss: 0.9701
Epoch 2/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5332 - loss: 0.9780 - val_accuracy: 0.5374 - val_loss: 0.9698
Epoch 3/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5393 - loss: 0.9713 - val_accuracy: 0.5390 - val_loss: 0.9675
Epoch 4/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.5381 - loss: 0.9744 - val_accuracy: 0.5373 - val_loss: 0.9686
Epoch 5/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.5394 - loss: 0.9715 - val_accuracy: 0.5365 - val_loss: 0.9718
Epoch 6/50
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5374 - loss: 0.9720 - val_accuracy: 0.5379 - val_loss: 0.9697
Epoch 7/50
[1m304/304

<keras.src.callbacks.history.History at 0x7a4d002e6270>

Guardar modelos entrenados

In [18]:
model_rnn.save("model_rnn.h5")
model_lstm.save("model_lstm.h5")

