In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error




In [2]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [3]:
df = pd.DataFrame(player_data)

In [4]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [5]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [6]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [7]:
df = df.sort_values(['account_id', 'match_start_time'])


In [8]:
SEQ_LEN=5

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15437 entries, 1126 to 14854
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   match_id                  15437 non-null  string 
 1   match_start_time          15437 non-null  int64  
 2   account_id                15437 non-null  string 
 3   name                      15437 non-null  object 
 4   fantasy_role              15437 non-null  object 
 5   hero_id                   15437 non-null  string 
 6   hero_variant              15437 non-null  object 
 7   isRadiant                 15437 non-null  bool   
 8   radiant_team_id           15437 non-null  string 
 9   dire_team_id              15437 non-null  string 
 10  tournament_start_date     15437 non-null  object 
 11  duration                  15437 non-null  int64  
 12  win                       15437 non-null  boolean
 13  kills                     15437 non-null  int64  
 14  deaths  

In [10]:
not_features = [
    'match_id', 'name', 'match_start_time', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score',
    'rolling_winrate_10',
    'rolling_winrate_15',
    'rolling_fantasy_score_10',
    'rolling_fantasy_score_15',
    'rolling_enemy_glicko2_10',
    'rolling_enemy_glicko2_15',
]

In [11]:
not_features_fe = [
    'match_id', 'name', 'match_start_time', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score',
]

In [12]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [13]:
df = pd.get_dummies(df, columns=['fantasy_role', 'hero_id', 'hero_variant'])

# LSTM without FE

In [14]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [15]:
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

Create sequences

In [16]:
X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.20, random_state=1001)

In [18]:
early_stop = EarlyStopping(
    monitor='val_loss',      
    patience=3,
    restore_best_weights=True
)

In [19]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(256, return_sequences=True),
    layers.Dropout(0.3),
    layers.LSTM(256, return_sequences=True),
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(64, return_sequences=True),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop]
)

Epoch 1/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 79.4669 - val_loss: 27.0072
Epoch 2/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 25.1415 - val_loss: 25.7938
Epoch 3/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 25.0028 - val_loss: 25.3532
Epoch 4/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - loss: 24.9256 - val_loss: 24.9942
Epoch 5/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 24.1587 - val_loss: 24.7750
Epoch 6/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 24.2456 - val_loss: 24.6322
Epoch 7/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - loss: 24.3883 - val_loss: 24.5345
Epoch 8/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 24.7290 - val_loss: 24.5116
Epoch 9/30
[1m3

# LSTM with FE

In [20]:
features_fe = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]#df[features].isna().sum()
df[features_fe] = df[features_fe].fillna(df[features_fe].mean())

In [21]:
scaler = StandardScaler()
df[features_fe] = scaler.fit_transform(df[features_fe])

In [22]:
X_seqs_fe = []
y_seqs_fe = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs_fe.append(player_df.loc[i:i+SEQ_LEN-1, features_fe].values)
        y_seqs_fe.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs_fe = np.array(X_seqs_fe)
y_seqs_fe = np.array(y_seqs_fe)

In [23]:
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_seqs_fe, y_seqs_fe, test_size=0.20, random_state=1001)

In [24]:
model_fe = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train_fe.shape[2])),
    layers.LSTM(256, return_sequences=True),
    layers.Dropout(0.3),
    layers.LSTM(256, return_sequences=True),
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(64, return_sequences=True),
    layers.Dense(1)
])
model_fe.compile(optimizer='adam', loss='mse')
model_fe.summary()

history = model_fe.fit(
    X_train_fe, y_train_fe,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop]
)

Epoch 1/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 77.8265 - val_loss: 27.3076
Epoch 2/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 26.5736 - val_loss: 26.6591
Epoch 3/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 26.3071 - val_loss: 26.3080
Epoch 4/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 25.9919 - val_loss: 25.7579
Epoch 5/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 25.4280 - val_loss: 25.3314
Epoch 6/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 24.3948 - val_loss: 24.9864
Epoch 7/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 25.0060 - val_loss: 24.8162
Epoch 8/30
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 24.1044 - val_loss: 24.6481
Epoch 9/30
[1m3

# Comparison LSTM with/without FE

In [25]:
y_pred = model.predict(X_test)
print("LSTM without FE, RMSE:", root_mean_squared_error(y_test, y_pred))

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


ValueError: Found array with dim 3, while dim <= 2 is required.

In [None]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
print("LSTM without FE, Train RMSE:", root_mean_squared_error(y_train, y_train_pred))
print("LSTM without FE, Test RMSE:", root_mean_squared_error(y_test, y_test_pred))
print("LSTM without FE, Train MAE:", mean_absolute_error(y_train, y_train_pred))
print("LSTM without FE, Test MAE:", mean_absolute_error(y_test, y_test_pred))
print("LSTM without FE, Train R2:", r2_score(y_train, y_train_pred))
print("LSTM without FE, Test R2:", r2_score(y_test, y_test_pred))

In [None]:
y_pred_fe = model_fe.predict(X_test_fe)
print("LSTM with FE, RMSE:", root_mean_squared_error(y_test_fe, y_pred_fe))

In [None]:
print("LSTM with FE, Train RMSE:", root_mean_squared_error(y_train, y_train_pred))
print("LSTM with FE, Test RMSE:", root_mean_squared_error(y_test, y_test_pred))
print("LSTM with FE, Train MAE:", mean_absolute_error(y_train, y_train_pred))
print("LSTM with FE, Test MAE:", mean_absolute_error(y_test, y_test_pred))
print("LSTM with FE, Train R2:", r2_score(y_train, y_train_pred))
print("LSTM with FE, Test R2:", r2_score(y_test, y_test_pred))

In [None]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [None]:
def predict_players_for_matchups_lstm(df, model, matchup_list, features, seq_len=10, lineup_dir="data/lineups"):
    results = []
    for team_id, enemy_team_id in matchup_list:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        try:
            with open(lineup_path, "r", encoding="utf-8") as f:
                lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        except FileNotFoundError:
            continue

        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if len(player_hist) < seq_len:
                continue

            seq_df = player_hist.iloc[-seq_len:].copy()
            seq_df["team_id"] = team_id
            seq_df["enemy_team_id"] = enemy_team_id

            # Preprocessing (same as training)
            seq_df['isRadiant'] = seq_df['isRadiant'].astype(int)
            seq_df['win'] = seq_df['win'].astype(int)
            seq_df['firstblood_claimed'] = seq_df['firstblood_claimed'].astype(int)
            for col in features:
                if col not in seq_df.columns:
                    seq_df[col] = 0
            seq_df = seq_df[features]
            seq_df = seq_df.fillna(seq_df.mean()).astype(np.float32)
            print(seq_df.head())
            print(seq_df.sum())
            X_pred = seq_df.values.reshape(1, seq_len, len(features))
            pred_score = model.predict(X_pred)[0][0]
            results.append({
                "account_id": account_id,
                "name": player.get("name", ""),
                "team_id": team_id,
                "enemy_team_id": enemy_team_id,
                "predicted_fantasy_score": pred_score
            })
    return pd.DataFrame(results)

In [None]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symmetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups_lstm(df, model, symmetric_matchups, features, seq_len=SEQ_LEN)

In [None]:
result

In [None]:
#print(np.std(X_test, axis=0)) 

In [None]:
print(np.mean(y_train))

In [None]:
y_train[0]

In [None]:
y_train[1]

In [None]:
y_pred = model.predict(X_test)
print(y_pred[:10])