In [99]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [100]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [101]:
df = pd.DataFrame(player_data)

In [102]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [103]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [104]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [105]:
df = df.sort_values(['account_id', 'match_start_time'])


In [106]:
SEQ_LEN=10

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15530 entries, 1126 to 14947
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   match_id                 15530 non-null  string 
 1   match_start_time         15530 non-null  int64  
 2   account_id               15530 non-null  string 
 3   name                     15530 non-null  object 
 4   fantasy_role             15530 non-null  object 
 5   hero_id                  15530 non-null  string 
 6   hero_variant             15530 non-null  object 
 7   isRadiant                15530 non-null  bool   
 8   radiant_team_id          15530 non-null  string 
 9   dire_team_id             15530 non-null  string 
 10  tournament_start_date    15530 non-null  object 
 11  duration                 15530 non-null  int64  
 12  win                      15530 non-null  boolean
 13  kills                    15530 non-null  int64  
 14  deaths                  

In [108]:
not_features = [
    'match_id', 'account_id', 'name', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 'team_id'
]

In [109]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [110]:
for col in ['fantasy_role', 'hero_id', 'hero_variant']:
    if col in df.columns:
        seq_df = pd.get_dummies(df, columns=[col])

In [111]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]
#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [112]:

X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.1, random_state=42)

In [114]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 227.3612 - val_loss: 145.0984
Epoch 2/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 136.2067 - val_loss: 83.6161
Epoch 3/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 77.8408 - val_loss: 49.2093
Epoch 4/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 46.3427 - val_loss: 32.3748
Epoch 5/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 32.3013 - val_loss: 25.6007
Epoch 6/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 27.1478 - val_loss: 23.5151
Epoch 7/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 24.5771 - val_loss: 23.0872
Epoch 8/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 24.3983 - val_loss: 23.0737
Epoch 9/20
[1m340/34

In [115]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("LSTM RMSE:", root_mean_squared_error(y_test, y_pred))

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
LSTM RMSE: 4.810679186983202


In [116]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [117]:
def predict_players_for_matchups_lstm(df, model, matchup_list, features, seq_len=10, lineup_dir="data/lineups"):
    results = []
    for team_id, enemy_team_id in matchup_list:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        try:
            with open(lineup_path, "r", encoding="utf-8") as f:
                lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        except FileNotFoundError:
            continue

        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if len(player_hist) < seq_len:
                continue

            seq_df = player_hist.iloc[-seq_len:].copy()
            seq_df["team_id"] = team_id
            seq_df["enemy_team_id"] = enemy_team_id

            # Preprocessing (same as training)
            seq_df['isRadiant'] = seq_df['isRadiant'].astype(int)
            seq_df['win'] = seq_df['win'].astype(int)
            seq_df['firstblood_claimed'] = seq_df['firstblood_claimed'].astype(int)
            seq_df = pd.get_dummies(seq_df, columns=['fantasy_role', 'hero_id', 'hero_variant'])
            for col in features:
                if col not in seq_df.columns:
                    seq_df[col] = 0
            seq_df = seq_df[features]
            seq_df = seq_df.fillna(seq_df.mean()).astype(np.float32)

            X_pred = seq_df.values.reshape(1, seq_len, len(features))
            pred_score = model.predict(X_pred)[0][0]
            results.append({
                "account_id": account_id,
                "name": player.get("name", ""),
                "team_id": team_id,
                "enemy_team_id": enemy_team_id,
                "predicted_fantasy_score": pred_score
            })
    return pd.DataFrame(results)

In [122]:
print(X_test[0])

[[ 1.74773490e+09  1.00000000e+00  3.32700000e+03  0.00000000e+00
   6.00000000e+00  8.00000000e+00  4.66000000e+02  1.50000000e+01
   5.53000000e+02  1.00000000e+00  0.00000000e+00  6.20689630e-01
   1.00000000e+00  2.00000000e+00  1.00000000e+01  0.00000000e+00
   3.65669000e+01  4.00000000e+00  1.20000000e+01  7.34000000e+02
   7.02910267e-01  1.88005943e+03  2.00390851e+03 -1.23849071e+02]
 [ 1.74774030e+09  0.00000000e+00  1.45800000e+03  1.00000000e+00
   4.00000000e+00  2.00000000e+00  1.54000000e+02  9.00000000e+00
   5.26000000e+02  0.00000000e+00  0.00000000e+00  6.92307700e-01
   2.00000000e+00  0.00000000e+00  1.00000000e+01  0.00000000e+00
   3.46326450e+01  0.00000000e+00  1.40000000e+01  6.44000000e+02
   8.67421180e-01  1.88005943e+03  2.00390851e+03 -1.23849071e+02]
 [ 1.74775290e+09  0.00000000e+00  3.08000000e+03  0.00000000e+00
   1.00000000e+00  6.00000000e+00  3.86000000e+02  1.00000000e+01
   5.08000000e+02  2.00000000e+00  1.00000000e+00  5.90909060e-01
   1.000

In [118]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symmetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups_lstm(df, model, symmetric_matchups, features, seq_len=SEQ_LEN)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

In [119]:
result

Unnamed: 0,account_id,name,team_id,enemy_team_id,predicted_fantasy_score
0,113331514,Miposhka,7119388,8261500,14.7918
1,321580662,Yatoro,7119388,8261500,14.7918
2,302214028,Collapse,7119388,8261500,14.7918
3,106305042,Larl,7119388,8261500,14.7918
4,847565596,rue,7119388,8261500,14.7918
5,203351055,Malik,7119388,8261500,14.7918
6,137129583,Xm,8261500,7119388,14.7918
7,157475523,XinQ,8261500,7119388,14.7918
8,129958758,Xxs,8261500,7119388,14.7918
9,898754153,Ame,8261500,7119388,14.7918


In [120]:
#print(np.std(X_test, axis=0)) 

[[7.75108408e+06 4.99938688e-01 7.01182517e+02 4.99515831e-01
  3.99935614e+00 2.93183958e+00 1.77812931e+02 6.27790807e+00
  1.71484529e+02 1.76007563e+00 5.41532861e-01 1.51123340e-01
  4.93938353e+00 2.69908967e+00 4.90468394e+00 3.08650973e-01
  3.83221563e+01 1.06700209e+01 6.99049054e+00 2.05986508e+02
  2.20934171e-01 1.01445966e+02 1.09258207e+02 1.37424343e+02]
 [7.77068946e+06 4.99938688e-01 6.59718831e+02 4.98847433e-01
  3.90623602e+00 2.96848199e+00 1.74469369e+02 6.26600488e+00
  1.73097451e+02 1.71827227e+00 5.15881923e-01 1.49930088e-01
  4.61615761e+00 2.83049211e+00 5.04078504e+00 2.77785535e-01
  3.51064812e+01 1.10269735e+01 7.06258561e+00 2.05013026e+02
  2.20610238e-01 1.01861630e+02 1.07308649e+02 1.40304167e+02]
 [7.76958785e+06 4.99829669e-01 6.60509630e+02 4.99913104e-01
  3.90659304e+00 3.03205700e+00 1.79370666e+02 6.22060949e+00
  1.70954976e+02 1.67941596e+00 5.13266694e-01 1.57030706e-01
  4.73511376e+00 2.59309004e+00 4.68310321e+00 2.87576233e-01
  3.68

In [121]:
print(np.std(y_train))

4.911157654332858
