In [88]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler

In [89]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [90]:
df = pd.DataFrame(player_data)

In [91]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [92]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [93]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [94]:
df = df.sort_values(['account_id', 'match_start_time'])


In [95]:
SEQ_LEN=5

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15530 entries, 1126 to 14947
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   match_id                 15530 non-null  string 
 1   match_start_time         15530 non-null  int64  
 2   account_id               15530 non-null  string 
 3   name                     15530 non-null  object 
 4   fantasy_role             15530 non-null  object 
 5   hero_id                  15530 non-null  string 
 6   hero_variant             15530 non-null  object 
 7   isRadiant                15530 non-null  bool   
 8   radiant_team_id          15530 non-null  string 
 9   dire_team_id             15530 non-null  string 
 10  tournament_start_date    15530 non-null  object 
 11  duration                 15530 non-null  int64  
 12  win                      15530 non-null  boolean
 13  kills                    15530 non-null  int64  
 14  deaths                  

In [97]:
not_features = [
    'match_id', 'match_start_time', 'account_id', 'name', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 'team_id'
]

In [98]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [99]:
df = pd.get_dummies(df, columns=['fantasy_role', 'hero_id', 'hero_variant'])

In [100]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [101]:
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

Create sequences

In [102]:
X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.15)#, random_state=1001)

In [104]:
early_stop = EarlyStopping(
    monitor='val_loss',      
    patience=3,
    restore_best_weights=True
)

In [105]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(256, return_sequences=True),
    layers.Dropout(0.3),
    layers.LSTM(256, return_sequences=True),
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(64, return_sequences=True),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop]
)

Epoch 1/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - loss: 71.4193 - val_loss: 24.3698
Epoch 2/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 24.3710 - val_loss: 24.0352
Epoch 3/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 22.8926 - val_loss: 22.4400
Epoch 4/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 21.1720 - val_loss: 22.3285
Epoch 5/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 21.2177 - val_loss: 21.8757
Epoch 6/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 20.4170 - val_loss: 22.0487
Epoch 7/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 19.1408 - val_loss: 22.1136
Epoch 8/30
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 17.9232 - val_loss: 23.8843


In [106]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("LSTM RMSE:", root_mean_squared_error(y_test, y_pred))

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
LSTM RMSE: 4.681979958392133


In [107]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [108]:
def predict_players_for_matchups_lstm(df, model, matchup_list, features, seq_len=10, lineup_dir="data/lineups"):
    results = []
    for team_id, enemy_team_id in matchup_list:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        try:
            with open(lineup_path, "r", encoding="utf-8") as f:
                lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        except FileNotFoundError:
            continue

        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if len(player_hist) < seq_len:
                continue

            seq_df = player_hist.iloc[-seq_len:].copy()
            seq_df["team_id"] = team_id
            seq_df["enemy_team_id"] = enemy_team_id

            # Preprocessing (same as training)
            seq_df['isRadiant'] = seq_df['isRadiant'].astype(int)
            seq_df['win'] = seq_df['win'].astype(int)
            seq_df['firstblood_claimed'] = seq_df['firstblood_claimed'].astype(int)
            for col in features:
                if col not in seq_df.columns:
                    seq_df[col] = 0
            seq_df = seq_df[features]
            seq_df = seq_df.fillna(seq_df.mean()).astype(np.float32)
            print(seq_df.head())
            print(seq_df.sum())
            X_pred = seq_df.values.reshape(1, seq_len, len(features))
            pred_score = model.predict(X_pred)[0][0]
            results.append({
                "account_id": account_id,
                "name": player.get("name", ""),
                "team_id": team_id,
                "enemy_team_id": enemy_team_id,
                "predicted_fantasy_score": pred_score
            })
    return pd.DataFrame(results)

In [109]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symmetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups_lstm(df, model, symmetric_matchups, features, seq_len=SEQ_LEN)

       isRadiant  duration  win     kills    deaths  last_hits    denies  \
14721        1.0  0.376873  1.0 -0.704568  0.381609  -1.138988 -0.651497   
14716       -1.0  0.830625  1.0 -0.449100 -0.272626  -0.736308 -0.492680   
14661        1.0 -0.024809  1.0 -1.215504  1.035845  -1.083826 -0.969131   
14651        1.0  0.043626  1.0 -0.449100 -0.272626  -1.072794 -1.127947   
14641        1.0 -0.404176  1.0  0.061835 -0.272626  -1.199666 -0.969131   

       gold_per_min  towers_killed  roshans_killed  ...  hero_variant_95, 2  \
14721     -1.201305      -0.641048        1.496848  ...           -0.030038   
14716     -0.642011      -0.641048       -0.424472  ...           -0.030038   
14661     -0.961607      -0.044340        1.496848  ...           -0.030038   
14651     -0.904537      -0.641048       -0.424472  ...           -0.030038   
14641     -1.041507      -0.641048       -0.424472  ...           -0.030038   

       hero_variant_96, 1  hero_variant_96, 2  hero_variant_97, 2  \

In [110]:
result

Unnamed: 0,account_id,name,team_id,enemy_team_id,predicted_fantasy_score
0,113331514,Miposhka,7119388,8261500,14.025311
1,321580662,Yatoro,7119388,8261500,16.250183
2,302214028,Collapse,7119388,8261500,11.918607
3,106305042,Larl,7119388,8261500,15.021812
4,847565596,rue,7119388,8261500,15.432895
5,203351055,Malik,7119388,8261500,11.436419
6,137129583,Xm,8261500,7119388,14.636961
7,157475523,XinQ,8261500,7119388,14.535098
8,129958758,Xxs,8261500,7119388,12.259999
9,898754153,Ame,8261500,7119388,16.041948


In [111]:
#print(np.std(X_test, axis=0)) 

In [112]:
print(np.mean(y_train))

14.675479091281941


In [113]:
y_train[0]

9.15784553

In [114]:
y_train[1]

16.73686615

In [115]:
y_pred = model.predict(X_test)
print(y_pred[:10])

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[[15.906025]
 [14.546443]
 [15.14638 ]
 [14.642901]
 [12.958228]
 [15.187846]
 [15.289487]
 [14.705613]
 [15.075238]
 [15.080228]]
