In [146]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [147]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [148]:
df = pd.DataFrame(player_data)

In [149]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [150]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [151]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [152]:
df = df.sort_values(['account_id', 'match_start_time'])


In [153]:
SEQ_LEN=10

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15530 entries, 1126 to 14947
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   match_id                 15530 non-null  string 
 1   match_start_time         15530 non-null  int64  
 2   account_id               15530 non-null  string 
 3   name                     15530 non-null  object 
 4   fantasy_role             15530 non-null  object 
 5   hero_id                  15530 non-null  string 
 6   hero_variant             15530 non-null  object 
 7   isRadiant                15530 non-null  bool   
 8   radiant_team_id          15530 non-null  string 
 9   dire_team_id             15530 non-null  string 
 10  tournament_start_date    15530 non-null  object 
 11  duration                 15530 non-null  int64  
 12  win                      15530 non-null  boolean
 13  kills                    15530 non-null  int64  
 14  deaths                  

In [155]:
not_features = [
    'match_id', 'account_id', 'name', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 'team_id'
]

In [156]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [157]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]
#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [158]:

X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.1)#, random_state=42)

In [160]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 199.7586 - val_loss: 98.7033
Epoch 2/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 76.3448 - val_loss: 41.7659
Epoch 3/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 35.6445 - val_loss: 27.5305
Epoch 4/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 25.5919 - val_loss: 25.0889
Epoch 5/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 23.7374 - val_loss: 24.8323
Epoch 6/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 24.1404 - val_loss: 24.7862
Epoch 7/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 23.6590 - val_loss: 24.7771
Epoch 8/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 24.5669 - val_loss: 24.7893
Epoch 9/20
[1m340/340

In [161]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("LSTM RMSE:", root_mean_squared_error(y_test, y_pred))

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
LSTM RMSE: 4.832583052509657


In [162]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [163]:
def predict_players_for_matchups_lstm(df, model, matchup_list, features, seq_len=10, lineup_dir="data/lineups"):
    results = []
    for team_id, enemy_team_id in matchup_list:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        try:
            with open(lineup_path, "r", encoding="utf-8") as f:
                lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        except FileNotFoundError:
            continue

        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if len(player_hist) < seq_len:
                continue

            seq_df = player_hist.iloc[-seq_len:].copy()
            seq_df["team_id"] = team_id
            seq_df["enemy_team_id"] = enemy_team_id

            # Preprocessing (same as training)
            seq_df['isRadiant'] = seq_df['isRadiant'].astype(int)
            seq_df['win'] = seq_df['win'].astype(int)
            seq_df['firstblood_claimed'] = seq_df['firstblood_claimed'].astype(int)
            for col in ['fantasy_role', 'hero_id', 'hero_variant']:
                if col in df.columns:
                    seq_df = pd.get_dummies(seq_df, columns=[col])
            for col in features:
                if col not in seq_df.columns:
                    seq_df[col] = 0
            seq_df = seq_df[features]
            seq_df = seq_df.fillna(seq_df.mean()).astype(np.float32)
            print(seq_df.head())
            print(seq_df.sum())
            X_pred = seq_df.values.reshape(1, seq_len, len(features))
            pred_score = model.predict(X_pred)[0][0]
            results.append({
                "account_id": account_id,
                "name": player.get("name", ""),
                "team_id": team_id,
                "enemy_team_id": enemy_team_id,
                "predicted_fantasy_score": pred_score
            })
    return pd.DataFrame(results)

In [164]:
print(X_test[0])

[[ 1.74021810e+09  1.00000000e+00  2.40700000e+03  1.00000000e+00
   1.10000000e+01  2.00000000e+00  2.49000000e+02  7.00000000e+00
   5.81000000e+02  1.00000000e+00  0.00000000e+00  8.28571440e-01
   1.00000000e+00  2.00000000e+00  1.40000000e+01  0.00000000e+00
   6.74992200e+01  4.00000000e+00  1.80000000e+01  7.89000000e+02
   6.90986257e-01  1.99680192e+03  2.02278638e+03 -2.59844640e+01]
 [ 1.74022243e+09  0.00000000e+00  2.11000000e+03  1.00000000e+00
   1.20000000e+01  2.00000000e+00  2.46000000e+02  1.40000000e+01
   6.59000000e+02  0.00000000e+00  0.00000000e+00  6.31578900e-01
   0.00000000e+00  2.00000000e+00  1.60000000e+01  0.00000000e+00
   0.00000000e+00  4.00000000e+00  1.20000000e+01  7.99000000e+02
   8.11843169e-01  1.99680192e+03  2.02278638e+03 -2.59844640e+01]
 [ 1.74031711e+09  1.00000000e+00  2.40600000e+03  1.00000000e+00
   9.00000000e+00  1.00000000e+00  3.21000000e+02  1.10000000e+01
   6.46000000e+02  0.00000000e+00  0.00000000e+00  7.02702700e-01
   2.000

In [165]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symmetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups_lstm(df, model, symmetric_matchups, features, seq_len=SEQ_LEN)

       match_start_time  isRadiant  duration  win  kills  deaths  last_hits  \
15341      1.752055e+09        1.0    2254.0  1.0    3.0     2.0       67.0   
15206      1.752138e+09        0.0    1577.0  1.0    6.0     2.0       22.0   
15181      1.752142e+09        1.0    1928.0  1.0    1.0     3.0       39.0   
14821      1.752658e+09        1.0    3897.0  1.0    3.0     6.0      206.0   
14816      1.752664e+09        0.0    2339.0  1.0    4.0     3.0       76.0   

       denies  gold_per_min  towers_killed  ...  rune_pickups  \
15341     5.0         358.0            0.0  ...           1.0   
15206     3.0         339.0            0.0  ...           1.0   
15181     2.0         346.0            1.0  ...           1.0   
14821     0.0         446.0            1.0  ...           7.0   
14816     2.0         378.0            1.0  ...           3.0   

       firstblood_claimed      stuns  creeps_stacked  assists  xp_per_min  \
15341                 0.0  27.233250            10.0     

In [166]:
result

Unnamed: 0,account_id,name,team_id,enemy_team_id,predicted_fantasy_score
0,113331514,Miposhka,7119388,8261500,14.688537
1,321580662,Yatoro,7119388,8261500,14.688537
2,302214028,Collapse,7119388,8261500,14.688537
3,106305042,Larl,7119388,8261500,14.688537
4,847565596,rue,7119388,8261500,14.688537
5,203351055,Malik,7119388,8261500,14.688537
6,137129583,Xm,8261500,7119388,14.688537
7,157475523,XinQ,8261500,7119388,14.688537
8,129958758,Xxs,8261500,7119388,14.688537
9,898754153,Ame,8261500,7119388,14.688537


In [167]:
#print(np.std(X_test, axis=0)) 

In [168]:
print(np.std(y_train))

4.9083899767896
