In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler



In [2]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [3]:
df = pd.DataFrame(player_data)

In [4]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [5]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [6]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [7]:
df = df.sort_values(['account_id', 'match_start_time'])


In [8]:
SEQ_LEN=10

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15530 entries, 1126 to 14947
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   match_id                 15530 non-null  string 
 1   match_start_time         15530 non-null  int64  
 2   account_id               15530 non-null  string 
 3   name                     15530 non-null  object 
 4   fantasy_role             15530 non-null  object 
 5   hero_id                  15530 non-null  string 
 6   hero_variant             15530 non-null  object 
 7   isRadiant                15530 non-null  bool   
 8   radiant_team_id          15530 non-null  string 
 9   dire_team_id             15530 non-null  string 
 10  tournament_start_date    15530 non-null  object 
 11  duration                 15530 non-null  int64  
 12  win                      15530 non-null  boolean
 13  kills                    15530 non-null  int64  
 14  deaths                  

In [10]:
not_features = [
    'match_id', 'match_start_time', 'account_id', 'name', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 'team_id'
]

In [11]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [12]:
df = pd.get_dummies(df, columns=['fantasy_role', 'hero_id', 'hero_variant'])

In [13]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [14]:
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

Create sequences

In [15]:
X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.15)#, random_state=42)

In [17]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(128, return_sequences=True),    
    layers.LSTM(64),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='rmse')
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 70.8715 - val_loss: 22.9023
Epoch 2/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 23.7857 - val_loss: 22.4736
Epoch 3/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 22.0311 - val_loss: 21.6607
Epoch 4/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 21.1497 - val_loss: 21.6665
Epoch 5/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 20.7827 - val_loss: 22.6723
Epoch 6/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 19.0803 - val_loss: 22.5113
Epoch 7/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 17.6507 - val_loss: 23.5515
Epoch 8/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 15.5377 - val_loss: 25.2500
Epoch 9/20
[1m340/340[

In [18]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("LSTM RMSE:", root_mean_squared_error(y_test, y_pred))

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
LSTM RMSE: 5.465769428497673


In [19]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [20]:
def predict_players_for_matchups_lstm(df, model, matchup_list, features, seq_len=10, lineup_dir="data/lineups"):
    results = []
    for team_id, enemy_team_id in matchup_list:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        try:
            with open(lineup_path, "r", encoding="utf-8") as f:
                lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        except FileNotFoundError:
            continue

        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if len(player_hist) < seq_len:
                continue

            seq_df = player_hist.iloc[-seq_len:].copy()
            seq_df["team_id"] = team_id
            seq_df["enemy_team_id"] = enemy_team_id

            # Preprocessing (same as training)
            seq_df['isRadiant'] = seq_df['isRadiant'].astype(int)
            seq_df['win'] = seq_df['win'].astype(int)
            seq_df['firstblood_claimed'] = seq_df['firstblood_claimed'].astype(int)
            for col in features:
                if col not in seq_df.columns:
                    seq_df[col] = 0
            seq_df = seq_df[features]
            seq_df = seq_df.fillna(seq_df.mean()).astype(np.float32)
            print(seq_df.head())
            print(seq_df.sum())
            X_pred = seq_df.values.reshape(1, seq_len, len(features))
            pred_score = model.predict(X_pred)[0][0]
            results.append({
                "account_id": account_id,
                "name": player.get("name", ""),
                "team_id": team_id,
                "enemy_team_id": enemy_team_id,
                "predicted_fantasy_score": pred_score
            })
    return pd.DataFrame(results)

In [22]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symmetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups_lstm(df, model, symmetric_matchups, features, seq_len=SEQ_LEN)

       isRadiant  duration  win     kills    deaths  last_hits    denies  \
15341        1.0 -0.218212  1.0 -0.449100 -0.926862  -0.956954 -0.333863   
15206       -1.0 -1.225393  1.0  0.317303 -0.926862  -1.205182 -0.651497   
15181        1.0 -0.703206  1.0 -0.960036 -0.599744  -1.111407 -0.810314   
14821        1.0  2.226099  1.0 -0.449100  0.381609  -0.190208 -1.127947   
14816       -1.0 -0.091756  1.0 -0.193632 -0.599744  -0.907309 -0.810314   

       gold_per_min  towers_killed  roshans_killed  ...  hero_variant_95, 2  \
15341     -0.767567      -0.641048       -0.424472  ...           -0.030038   
15206     -0.876001      -0.641048       -0.424472  ...           -0.030038   
15181     -0.836052      -0.044340       -0.424472  ...           -0.030038   
14821     -0.265344      -0.044340       -0.424472  ...           -0.030038   
14816     -0.653425      -0.044340       -0.424472  ...           -0.030038   

       hero_variant_96, 1  hero_variant_96, 2  hero_variant_97, 2  \

In [23]:
result

Unnamed: 0,account_id,name,team_id,enemy_team_id,predicted_fantasy_score
0,113331514,Miposhka,7119388,8261500,21.29949
1,321580662,Yatoro,7119388,8261500,10.961967
2,302214028,Collapse,7119388,8261500,10.90728
3,106305042,Larl,7119388,8261500,18.949455
4,847565596,rue,7119388,8261500,15.800823
5,203351055,Malik,7119388,8261500,21.847534
6,137129583,Xm,8261500,7119388,24.40139
7,157475523,XinQ,8261500,7119388,21.907246
8,129958758,Xxs,8261500,7119388,12.618283
9,898754153,Ame,8261500,7119388,13.371992


In [24]:
#print(np.std(X_test, axis=0)) 

In [25]:
print(np.mean(y_train))

14.69407122638694


In [26]:
y_train[0]

9.727

In [27]:
y_train[1]

12.37747215

In [28]:
y_pred = model.predict(X_test)
print(y_pred[:10])

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[[20.361927 ]
 [17.97901  ]
 [ 7.849568 ]
 [16.18136  ]
 [15.475761 ]
 [18.57052  ]
 [ 8.711716 ]
 [20.246962 ]
 [13.4634075]
 [13.96554  ]]
