In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [64]:
with open('data/curated/player_data.json', 'r') as f:
    player_data = json.load(f)

In [65]:
df = pd.DataFrame(player_data)

In [66]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
    'enemy_team_id': 'string',
    'valveId': 'string',
})

In [67]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [68]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [69]:
df = df.sort_values(['account_id', 'match_start_time'])


In [70]:
SEQ_LEN=10

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15530 entries, 1126 to 14947
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   match_id                 15530 non-null  string 
 1   match_start_time         15530 non-null  int64  
 2   account_id               15530 non-null  string 
 3   name                     15530 non-null  object 
 4   fantasy_role             15530 non-null  object 
 5   hero_id                  15530 non-null  string 
 6   hero_variant             15530 non-null  object 
 7   isRadiant                15530 non-null  bool   
 8   radiant_team_id          15530 non-null  string 
 9   dire_team_id             15530 non-null  string 
 10  tournament_start_date    15530 non-null  object 
 11  duration                 15530 non-null  int64  
 12  win                      15530 non-null  boolean
 13  kills                    15530 non-null  int64  
 14  deaths                  

In [72]:
not_features = [
    'match_id', 'account_id', 'name', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 'team_id'
]

In [73]:
df['isRadiant'] = df['isRadiant'].astype(int)
df['win'] = df['win'].astype(int)
df['firstblood_claimed'] = df['firstblood_claimed'].astype(int)

In [74]:
df = pd.get_dummies(df, columns=['fantasy_role', 'hero_id', 'hero_variant'])

In [75]:
#features = [col for col in df.columns if col not in not_features]
features = [col for col in df.columns if col not in not_features and pd.api.types.is_numeric_dtype(df[col])]
#df[features].isna().sum()
df[features] = df[features].fillna(df[features].mean())

In [76]:

X_seqs = []
y_seqs = []

for _, player_df in df.groupby('account_id'):
    player_df = player_df.reset_index(drop=True)
    for i in range(len(player_df) - SEQ_LEN):
        X_seqs.append(player_df.loc[i:i+SEQ_LEN-1, features].values)
        y_seqs.append(player_df.loc[i+SEQ_LEN, 'fantasy_score'])

X_seqs = np.array(X_seqs)
y_seqs = np.array(y_seqs)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X_seqs, y_seqs, test_size=0.1, random_state=42)

In [78]:
model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, X_train.shape[2])),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 192.2426 - val_loss: 100.8829
Epoch 2/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 88.0633 - val_loss: 46.6722
Epoch 3/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 43.4799 - val_loss: 28.0038
Epoch 4/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 28.4734 - val_loss: 23.6215
Epoch 5/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 25.2763 - val_loss: 23.0700
Epoch 6/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 24.3575 - val_loss: 23.0850
Epoch 7/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 24.2671 - val_loss: 23.1157
Epoch 8/20
[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 24.8130 - val_loss: 23.1027
Epoch 9/20
[1m340/340

In [79]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("LSTM RMSE:", root_mean_squared_error(y_test, y_pred))

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
LSTM RMSE: 4.807088396014597
