# General Setup

In [1]:
import json
import pandas as pd
import subprocess
import os
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error


In [2]:
with open('data/curated/player_data.json', 'r') as f:
	player_data = json.load(f)

In [3]:
df = pd.DataFrame(player_data)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15437 entries, 14216 to 10673
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   match_id                  15437 non-null  int64  
 1   match_start_time          15437 non-null  int64  
 2   account_id                15437 non-null  int64  
 3   name                      15437 non-null  object 
 4   fantasy_role              15437 non-null  object 
 5   hero_id                   15437 non-null  int64  
 6   hero_variant              15437 non-null  object 
 7   isRadiant                 15437 non-null  bool   
 8   radiant_team_id           15437 non-null  int64  
 9   dire_team_id              15437 non-null  int64  
 10  tournament_start_date     15437 non-null  object 
 11  duration                  15437 non-null  int64  
 12  win                       15437 non-null  int64  
 13  kills                     15437 non-null  int64  
 14  deaths 

In [5]:
df = df.astype({
    'match_id': 'string',
    'account_id': 'string',
    'hero_id': 'string',
    'radiant_team_id': 'string',
    'dire_team_id': 'string',
    'win': 'boolean',
    'firstblood_claimed': 'boolean',
    'team_id': 'string',
	'enemy_team_id': 'string',
    'valveId': 'string',
})

## Set missing glicko scores to the first quartile
*Using the "..._clean" data set, this does not affect anything.*
Q1 was chosen, because teams that do not have a glicko 2 rating, usually are new or weaker teams.

In [6]:
q1_glicko = df['glicko2_rating'].quantile(0.25)
df['glicko2_rating'] = df['glicko2_rating'].fillna(q1_glicko)
q1_enemy_glicko = df['enemy_glicko2_rating'].quantile(0.25)
df['enemy_glicko2_rating'] = df['enemy_glicko2_rating'].fillna(q1_enemy_glicko)

In [7]:
mask = df['rating_advantage'].isna()
df.loc[mask, 'rating_advantage'] = df.loc[mask, 'glicko2_rating'] - df.loc[mask, 'enemy_glicko2_rating']

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15437 entries, 14216 to 10673
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   match_id                  15437 non-null  string 
 1   match_start_time          15437 non-null  int64  
 2   account_id                15437 non-null  string 
 3   name                      15437 non-null  object 
 4   fantasy_role              15437 non-null  object 
 5   hero_id                   15437 non-null  string 
 6   hero_variant              15437 non-null  object 
 7   isRadiant                 15437 non-null  bool   
 8   radiant_team_id           15437 non-null  string 
 9   dire_team_id              15437 non-null  string 
 10  tournament_start_date     15437 non-null  object 
 11  duration                  15437 non-null  int64  
 12  win                       15437 non-null  boolean
 13  kills                     15437 non-null  int64  
 14  deaths 

# XGBoost
Without time dimension.

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


In [10]:
enemy_team_avg = df.groupby('enemy_team_id')['fantasy_score'].mean().rename('enemy_avg_fantasy_score')
df = df.merge(enemy_team_avg, left_on='enemy_team_id', right_index=True, how='left')

In [11]:
not_features = [
	'match_id', 'name', 'match_start_time', 'radiant_team_id', 'dire_team_id',
	'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score',
    'rolling_winrate_10',
    'rolling_winrate_15',
    'rolling_fantasy_score_10',
    'rolling_fantasy_score_15',
    'rolling_enemy_glicko2_10',
    'rolling_enemy_glicko2_15',
]

In [12]:
not_features_fe = [
    'match_id', 'name', 'match_start_time', 'radiant_team_id', 'dire_team_id',
    'tournament_start_date', 'teamName', 'valveId', 'enemy_teamName', 'enemy_valveId', 'fantasy_score', 
]

# XHBoost without feature engineering

In [13]:
X = df.drop(columns=not_features)
y = df['fantasy_score']

In [14]:
X = pd.get_dummies(X, columns=['fantasy_role', 'hero_id', 'hero_variant', 'account_id', 'team_id', 'enemy_team_id'])


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1001)


In [16]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
y_pred = xgb_model.predict(X_test)
print("RMSE:", root_mean_squared_error(y_test, y_pred))

RMSE: 0.8673414545412719


In [18]:
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# XGBoost with Feature Engineering

In [19]:
X_fe = df.drop(columns=not_features_fe)
y_fe = df['fantasy_score']

In [20]:
X_fe = pd.get_dummies(X_fe, columns=['fantasy_role', 'hero_id', 'hero_variant', 'account_id', 'team_id', 'enemy_team_id'])

In [21]:
X_fe_train, X_fe_test, y_fe_train, y_fe_test = train_test_split(X_fe, y_fe, test_size=0.20, random_state=1001)

In [22]:
xgb_model_fe = xgb.XGBRegressor()
xgb_model_fe.fit(X_fe_train, y_fe_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [23]:
y_fe_pred = xgb_model_fe.predict(X_fe_test)
print("RMSE:", root_mean_squared_error(y_fe_test, y_fe_pred))

RMSE: 0.8776266747620363


In [24]:
y_fe_train_pred = xgb_model_fe.predict(X_fe_train)
y_fe_test_pred = xgb_model_fe.predict(X_fe_test)

# Comparing the 2 XGBoost Models

In [25]:
print("No FE, Train RMSE:", root_mean_squared_error(y_train, y_train_pred))
print("No FE, Test RMSE:", root_mean_squared_error(y_test, y_test_pred))
print("No FE, Train MAE:", mean_absolute_error(y_train, y_train_pred))
print("No FE, Test MAE:", mean_absolute_error(y_test, y_test_pred))
print("No FE, Train R2:", r2_score(y_train, y_train_pred))
print("No FE, Test R2:", r2_score(y_test, y_test_pred))

No FE, Train RMSE: 0.3939321872118213
No FE, Test RMSE: 0.8673414545412719
No FE, Train MAE: 0.3082063262578888
No FE, Test MAE: 0.5733837587569408
No FE, Train R2: 0.9934442645619632
No FE, Test R2: 0.9683705920522633


In [26]:
print("With FE, Train RMSE:", root_mean_squared_error(y_fe_train, y_fe_train_pred))
print("With FE, Test RMSE:", root_mean_squared_error(y_fe_test, y_fe_test_pred))
print("With FE, Train MAE:", mean_absolute_error(y_fe_train, y_fe_train_pred))
print("With FE, Test MAE:", mean_absolute_error(y_fe_test, y_fe_test_pred))
print("With FE, Train R2:", r2_score(y_fe_train, y_fe_train_pred))
print("With FE, Test R2:", r2_score(y_fe_test, y_fe_test_pred))

With FE, Train RMSE: 0.3780385948582158
With FE, Test RMSE: 0.8776266747620363
With FE, Train MAE: 0.2935665818706187
With FE, Test MAE: 0.5887378605228828
With FE, Train R2: 0.9939625886798958
With FE, Test R2: 0.9676160004824991


# Prediction Framework

In [27]:
def build_prediction_rows(df, matchups, lineup_dir, rolling_n=20):
    rows = []
    for team_id, enemy_team_id in matchups:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        if not os.path.exists(lineup_path):
            subprocess.run(["python", "get_current_lineups.py", str(team_id)])
        if not os.path.exists(lineup_path):
            continue  # Skip if still missing
        with open(lineup_path, "r", encoding="utf-8") as f:
            lineup = [p for p in json.load(f) if p.get("is_current_team_member")]
        for player in lineup:
            account_id = str(player["account_id"])
            player_hist = df[df["account_id"] == account_id].sort_values("match_start_time")
            if player_hist.empty:
                continue
            numeric_cols = player_hist.select_dtypes(include='number').columns.difference(
                ['match_id', 'account_id', 'radiant_team_id', 'dire_team_id', 'team_id', 'enemy_team_id', 'match_start_time']
            )
            rolling = player_hist[numeric_cols].tail(rolling_n).mean()
            row = player_hist.iloc[-1].copy()
            row[numeric_cols] = rolling
            row["team_id"] = team_id
            row["enemy_team_id"] = enemy_team_id
            rows.append(row)
    return pd.DataFrame(rows)

In [28]:
def predict_players_for_matchups(df, model, matchup_list, X, not_features, lineup_dir="data/lineups", rolling_n=20):
    # Ensure all lineup files exist (call API if not)
    team_ids = set()
    for t1, t2 in matchup_list:
        team_ids.add(t1)
        team_ids.add(t2)
    for team_id in team_ids:
        lineup_path = os.path.join(lineup_dir, f"team_lineup_{team_id}.json")
        if not os.path.exists(lineup_path):
            subprocess.run(["python", "get_current_lineups.py", str(team_id)])
    
    # Build synthetic prediction rows
    pred_df = build_prediction_rows(df, matchup_list, lineup_dir, rolling_n=rolling_n)
    if pred_df.empty:
        print("No eligible players found for the given matchups.")
        return pd.DataFrame()
    # Prepare features (same as training)
    X_pred = pred_df.drop(columns=not_features)
    X_pred = pd.get_dummies(X_pred, columns=['fantasy_role', 'hero_id', 'hero_variant', 'account_id', 'team_id', 'enemy_team_id'])
    X_pred = X_pred.reindex(columns=X.columns, fill_value=0)
    # print(X_pred.filter(like='enemy_team_id').head(10)) # Debugger, shows the dummy encoded enemy_team variable.
    # More debugging
    #rows = X_pred[X_pred['account_id_847565596'] == 1]  # print dummies for "rue"
    #print(rows[['enemy_team_id_7119388', 'enemy_team_id_8261500', 'enemy_team_id_8597976']])
    
    pred_df['predicted_fantasy_score'] = model.predict(X_pred) # Predict fantasy scores
    return pred_df[['account_id', 'name', 'fantasy_role', 'team_id', 'enemy_team_id', 'predicted_fantasy_score']]

In [29]:
def make_symmetric_matchups(matchups):
    symmetric = []
    for t1, t2 in matchups:
        symmetric.append((t1, t2))
        symmetric.append((t2, t1))
    return symmetric

In [30]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups(df, xgb_model, symetric_matchups, X, not_features)

In [31]:
result

Unnamed: 0,account_id,name,fantasy_role,team_id,enemy_team_id,predicted_fantasy_score
14548,113331514,Miposhka,Support,7119388,8261500,15.317161
14547,321580662,Yatoro,Core,7119388,8261500,18.734705
14551,302214028,Collapse,Core,7119388,8261500,11.399706
14549,106305042,Larl,Mid,7119388,8261500,17.042755
14550,847565596,rue,Support,7119388,8261500,15.916888
6619,203351055,Malik,Core,7119388,8261500,10.636276
14803,137129583,Xm,Mid,8261500,7119388,12.442446
14805,157475523,XinQ,Support,8261500,7119388,13.163057
14804,129958758,Xxs,Core,8261500,7119388,9.022573
14802,898754153,Ame,Core,8261500,7119388,17.941978


In [59]:
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,feature,importance
24,fantasy_role_Core,0.148177
3,kills,0.114550
14,firstblood_claimed,0.105811
11,obs_placed,0.064895
2,win,0.059273
...,...,...
93,hero_id_46,0.000000
726,enemy_team_id_9530947,0.000000
709,enemy_team_id_8574561,0.000000
708,enemy_team_id_8375259,0.000000


In [61]:
xgb_fe_feature_importances = pd.DataFrame({
    'feature': X_fe.columns,
    'importance': xgb_model_fe.feature_importances_
})
xgb_fe_feature_importances = feature_importances.sort_values(by='importance', ascending=False)
xgb_fe_feature_importances

Unnamed: 0,feature,importance
24,fantasy_role_Core,0.148177
3,kills,0.114550
14,firstblood_claimed,0.105811
11,obs_placed,0.064895
2,win,0.059273
...,...,...
686,team_id_9729720,0.000000
687,team_id_9741253,0.000000
264,"hero_variant_3, 2",0.000000
692,team_id_9828954,0.000000


testing stuff

In [33]:
print(df['enemy_glicko2_rating'].describe())
print(df['enemy_glicko2_rating'].isna().sum())

count    15437.000000
mean      1924.014299
std        108.462035
min       1548.386055
25%       1849.286620
50%       1944.550013
75%       2013.159488
max       2101.693435
Name: enemy_glicko2_rating, dtype: float64
0


# Random Forest without FE

In [34]:
from sklearn.ensemble import RandomForestRegressor


In [35]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2)

In [36]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=1001)
rf_model.fit(X_train_rf, y_train_rf)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Random Forest with FE

In [37]:
X_fe_train_rf, X_fe_test_rf, y_fe_train_rf, y_fe_test_rf = train_test_split(X_fe, y_fe, test_size=0.2)


In [38]:
rf_fe_model = RandomForestRegressor(n_estimators=100, random_state=1001)
rf_fe_model.fit(X_fe_train_rf, y_fe_train_rf)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Comparing Random Forests

In [39]:
y_pred_rf = rf_model.predict(X_test_rf)
print("Random Forest, without FE RMSE:", root_mean_squared_error(y_test_rf, y_pred_rf))

Random Forest, without FE RMSE: 1.2722558877916756


In [40]:
y_fe_pred_rf = rf_fe_model.predict(X_fe_test_rf)
print("Random Forest, with FE RMSE:", root_mean_squared_error(y_test_rf, y_pred_rf))

Random Forest, with FE RMSE: 1.2722558877916756


In [41]:
y_train_pred_rf = rf_model.predict(X_train_rf)
y_test_pred_rf = rf_model.predict(X_test_rf)

In [42]:
y_fe_train_pred_rf = rf_fe_model.predict(X_fe_train_rf)
y_fe_test_pred_rf = rf_fe_model.predict(X_fe_test_rf)

In [43]:
print("No FE, Train RMSE:", root_mean_squared_error(y_train_rf, y_train_pred_rf))
print("No FE, Test RMSE:", root_mean_squared_error(y_test_rf, y_test_pred_rf))
print("No FE, Train MAE:", mean_absolute_error(y_train_rf, y_train_pred_rf))
print("No FE, Test MAE:", mean_absolute_error(y_test_rf, y_test_pred_rf))
print("No FE, Train R2:", r2_score(y_train_rf, y_train_pred_rf))
print("No FE, Test R2:", r2_score(y_test_rf, y_test_pred_rf))

No FE, Train RMSE: 0.48182074328992214
No FE, Test RMSE: 1.2722558877916756
No FE, Train MAE: 0.34243217335029186
No FE, Test MAE: 0.9252751835401113
No FE, Train R2: 0.9902057219029652
No FE, Test R2: 0.9315708082803821


In [44]:
print("With FE, Train RMSE:", root_mean_squared_error(y_fe_train_rf, y_fe_train_pred_rf))
print("With FE, Test RMSE:", root_mean_squared_error(y_fe_test_rf, y_fe_test_pred_rf))
print("With FE, Train MAE:", mean_absolute_error(y_fe_train_rf, y_fe_train_pred_rf))
print("With FE, Test MAE:", mean_absolute_error(y_fe_test_rf, y_fe_test_pred_rf))
print("With FE, Train R2:", r2_score(y_fe_train_rf, y_fe_train_pred_rf))
print("With FE, Test R2:", r2_score(y_fe_test_rf, y_fe_test_pred_rf))

With FE, Train RMSE: 0.5026481983069198
With FE, Test RMSE: 1.2750513418761862
With FE, Train MAE: 0.3597812528466868
With FE, Test MAE: 0.9405851263556935
With FE, Train R2: 0.9894118184393078
With FE, Test R2: 0.9293953300614634


In [45]:
matchups = [('7119388', '8261500'), ('8597976', '7119388'), ('8597976', '8261500')] # Spirit vs. Xtreme Gaming, Spirit vs. Talon
symetric_matchups = make_symmetric_matchups(matchups)
result = predict_players_for_matchups(df, rf_model, symetric_matchups, X, not_features)

In [46]:
result

Unnamed: 0,account_id,name,fantasy_role,team_id,enemy_team_id,predicted_fantasy_score
14548,113331514,Miposhka,Support,7119388,8261500,15.970001
14547,321580662,Yatoro,Core,7119388,8261500,20.564463
14551,302214028,Collapse,Core,7119388,8261500,12.22734
14549,106305042,Larl,Mid,7119388,8261500,18.404969
14550,847565596,rue,Support,7119388,8261500,16.22605
6619,203351055,Malik,Core,7119388,8261500,10.894281
14803,137129583,Xm,Mid,8261500,7119388,14.17719
14805,157475523,XinQ,Support,8261500,7119388,13.048568
14804,129958758,Xxs,Core,8261500,7119388,11.155621
14802,898754153,Ame,Core,8261500,7119388,17.918316


In [57]:
rf_feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values(by='importance', ascending=False)
rf_feature_importances.head(10)

Unnamed: 0,feature,importance
3,kills,0.258703
15,stuns,0.164944
11,obs_placed,0.097386
8,towers_killed,0.077941
17,assists,0.07305
12,camps_stacked,0.060334
13,rune_pickups,0.059466
14,firstblood_claimed,0.05584
7,gold_per_min,0.03151
16,creeps_stacked,0.016212


In [58]:
rf_fe_feature_importances = pd.DataFrame({
    'feature': X_fe.columns,
    'importance': rf_fe_model.feature_importances_
}).sort_values(by='importance', ascending=False)
rf_fe_feature_importances.head(10)

Unnamed: 0,feature,importance
3,kills,0.256771
15,stuns,0.152994
17,assists,0.096964
11,obs_placed,0.086298
8,towers_killed,0.082059
12,camps_stacked,0.062057
13,rune_pickups,0.058066
14,firstblood_claimed,0.056168
7,gold_per_min,0.028181
4,deaths,0.01566


# Naive Baseline Model

In [48]:
mean_baseline = y_train.mean()
y_pred_baseline = [mean_baseline] * len(y_test)
print("Mean Baseline RMSE:", root_mean_squared_error(y_test, y_pred_baseline))

Mean Baseline RMSE: 4.877597261406306


In [49]:
y_pred_baseline_train = rf_model.predict(X_train_rf)
y_pred_baseline_test = rf_model.predict(X_test_rf)

In [50]:
print("Mean Baseline, Train RMSE:", root_mean_squared_error(y_train, y_pred_baseline_train))
print("Mean Baseline, Test RMSE:", root_mean_squared_error(y_test, y_pred_baseline_test))
print("Mean Baseline, Train MAE:", mean_absolute_error(y_train, y_pred_baseline_train))
print("Mean Baseline, Test MAE:", mean_absolute_error(y_test, y_pred_baseline_test))
print("Mean Baseline, Train R2:", r2_score(y_train, y_pred_baseline_train))
print("Mean Baseline, Test R2:", r2_score(y_test, y_pred_baseline_test))

Mean Baseline, Train RMSE: 6.7621321579858105
Mean Baseline, Test RMSE: 6.56330562031149
Mean Baseline, Train MAE: 5.359821007836883
Mean Baseline, Test MAE: 5.245732962160308
Mean Baseline, Train R2: -0.9317271901206061
Mean Baseline, Test R2: -0.8111570472116103


# LSTM
Wegen den Python-Version voraussetzungen von TensorFlow wird LSTM in einem eigenen Notebook implementiert.