In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
import xgboost as xgb
import joblib

In [2]:
df = pd.read_csv("data/combined_team_match_history.csv", dtype={'Patch': str}, keep_default_na=False)
print(df.shape)
print(df["GameID"].nunique())
print(df["Result"].value_counts())
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", None)

df_original = df.copy()



(4684, 57)
2342
Result
0    2342
1    2342
Name: count, dtype: int64


In [3]:
print(df['Patch'])

0        15.1
1        15.1
2        15.1
3        15.1
4        15.1
        ...  
4679    15.20
4680    15.20
4681    15.20
4682    15.20
4683    15.20
Name: Patch, Length: 4684, dtype: object


In [4]:
print(df.dtypes)
print(df.isnull().sum())
print(df["ADC_champion"].unique())
print(df["JUNGLE_champion"].unique())
print(df["TOP_champion"].unique())
print(df["MID_champion"].unique())
print(df["SUPPORT_champion"].unique())
print(df["Side"].unique())
print(df["Patch"].unique())
print(df["Result"].unique())

GameID                     int64
Team                      object
Date                      object
Region                    object
Kills                      int64
Deaths                     int64
Assists                    int64
Result                     int64
Game Time                float64
Side                       int64
Patch                     object
GD@15                    float64
ADC_champion              object
JUNGLE_champion           object
MID_champion              object
SUPPORT_champion          object
TOP_champion              object
ADC_player                object
JUNGLE_player             object
MID_player                object
SUPPORT_player            object
TOP_player                object
ADC_kills                  int64
JUNGLE_kills               int64
MID_kills                  int64
SUPPORT_kills              int64
TOP_kills                  int64
ADC_assists                int64
JUNGLE_assists             int64
MID_assists                int64
SUPPORT_as

In [5]:
print(df["Region"].unique())
print(df["Region"].value_counts())

['CN' 'KR' 'EUW' 'WR' 'NA']
Region
CN     1634
KR     1102
WR      960
EUW     610
NA      378
Name: count, dtype: int64


In [6]:
#convert all team, region and player and champion columns to lowercase
df["Team"] = df["Team"].str.lower()
df["Region"] = df["Region"].str.lower()

for role in ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]:
    df[f"{role}_player"] = df[f"{role}_player"].str.lower()

 
for role in ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]:
    df[f"{role}_champion"] = df[f"{role}_champion"].str.lower()
    

In [7]:
df.describe()

Unnamed: 0,GameID,Kills,Deaths,Assists,Result,Game Time,Side,GD@15,ADC_kills,JUNGLE_kills,MID_kills,SUPPORT_kills,TOP_kills,ADC_assists,JUNGLE_assists,MID_assists,SUPPORT_assists,TOP_assists,ADC_deaths,JUNGLE_deaths,MID_deaths,SUPPORT_deaths,TOP_deaths,ADC_kp%,JUNGLE_kp%,MID_kp%,SUPPORT_kp%,TOP_kp%,ADC_dmg%,JUNGLE_dmg%,MID_dmg%,SUPPORT_dmg%,TOP_dmg%,ADC_ka_per_minute,JUNGLE_ka_per_minute,MID_ka_per_minute,SUPPORT_ka_per_minute,TOP_ka_per_minute,ADC_gd@15,JUNGLE_gd@15,MID_gd@15,SUPPORT_gd@15,TOP_gd@15
count,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0,4684.0
mean,67922.353117,14.425278,14.450256,35.538856,0.5,1958.959009,0.5,0.0,4.524338,3.041631,3.421862,0.712852,2.724594,5.753202,7.551025,6.491033,10.176558,5.567037,2.359308,2.996371,2.637489,3.560418,2.89667,0.697448,0.730542,0.67344,0.74594,0.562053,0.277659,0.164795,0.256461,0.078239,0.222854,0.315877,0.326714,0.304586,0.335717,0.254101,0.0,0.0,0.0,0.0,0.0
std,3010.457697,7.345023,7.340581,19.747076,0.500053,309.066121,0.500053,2034.388683,3.251643,2.484669,2.63999,0.969841,2.219665,3.791454,4.608314,4.194925,5.818542,3.93395,1.735209,1.996951,1.802699,2.137586,1.893603,0.184715,0.173662,0.188165,0.175136,0.19842,0.062928,0.048977,0.058326,0.030418,0.059313,0.184248,0.184864,0.178987,0.19127,0.157136,789.567032,666.857335,700.312723,397.426561,734.10513
min,62896.0,0.0,0.0,0.0,0.0,1227.0,0.0,-8943.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081,0.043,0.079,0.019,0.052,0.0,0.0,0.0,0.0,0.0,-3723.0,-2758.0,-2799.0,-2064.0,-3658.0
25%,65431.0,8.0,8.0,18.0,0.0,1732.0,0.0,-1292.0,2.0,1.0,1.0,0.0,1.0,3.0,4.0,3.0,5.0,2.0,1.0,1.0,1.0,2.0,1.0,0.6,0.636,0.571,0.667,0.444,0.233,0.13,0.216,0.058,0.181,0.17,0.18,0.16,0.18,0.13,-482.0,-418.25,-427.25,-244.0,-450.25
50%,67847.5,15.0,15.0,35.0,0.5,1892.5,0.5,0.0,4.0,3.0,3.0,0.0,2.0,5.0,7.0,6.0,10.0,5.0,2.0,3.0,2.0,3.0,3.0,0.714,0.75,0.692,0.765,0.571,0.275,0.16,0.253,0.072,0.219,0.3,0.31,0.29,0.32,0.24,0.0,0.0,0.0,0.0,0.0
75%,70590.0,20.0,20.0,50.0,1.0,2125.0,1.0,1292.0,7.0,4.0,5.0,1.0,4.0,8.0,11.0,9.0,14.0,8.0,3.0,4.0,4.0,5.0,4.0,0.824,0.846,0.8,0.867,0.692,0.319,0.195,0.295,0.092,0.261,0.44,0.45,0.42,0.4625,0.36,482.0,418.25,427.25,244.0,450.25
max,72951.0,40.0,40.0,110.0,1.0,3558.0,1.0,8943.0,19.0,17.0,19.0,7.0,13.0,27.0,24.0,28.0,35.0,23.0,9.0,12.0,11.0,12.0,10.0,1.0,1.167,1.0,1.167,1.0,0.564,0.414,0.509,0.3,0.441,1.09,1.19,1.05,1.13,1.0,3723.0,2758.0,2799.0,2064.0,3658.0


In [8]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date")

In [9]:
# encode champion names
champion_cols = ["TOP_champion", "JUNGLE_champion", "MID_champion", "ADC_champion", "SUPPORT_champion"]
champion_encoders = {}
for col in champion_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  
    champion_encoders[col] = le

joblib.dump(champion_encoders, "champion_encoders.pkl")

['champion_encoders.pkl']

In [10]:
champion_encoders = joblib.load("champion_encoders.pkl")
champion_names_by_role = {}

for role, encoder in champion_encoders.items():
    champion_names_by_role[role] = list(encoder.classes_)
    print(f"{role} ({len(encoder.classes_)})")
    print(encoder.classes_)

TOP_champion (65)
['aatrox' 'akali' 'ambessa' 'anivia' 'annie' 'aurora' 'camille' 'chogath'
 'corki' 'diana' 'dr. mundo' 'fiora' 'galio' 'gangplank' 'garen' 'gnar'
 'gragas' 'gwen' 'irelia' 'jax' 'jayce' 'karma' 'kayle' 'kennen' 'kled'
 'ksante' 'malphite' 'maokai' 'mordekaiser' 'naafiri' 'nasus' 'neeko'
 'nidalee' 'olaf' 'ornn' 'poppy' 'quinn' 'reksai' 'renekton' 'riven'
 'rumble' 'ryze' 'sejuani' 'sett' 'shen' 'shyvana' 'singed' 'sion'
 'smolder' 'swain' 'sylas' 'trundle' 'udyr' 'urgot' 'varus' 'vayne' 'vex'
 'vladimir' 'volibear' 'warwick' 'wukong' 'yasuo' 'yone' 'yorick' 'zac']
JUNGLE_champion (47)
['ambessa' 'amumu' 'brand' 'diana' 'dr. mundo' 'fiddlesticks' 'gragas'
 'graves' 'gwen' 'hecarim' 'ivern' 'jarvan iv' 'jax' 'karthus' 'khazix'
 'kindred' 'lee sin' 'lillia' 'maokai' 'morgana' 'naafiri' 'nidalee'
 'nocturne' 'nunu' 'olaf' 'pantheon' 'poppy' 'qiyana' 'reksai' 'rengar'
 'sejuani' 'shen' 'shyvana' 'skarner' 'sylas' 'taliyah' 'talon' 'trundle'
 'tryndamere' 'vi' 'viego' 'voli

In [11]:
patch_encoder = OneHotEncoder(sparse_output=False, drop="first") 
patch_encoded = patch_encoder.fit_transform(df[["Patch"]])
patch_features = patch_encoder.get_feature_names_out(["Patch"])
patch_df = pd.DataFrame(patch_encoded, columns=patch_features, index=df.index)
df = pd.concat([df.drop("Patch", axis=1), patch_df], axis=1)
joblib.dump(patch_encoder, "patch_encoder.pkl")

['patch_encoder.pkl']

In [12]:
# initialize and fit the onehotencoder on the region and combine to get the encoded columns
region_encoder = OneHotEncoder(sparse_output=False, drop="first")
region_encoded = region_encoder.fit_transform(df[["Region"]])
feature_names = region_encoder.get_feature_names_out(["Region"])
region_df = pd.DataFrame(region_encoded, columns=feature_names, index=df.index)
df = pd.concat([df.drop("Region", axis=1), region_df], axis=1)

joblib.dump(region_encoder, "region_encoder.pkl")

['region_encoder.pkl']

In [13]:
# encode team names
team_encoder = LabelEncoder()
df["Team"] = team_encoder.fit_transform(df["Team"])
joblib.dump(team_encoder, "team_encoder.pkl")

['team_encoder.pkl']

In [14]:
print(df.columns.tolist())

['GameID', 'Team', 'Date', 'Kills', 'Deaths', 'Assists', 'Result', 'Game Time', 'Side', 'GD@15', 'ADC_champion', 'JUNGLE_champion', 'MID_champion', 'SUPPORT_champion', 'TOP_champion', 'ADC_player', 'JUNGLE_player', 'MID_player', 'SUPPORT_player', 'TOP_player', 'ADC_kills', 'JUNGLE_kills', 'MID_kills', 'SUPPORT_kills', 'TOP_kills', 'ADC_assists', 'JUNGLE_assists', 'MID_assists', 'SUPPORT_assists', 'TOP_assists', 'ADC_deaths', 'JUNGLE_deaths', 'MID_deaths', 'SUPPORT_deaths', 'TOP_deaths', 'ADC_kp%', 'JUNGLE_kp%', 'MID_kp%', 'SUPPORT_kp%', 'TOP_kp%', 'ADC_dmg%', 'JUNGLE_dmg%', 'MID_dmg%', 'SUPPORT_dmg%', 'TOP_dmg%', 'ADC_ka_per_minute', 'JUNGLE_ka_per_minute', 'MID_ka_per_minute', 'SUPPORT_ka_per_minute', 'TOP_ka_per_minute', 'ADC_gd@15', 'JUNGLE_gd@15', 'MID_gd@15', 'SUPPORT_gd@15', 'TOP_gd@15', 'Patch_15.10', 'Patch_15.11', 'Patch_15.13', 'Patch_15.14', 'Patch_15.15', 'Patch_15.16', 'Patch_15.17', 'Patch_15.18', 'Patch_15.2', 'Patch_15.20', 'Patch_15.3', 'Patch_15.4', 'Patch_15.5', 'Pat

In [15]:
# calculates historical averages per player per stat
df = df.sort_values("GameID").reset_index(drop=True)
roles = ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]
stat_columns = ["kills", "deaths", "assists", "kp%", "dmg%", "gd@15"]

for role in roles:
    player_col = f"{role}_player"
    for stat in stat_columns:
        stat_col = f"{role}_{stat}"
        avg_col = f"{role}_historical_avg_{stat}"
        # .shift(1) for historical averages to avoid data leakage
        df[avg_col] = df.groupby(player_col)[stat_col].transform(lambda x: x.expanding().mean().shift(1))
        # fill first game with the actual stats from the game (initially will be NAN since using .shift(1))
        df[avg_col] = df[avg_col].fillna(df[stat_col])


In [16]:
df.head(10)

Unnamed: 0,GameID,Team,Date,Kills,Deaths,Assists,Result,Game Time,Side,GD@15,ADC_champion,JUNGLE_champion,MID_champion,SUPPORT_champion,TOP_champion,ADC_player,JUNGLE_player,MID_player,SUPPORT_player,TOP_player,ADC_kills,JUNGLE_kills,MID_kills,SUPPORT_kills,TOP_kills,ADC_assists,JUNGLE_assists,MID_assists,SUPPORT_assists,TOP_assists,ADC_deaths,JUNGLE_deaths,MID_deaths,SUPPORT_deaths,TOP_deaths,ADC_kp%,JUNGLE_kp%,MID_kp%,SUPPORT_kp%,TOP_kp%,ADC_dmg%,JUNGLE_dmg%,MID_dmg%,SUPPORT_dmg%,TOP_dmg%,ADC_ka_per_minute,JUNGLE_ka_per_minute,MID_ka_per_minute,SUPPORT_ka_per_minute,TOP_ka_per_minute,ADC_gd@15,JUNGLE_gd@15,MID_gd@15,SUPPORT_gd@15,TOP_gd@15,Patch_15.10,Patch_15.11,Patch_15.13,Patch_15.14,Patch_15.15,Patch_15.16,Patch_15.17,Patch_15.18,Patch_15.2,Patch_15.20,Patch_15.3,Patch_15.4,Patch_15.5,Patch_15.6,Patch_15.7,Patch_15.8,Patch_15.9,Region_euw,Region_kr,Region_na,Region_wr,TOP_historical_avg_kills,TOP_historical_avg_deaths,TOP_historical_avg_assists,TOP_historical_avg_kp%,TOP_historical_avg_dmg%,TOP_historical_avg_gd@15,JUNGLE_historical_avg_kills,JUNGLE_historical_avg_deaths,JUNGLE_historical_avg_assists,JUNGLE_historical_avg_kp%,JUNGLE_historical_avg_dmg%,JUNGLE_historical_avg_gd@15,MID_historical_avg_kills,MID_historical_avg_deaths,MID_historical_avg_assists,MID_historical_avg_kp%,MID_historical_avg_dmg%,MID_historical_avg_gd@15,ADC_historical_avg_kills,ADC_historical_avg_deaths,ADC_historical_avg_assists,ADC_historical_avg_kp%,ADC_historical_avg_dmg%,ADC_historical_avg_gd@15,SUPPORT_historical_avg_kills,SUPPORT_historical_avg_deaths,SUPPORT_historical_avg_assists,SUPPORT_historical_avg_kp%,SUPPORT_historical_avg_dmg%,SUPPORT_historical_avg_gd@15
0,62896,40,2025-01-12,5,17,9,0,2123.0,0,-1051.0,25,22,29,24,15,starry,heng,linfeng,moham,hery,2,2,0,0,1,2,1,3,3,0,4,4,3,4,2,0.8,0.6,0.6,0.6,0.2,0.213,0.146,0.231,0.147,0.262,0.11,0.08,0.08,0.08,0.03,199.0,-528.0,-1072.0,-250.0,600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.2,0.262,600.0,2.0,4.0,1.0,0.6,0.146,-528.0,0.0,3.0,3.0,0.6,0.231,-1072.0,2.0,4.0,2.0,0.8,0.213,199.0,0.0,4.0,3.0,0.6,0.147,-250.0
1,62896,59,2025-01-12,17,5,33,1,2123.0,1,1051.0,1,40,6,5,25,light,tian,xiaohu,hang,breathe,3,4,5,1,4,6,5,8,11,3,2,0,1,2,0,0.529,0.529,0.765,0.706,0.412,0.197,0.126,0.474,0.079,0.124,0.25,0.25,0.37,0.34,0.2,-199.0,528.0,1072.0,250.0,-600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.412,0.124,-600.0,4.0,0.0,5.0,0.529,0.126,528.0,5.0,1.0,8.0,0.765,0.474,1072.0,3.0,2.0,6.0,0.529,0.197,-199.0,1.0,2.0,11.0,0.706,0.079,250.0
2,62897,40,2025-01-12,11,18,27,0,1952.0,1,-2888.0,7,18,2,31,40,starry,heng,linfeng,moham,hery,0,1,4,0,6,5,7,2,9,4,3,6,4,4,1,0.455,0.727,0.545,0.818,0.909,0.097,0.213,0.243,0.056,0.39,0.15,0.25,0.18,0.28,0.31,-1727.0,197.0,-922.0,-245.0,-191.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.2,0.262,600.0,2.0,4.0,1.0,0.6,0.146,-528.0,0.0,3.0,3.0,0.6,0.231,-1072.0,2.0,4.0,2.0,0.8,0.213,199.0,0.0,4.0,3.0,0.6,0.147,-250.0
3,62897,59,2025-01-12,18,11,40,1,1952.0,0,2888.0,14,42,47,29,20,light,tian,xiaohu,hang,breathe,4,6,1,2,5,9,6,7,11,7,1,2,3,2,3,0.722,0.667,0.444,0.722,0.667,0.316,0.157,0.306,0.044,0.178,0.4,0.37,0.25,0.4,0.37,1727.0,-197.0,922.0,245.0,191.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.412,0.124,-600.0,4.0,0.0,5.0,0.529,0.126,528.0,5.0,1.0,8.0,0.765,0.474,1072.0,3.0,2.0,6.0,0.529,0.197,-199.0,1.0,2.0,11.0,0.706,0.079,250.0
4,62898,59,2025-01-12,22,18,44,1,2014.0,0,-2460.0,8,30,38,27,19,light,tian,xiaohu,hang,breathe,8,0,4,2,8,6,17,7,10,4,1,4,8,4,1,0.636,0.773,0.5,0.545,0.545,0.38,0.172,0.184,0.09,0.174,0.42,0.51,0.33,0.36,0.36,-672.0,-936.0,-178.0,-1077.0,403.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,1.5,5.0,0.5395,0.151,-204.5,5.0,1.0,5.5,0.598,0.1415,165.5,3.0,2.0,7.5,0.6045,0.39,997.0,3.5,1.5,7.5,0.6255,0.2565,764.0,1.5,2.0,11.0,0.714,0.0615,247.5
5,62898,40,2025-01-12,18,22,40,0,2014.0,1,2460.0,32,46,51,16,0,starry,heng,linfeng,moham,hery,5,6,6,0,1,9,9,4,13,5,3,6,6,3,4,0.778,0.833,0.556,0.722,0.333,0.314,0.285,0.166,0.091,0.144,0.42,0.45,0.3,0.39,0.18,672.0,936.0,178.0,1077.0,-403.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,1.5,2.0,0.5545,0.326,204.5,1.5,5.0,4.0,0.6635,0.1795,-165.5,2.0,3.5,2.5,0.5725,0.237,-997.0,1.0,3.5,3.5,0.6275,0.155,-764.0,0.0,4.0,6.0,0.709,0.1015,-247.5
6,62901,31,2025-01-13,20,14,42,1,2098.0,1,-465.0,10,25,48,24,25,photic,weiwei,haichao,zhuo,zika,7,3,6,0,4,4,6,10,8,14,4,3,1,5,1,0.55,0.45,0.8,0.4,0.9,0.22,0.18,0.251,0.119,0.23,0.31,0.26,0.46,0.23,0.51,-985.0,217.0,-260.0,-65.0,628.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,14.0,0.9,0.23,628.0,3.0,3.0,6.0,0.45,0.18,217.0,6.0,1.0,10.0,0.8,0.251,-260.0,7.0,4.0,4.0,0.55,0.22,-985.0,0.0,5.0,8.0,0.4,0.119,-65.0
7,62901,56,2025-01-13,14,20,24,0,2098.0,0,465.0,5,39,6,16,40,1xn,beichuan,setab,feather,hoya,10,2,2,0,0,3,6,3,6,6,3,3,4,5,5,0.929,0.571,0.357,0.429,0.429,0.398,0.099,0.294,0.052,0.156,0.37,0.23,0.14,0.17,0.17,985.0,-217.0,260.0,65.0,-628.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,0.429,0.156,-628.0,2.0,3.0,6.0,0.571,0.099,-217.0,2.0,4.0,3.0,0.357,0.294,260.0,10.0,3.0,3.0,0.929,0.398,985.0,0.0,5.0,6.0,0.429,0.052,65.0
8,62902,31,2025-01-13,7,19,16,0,2053.0,0,524.0,3,16,40,31,38,photic,weiwei,haichao,zhuo,zika,0,3,0,0,4,3,4,2,4,3,4,5,2,5,3,0.429,1.0,0.286,0.571,1.0,0.251,0.146,0.264,0.06,0.278,0.09,0.2,0.06,0.12,0.2,-225.0,185.0,-109.0,-72.0,745.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,14.0,0.9,0.23,628.0,3.0,3.0,6.0,0.45,0.18,217.0,6.0,1.0,10.0,0.8,0.251,-260.0,7.0,4.0,4.0,0.55,0.22,-985.0,0.0,5.0,8.0,0.4,0.119,-65.0
9,62902,56,2025-01-13,19,7,61,1,2053.0,1,-524.0,9,42,1,29,15,1xn,beichuan,setab,feather,hoya,7,1,4,1,6,11,15,9,16,10,1,2,0,2,2,0.947,0.842,0.684,0.895,0.842,0.322,0.13,0.195,0.05,0.303,0.53,0.47,0.38,0.5,0.47,225.0,-185.0,109.0,72.0,-745.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,0.429,0.156,-628.0,2.0,3.0,6.0,0.571,0.099,-217.0,2.0,4.0,3.0,0.357,0.294,260.0,10.0,3.0,3.0,0.929,0.398,985.0,0.0,5.0,6.0,0.429,0.052,65.0


In [17]:
player_columns = ["TOP_player", "JUNGLE_player", "MID_player", "ADC_player", "SUPPORT_player"]
# dictionary to store encoders for each player role
player_encoders = {}

# use labelencoder for players 
for col in player_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    player_encoders[col] = encoder
    print(f"{col}: {len(encoder.classes_)}")

joblib.dump(player_encoders, "player_encoders.pkl")
print(df[player_columns].head())

TOP_player: 72
JUNGLE_player: 80
MID_player: 68
ADC_player: 77
SUPPORT_player: 71
   TOP_player  JUNGLE_player  MID_player  ADC_player  SUPPORT_player
0          29             26          40          62              42
1           8             65          65          40              18
2          29             26          40          62              42
3           8             65          65          40              18
4           8             65          65          40              18


In [18]:
# finds the first game a team plays to see which region they belong to
def get_team_region(df_original, encoded_team):
    team_games = df_original[df_original["Team"] == encoded_team]
    first_row = team_games.iloc[0]
    # regions are one hot encoded so if none are true it has to be cn
    if first_row.get("Region_kr", 0) == 1.0:
        return "kr"
    elif first_row.get("Region_euw", 0) == 1.0:
        return "euw"
    elif first_row.get("Region_na", 0) == 1.0:
        return "na"
    elif first_row.get("Region_wr", 0) == 1.0:
        return "wr"
    else:
        return "cn"


def calculate_team_elo_regional(df_original, k_factor):
    region_base_elo = {
        "kr": 1650, "cn": 1600, "euw": 1500, "na": 1450, "wr": 1450,
    }
    
    df = df_original.sort_values("GameID").reset_index(drop=True)
    team_elos = {}
    team_encoder = joblib.load("team_encoder.pkl")

    # group by region
    teams_by_region = {}
    for encoded_team in df["Team"].unique():
        region = get_team_region(df, encoded_team)     
        if region not in teams_by_region:
            teams_by_region[region] = []
        
        teams_by_region[region].append(encoded_team)
    
    # initialize elo
    for region, teams in teams_by_region.items():
        base_elo = region_base_elo.get(region)        
        for encoded_team in teams:
            team_elos[encoded_team] = base_elo
            team_name = team_encoder.inverse_transform([encoded_team])[0]
            print(f"{team_name}:{base_elo}")
    
    # store initial elo for each row
    df["team_elo_rating"] = 0.0
    for id, row in df.iterrows():
        team = row["Team"]
        df.loc[id, "team_elo_rating"] = team_elos[team]
    
    processed_games = set()
    # process each game and update elo
    for id, row in df.iterrows():
        game_id = row["GameID"]
        if game_id in processed_games:
            continue
        game_data = df[df["GameID"] == game_id]
        # makes sure there is two rows (one for each team)
        if len(game_data) != 2:
            continue
        # gets both teams
        team1 = game_data.iloc[0]
        team2 = game_data.iloc[1]
        # identify winner and loser
        if team1["Result"] == 1:
            winner_team, loser_team = team1["Team"], team2["Team"]
            winner_id, loser_id = team1.name, team2.name
        else:
            winner_team, loser_team = team2["Team"], team1["Team"]
            winner_id, loser_id = team2.name, team1.name
        
        winner_elo = team_elos[winner_team]
        loser_elo = team_elos[loser_team]

        # standard elo formula
        expected_winner = 1 / (1 + 10**((loser_elo - winner_elo) / 400))
        winner_change = k_factor * (1 - expected_winner)
        loser_change = k_factor * (0 - (1 - expected_winner))

        # update new elos
        team_elos[winner_team] += winner_change
        team_elos[loser_team] += loser_change
        
        df.loc[winner_id, "team_elo_rating"] = team_elos[winner_team]
        df.loc[loser_id, "team_elo_rating"] = team_elos[loser_team]
        
        processed_games.add(game_id)
    
    joblib.dump(team_elos, "final_team_elos.pkl")    
    print_final_rankings(df, teams_by_region, team_elos, team_encoder, region_base_elo)
    
    return df

def print_final_rankings(df, teams_by_region, team_elos, team_encoder, region_base_elo):
    # print each team from region sorted by highest to lowest elo
    for region in teams_by_region.keys():
        print(f"{region.upper} Region:")
        
        teams_in_region = []
        for encoded_team in teams_by_region[region]:
            team_name = team_encoder.inverse_transform([encoded_team])[0]
            final_elo = team_elos[encoded_team]
            teams_in_region.append((team_name, final_elo))
        
        # sort teams by highest to lowest elo
        teams_in_region.sort(key=lambda x: x[1], reverse=True)
        
        for team_name, elo in teams_in_region:
            print(f"{team_name}: {elo:.0f}")

def get_team_current_elo(df_original_with_elo, team_name):
    final_team_elos = joblib.load("final_team_elos.pkl")
    team_encoder = joblib.load("team_encoder.pkl")
    
    encoded_team = team_encoder.transform([team_name.lower()])[0]
    return final_team_elos[encoded_team]

def add_team_elo_to_combined_data(df_combined, df_original_with_elo):

    # initialize elo
    df_combined["blue_team_elo"] = 0.0
    df_combined["red_team_elo"] = 0.0
    # add team elo ratings for each game
    for id, row in df_combined.iterrows():
        game_id = row["GameID"]        
        blue_elo = df_original_with_elo[(df_original_with_elo["GameID"]== game_id)&(df_original_with_elo["Side"]== 1)]["team_elo_rating"].iloc[0]
        red_elo = df_original_with_elo[(df_original_with_elo["GameID"]== game_id)&(df_original_with_elo["Side"]== 0)]["team_elo_rating"].iloc[0]
        # store elo ratings
        df_combined.loc[id, "blue_team_elo"] = blue_elo
        df_combined.loc[id, "red_team_elo"] = red_elo
    
    df_combined["elo_difference"] = df_combined["blue_team_elo"] - df_combined["red_team_elo"]
    
    return df_combined


In [19]:
# use a k factor of 32 
df_original_with_elo = calculate_team_elo_regional(df.copy(), 32)
df = df_original_with_elo.copy()

omg:1600
weibo gaming:1600
lng esports:1600
tt:1600
royal never give up:1600
funplus phoenix:1600
ultra prime:1600
anyones legend:1600
lgd gaming:1600
bilibili gaming:1600
edward gaming:1600
team we:1600
top esports:1600
ninjas in pyjamas:1600
invictus gaming:1600
jd gaming:1600
drx:1650
ok brion:1650
nongshim redforce:1650
dn freecs:1650
kt rolster:1650
bnk fearx:1650
dplus kia:1650
t1:1650
hanwha life esports:1650
gen.g esports:1650
rogue:1500
team heretics:1500
team bds:1500
team vitality:1500
sk gaming:1500
giantx:1500
movistar koi:1500
fnatic:1500
karmine corp:1500
g2 esports:1500
natus vincere:1500
mgn vikings esports:1450
ctbc flying oyster:1450
gam esports:1450
fukuoka softbank hawks gaming:1450
detonation focusme:1450
team secret whales:1450
psg talon:1450
chiefs esports club:1450
pain gaming:1450
leviatan:1450
loud:1450
isurus estral:1450
furia:1450
red canids:1450
vivo keyd stars:1450
flyquest:1450
shopify rebellion:1450
lyon:1450
cloud9:1450
100 thieves:1450
disguised:1450


In [20]:
# get the one hot encoded patch columns
patch_columns = [col for col in df.columns if col.startswith("Patch_")]

# add the patch columns to the game level columns so it isn"t duplicated
game_level_columns = ["GameID", "Date", "Game Time", "Region_euw", "Region_kr", "Region_na", "Region_wr"] + patch_columns

# split by side
blue_side = df[df["Side"] == 1].reset_index(drop=True)
red_side = df[df["Side"] == 0].reset_index(drop=True)

# keep game level columns (no prefix)
game_info = blue_side[game_level_columns].copy()

# drop game level columns from team specific columns and add prefixes
blue_team_data = blue_side.drop(columns=game_level_columns).add_prefix("blue_")
red_team_data = red_side.drop(columns=game_level_columns).add_prefix("red_")

# combine game and team level data
df = pd.concat([game_info, blue_team_data, red_team_data], axis=1)

print("Game-level:", [col for col in df.columns if not col.startswith(("blue_", "red_"))])
print("Blue team:", [col for col in df.columns if col.startswith("blue_")])  
print("Red team:", [col for col in df.columns if col.startswith("red_")])

Game-level: ['GameID', 'Date', 'Game Time', 'Region_euw', 'Region_kr', 'Region_na', 'Region_wr', 'Patch_15.10', 'Patch_15.11', 'Patch_15.13', 'Patch_15.14', 'Patch_15.15', 'Patch_15.16', 'Patch_15.17', 'Patch_15.18', 'Patch_15.2', 'Patch_15.20', 'Patch_15.3', 'Patch_15.4', 'Patch_15.5', 'Patch_15.6', 'Patch_15.7', 'Patch_15.8', 'Patch_15.9']
Blue team: ['blue_Team', 'blue_Kills', 'blue_Deaths', 'blue_Assists', 'blue_Result', 'blue_Side', 'blue_GD@15', 'blue_ADC_champion', 'blue_JUNGLE_champion', 'blue_MID_champion', 'blue_SUPPORT_champion', 'blue_TOP_champion', 'blue_ADC_player', 'blue_JUNGLE_player', 'blue_MID_player', 'blue_SUPPORT_player', 'blue_TOP_player', 'blue_ADC_kills', 'blue_JUNGLE_kills', 'blue_MID_kills', 'blue_SUPPORT_kills', 'blue_TOP_kills', 'blue_ADC_assists', 'blue_JUNGLE_assists', 'blue_MID_assists', 'blue_SUPPORT_assists', 'blue_TOP_assists', 'blue_ADC_deaths', 'blue_JUNGLE_deaths', 'blue_MID_deaths', 'blue_SUPPORT_deaths', 'blue_TOP_deaths', 'blue_ADC_kp%', 'blue_JU

In [21]:
# show mappings of encoded champions per role
for role, encoder in champion_encoders.items():
    print(f"{role} mappings:")
    for i, champion in enumerate(encoder.classes_):
        print(f"{champion} -> {i}")

TOP_champion mappings:
aatrox -> 0
akali -> 1
ambessa -> 2
anivia -> 3
annie -> 4
aurora -> 5
camille -> 6
chogath -> 7
corki -> 8
diana -> 9
dr. mundo -> 10
fiora -> 11
galio -> 12
gangplank -> 13
garen -> 14
gnar -> 15
gragas -> 16
gwen -> 17
irelia -> 18
jax -> 19
jayce -> 20
karma -> 21
kayle -> 22
kennen -> 23
kled -> 24
ksante -> 25
malphite -> 26
maokai -> 27
mordekaiser -> 28
naafiri -> 29
nasus -> 30
neeko -> 31
nidalee -> 32
olaf -> 33
ornn -> 34
poppy -> 35
quinn -> 36
reksai -> 37
renekton -> 38
riven -> 39
rumble -> 40
ryze -> 41
sejuani -> 42
sett -> 43
shen -> 44
shyvana -> 45
singed -> 46
sion -> 47
smolder -> 48
swain -> 49
sylas -> 50
trundle -> 51
udyr -> 52
urgot -> 53
varus -> 54
vayne -> 55
vex -> 56
vladimir -> 57
volibear -> 58
warwick -> 59
wukong -> 60
yasuo -> 61
yone -> 62
yorick -> 63
zac -> 64
JUNGLE_champion mappings:
ambessa -> 0
amumu -> 1
brand -> 2
diana -> 3
dr. mundo -> 4
fiddlesticks -> 5
gragas -> 6
graves -> 7
gwen -> 8
hecarim -> 9
ivern -> 10
j

In [22]:
print(df.columns.tolist())

['GameID', 'Date', 'Game Time', 'Region_euw', 'Region_kr', 'Region_na', 'Region_wr', 'Patch_15.10', 'Patch_15.11', 'Patch_15.13', 'Patch_15.14', 'Patch_15.15', 'Patch_15.16', 'Patch_15.17', 'Patch_15.18', 'Patch_15.2', 'Patch_15.20', 'Patch_15.3', 'Patch_15.4', 'Patch_15.5', 'Patch_15.6', 'Patch_15.7', 'Patch_15.8', 'Patch_15.9', 'blue_Team', 'blue_Kills', 'blue_Deaths', 'blue_Assists', 'blue_Result', 'blue_Side', 'blue_GD@15', 'blue_ADC_champion', 'blue_JUNGLE_champion', 'blue_MID_champion', 'blue_SUPPORT_champion', 'blue_TOP_champion', 'blue_ADC_player', 'blue_JUNGLE_player', 'blue_MID_player', 'blue_SUPPORT_player', 'blue_TOP_player', 'blue_ADC_kills', 'blue_JUNGLE_kills', 'blue_MID_kills', 'blue_SUPPORT_kills', 'blue_TOP_kills', 'blue_ADC_assists', 'blue_JUNGLE_assists', 'blue_MID_assists', 'blue_SUPPORT_assists', 'blue_TOP_assists', 'blue_ADC_deaths', 'blue_JUNGLE_deaths', 'blue_MID_deaths', 'blue_SUPPORT_deaths', 'blue_TOP_deaths', 'blue_ADC_kp%', 'blue_JUNGLE_kp%', 'blue_MID_kp%

In [23]:
# make sure everything in number form for the model
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    print(df.dtypes)

GameID                                          int64
Date                                   datetime64[ns]
Game Time                                     float64
Region_euw                                    float64
Region_kr                                     float64
Region_na                                     float64
Region_wr                                     float64
Patch_15.10                                   float64
Patch_15.11                                   float64
Patch_15.13                                   float64
Patch_15.14                                   float64
Patch_15.15                                   float64
Patch_15.16                                   float64
Patch_15.17                                   float64
Patch_15.18                                   float64
Patch_15.2                                    float64
Patch_15.20                                   float64
Patch_15.3                                    float64
Patch_15.4                  

In [24]:
# these are results from the game, remove to avoid data leakage
in_game_stats = [
    "red_Result", "GameID", "Date", "Game Time", "blue_Kills", "red_Kills", "blue_Deaths", "red_Deaths", "blue_Assists", "red_Assists","blue_GD@15", "red_GD@15", "blue_ADC_kills", "blue_JUNGLE_kills", "blue_MID_kills", 
    "blue_SUPPORT_kills", "blue_TOP_kills","red_ADC_kills", "red_JUNGLE_kills", "red_MID_kills", "red_SUPPORT_kills", "red_TOP_kills","blue_ADC_deaths", "blue_JUNGLE_deaths", "blue_MID_deaths", "blue_SUPPORT_deaths", "blue_TOP_deaths",
    "red_ADC_deaths", "red_JUNGLE_deaths", "red_MID_deaths", "red_SUPPORT_deaths", "red_TOP_deaths","blue_ADC_assists", "blue_JUNGLE_assists", "blue_MID_assists", "blue_SUPPORT_assists", "blue_TOP_assists",
    "red_ADC_assists", "red_JUNGLE_assists", "red_MID_assists", "red_SUPPORT_assists", "red_TOP_assists","blue_ADC_kp%", "blue_JUNGLE_kp%", "blue_MID_kp%", "blue_SUPPORT_kp%", "blue_TOP_kp%",
    "red_ADC_kp%", "red_JUNGLE_kp%", "red_MID_kp%", "red_SUPPORT_kp%", "red_TOP_kp%", "blue_ADC_dmg%", "blue_JUNGLE_dmg%", "blue_MID_dmg%", "blue_SUPPORT_dmg%", "blue_TOP_dmg%","red_ADC_dmg%", 
    "red_JUNGLE_dmg%", "red_MID_dmg%", "red_SUPPORT_dmg%", "red_TOP_dmg%","blue_ADC_ka_per_minute", "blue_JUNGLE_ka_per_minute", "blue_MID_ka_per_minute", "blue_SUPPORT_ka_per_minute", "blue_TOP_ka_per_minute",
    "red_ADC_ka_per_minute", "red_JUNGLE_ka_per_minute", "red_MID_ka_per_minute", "red_SUPPORT_ka_per_minute", "red_TOP_ka_per_minute", "blue_ADC_gd@15", "blue_JUNGLE_gd@15", "blue_MID_gd@15", "blue_SUPPORT_gd@15", "blue_TOP_gd@15",
    "red_ADC_gd@15", "red_JUNGLE_gd@15", "red_MID_gd@15", "red_SUPPORT_gd@15", "red_TOP_gd@15", "blue_Side", "red_Side"
]

# predict blue team result
target = "blue_Result"
y = df[target]
# only use pre-game features
feature_columns = [col for col in df.columns if col != target and col not in in_game_stats]
data = df[feature_columns]

red_wins, blue_wins = np.bincount(y)
total = red_wins + blue_wins

print(f"Win rate analysis:")
print(f"Blue team win rate: {round(100 * blue_wins / total, 2)}%")
print(f"Red team win rate: {round(100 * red_wins / total, 2)}%")

# 80/20 split for test and training
train_val_data, test_data, train_val_labels, test_labels = train_test_split(
    data, y, 
    test_size=0.20, 
    train_size=0.80, 
    random_state=42,
    stratify=y  
)

# split again for validation set
train_data, validation_data, train_labels, validation_labels = train_test_split(
    train_val_data, train_val_labels, 
    test_size=0.20, 
    train_size=0.80, 
    random_state=42,
    stratify=train_val_labels
)

train_percentage = round((len(train_data) / len(data)) * 100, 1)
validation_percentage = round((len(validation_data) / len(data)) * 100, 1)
test_percentage = round((len(test_data) / len(data)) * 100, 1)

print("Training set:", len(train_data), "games (", train_percentage, "%)")
print("Validation set:", len(validation_data), "games (", validation_percentage, "%)")
print("Test set:", len(test_data), "games (", test_percentage, "%)")
print("Total:", len(data), "games")

print("Win rate balance check:")
print("Train blue win rate:", round(train_labels.mean(), 3))
print("Validation blue win rate:", round(validation_labels.mean(), 3))
print("Test blue win rate:", round(test_labels.mean(), 3))

# hyperparameter grids for tuning
# different regularization strengths 
elastic_params = {
    "classifier__C": [0.01, 0.05, 0.1, 0.5, 1],
    "classifier__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    "classifier__max_iter": [1000, 2000]
}


xgb_params = {
    "classifier__n_estimators": [30, 50],
    "classifier__learning_rate": [0.03, 0.05],
    "classifier__max_depth": [3, 4],
    "classifier__reg_alpha": [0.1, 0.3],
    "classifier__reg_lambda": [0.1, 0.3],
    "classifier__subsample": [0.7, 0.8,],
    "classifier__colsample_bytree": [0.6, 0.7, 0.8]
}

ridge_params = {
    "classifier__alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
}

svm_params = {
    "classifier__C": [0.01, 0.1, 0.5, 1.0],
    "classifier__gamma": ["scale", "auto", 0.001, 0.01]
}

rf_params = {
    "classifier__n_estimators": [30, 50],
    "classifier__max_depth": [3, 4, 5],
    "classifier__min_samples_split": [20, 30, 50],
    "classifier__min_samples_leaf": [10, 15, 20],
    "classifier__max_features": ["sqrt", 0.3, 0.5]
}

gb_params = {
    "classifier__n_estimators": [30, 50, 100],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__max_depth": [2, 3, 4, 5],
    "classifier__subsample": [0.7, 0.8, 0.9]
}

elastic_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(
        random_state=42,
        penalty="elasticnet",
        solver="saga"
    ))
])

xgb_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", xgb.XGBClassifier(
        random_state=42,
        eval_metric="logloss"
    ))
])

ridge_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", RidgeClassifier(
        random_state=42,
        class_weight="balanced"
    ))
])

svm_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", SVC(
        random_state=42,
        kernel="rbf",
        class_weight="balanced",
        probability=True
    ))
])

rf_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(
        random_state=42,
        class_weight="balanced",
        max_features="sqrt"
    ))
])

gb_base = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", GradientBoostingClassifier(
        random_state=42,
        max_features="sqrt"
    ))
])

# create gridsearch models to get optimal combination of hyperparameters
models = {
    "Elastic Net": GridSearchCV(
        elastic_base, elastic_params, cv=5, scoring="accuracy", n_jobs=-1
    ),
    "XGBoost": GridSearchCV(
        xgb_base, xgb_params, cv=5, scoring="accuracy", n_jobs=-1
    ),
    "Ridge": GridSearchCV(
        ridge_base, ridge_params, cv=5, scoring="accuracy", n_jobs=-1
    ),
    "SVM": GridSearchCV(
        svm_base, svm_params, cv=5, scoring="accuracy", n_jobs=-1
    ),
    "Random Forest": GridSearchCV(
        rf_base, rf_params, cv=5, scoring="accuracy", n_jobs=-1
    ),
    "Gradient Boosting": GridSearchCV(
        gb_base, gb_params, cv=5, scoring="accuracy", n_jobs=-1
    )
}

# train and evaluate all models
for name, model in models.items():
    print(f"\n{name}:")
    
    model.fit(train_data, train_labels)
    print(f"Best parameters: {model.best_params_}")
    print("Best CV score:", round(model.best_score_, 4))

    # get predictions on best models
    train_pred = model.predict(train_data)
    val_pred = model.predict(validation_data)
    test_pred = model.predict(test_data)

    # calculate accuracies
    train_acc = accuracy_score(train_labels, train_pred)
    val_acc = accuracy_score(validation_labels, val_pred)
    test_acc = accuracy_score(test_labels, test_pred)
    overfitting = train_acc - val_acc
    
    print("Training accuracy:", round(train_acc, 4))
    print("Validation accuracy:", round(val_acc, 4))
    print("Test accuracy:", round(test_acc, 4))
    print("Overfitting:", round(overfitting, 4))
    
    print("Test set classification report:")
    print(classification_report(test_labels, test_pred, target_names=["Red Wins", "Blue Wins"]))

    cm = confusion_matrix(test_labels, test_pred)
    print("Confusion Matrix (Test Set):")
    print("         Predicted")
    print("Actual Red Blue")
    print("Red ", cm[0,0], cm[0,1])
    print("Blue", cm[1,0], cm[1,1])

    # 5-fold cross validation
    cv_scores = cross_val_score(model, data, y, cv=5, scoring="accuracy")
    print("Cross-validation scores:", cv_scores)
    print("Mean CV accuracy:", round(cv_scores.mean(), 4))
    print("Standard deviation:", round(cv_scores.std(), 4))
    print("95% confidence interval:", round(cv_scores.mean(), 4), "(+/-", round(cv_scores.std()*2, 4), ")")


Win rate analysis:
Blue team win rate: 52.92%
Red team win rate: 47.08%
Training set: 1478 games ( 64.0 %)
Validation set: 370 games ( 16.0 %)
Test set: 463 games ( 20.0 %)
Total: 2311 games
Win rate balance check:
Train blue win rate: 0.529
Validation blue win rate: 0.53
Test blue win rate: 0.529

Elastic Net:
Best parameters: {'classifier__C': 0.05, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000}
Best CV score: 0.774
Training accuracy: 0.7828
Validation accuracy: 0.7703
Test accuracy: 0.7819
Overfitting: 0.0125
Test set classification report:
              precision    recall  f1-score   support

    Red Wins       0.76      0.78      0.77       218
   Blue Wins       0.80      0.79      0.79       245

    accuracy                           0.78       463
   macro avg       0.78      0.78      0.78       463
weighted avg       0.78      0.78      0.78       463

Confusion Matrix (Test Set):
         Predicted
Actual Red Blue
Red  169 49
Blue 52 193
Cross-validation scores

In [24]:
# these are results from the game, remove to avoid data leakage
in_game_stats = [
    "red_Result", "GameID", "Date", "Game Time", "blue_Kills", "red_Kills", "blue_Deaths", "red_Deaths", "blue_Assists", "red_Assists","blue_GD@15", "red_GD@15", "blue_ADC_kills", "blue_JUNGLE_kills", "blue_MID_kills", 
    "blue_SUPPORT_kills", "blue_TOP_kills","red_ADC_kills", "red_JUNGLE_kills", "red_MID_kills", "red_SUPPORT_kills", "red_TOP_kills","blue_ADC_deaths", "blue_JUNGLE_deaths", "blue_MID_deaths", "blue_SUPPORT_deaths", "blue_TOP_deaths",
    "red_ADC_deaths", "red_JUNGLE_deaths", "red_MID_deaths", "red_SUPPORT_deaths", "red_TOP_deaths","blue_ADC_assists", "blue_JUNGLE_assists", "blue_MID_assists", "blue_SUPPORT_assists", "blue_TOP_assists",
    "red_ADC_assists", "red_JUNGLE_assists", "red_MID_assists", "red_SUPPORT_assists", "red_TOP_assists","blue_ADC_kp%", "blue_JUNGLE_kp%", "blue_MID_kp%", "blue_SUPPORT_kp%", "blue_TOP_kp%",
    "red_ADC_kp%", "red_JUNGLE_kp%", "red_MID_kp%", "red_SUPPORT_kp%", "red_TOP_kp%", "blue_ADC_dmg%", "blue_JUNGLE_dmg%", "blue_MID_dmg%", "blue_SUPPORT_dmg%", "blue_TOP_dmg%","red_ADC_dmg%", 
    "red_JUNGLE_dmg%", "red_MID_dmg%", "red_SUPPORT_dmg%", "red_TOP_dmg%","blue_ADC_ka_per_minute", "blue_JUNGLE_ka_per_minute", "blue_MID_ka_per_minute", "blue_SUPPORT_ka_per_minute", "blue_TOP_ka_per_minute",
    "red_ADC_ka_per_minute", "red_JUNGLE_ka_per_minute", "red_MID_ka_per_minute", "red_SUPPORT_ka_per_minute", "red_TOP_ka_per_minute", "blue_ADC_gd@15", "blue_JUNGLE_gd@15", "blue_MID_gd@15", "blue_SUPPORT_gd@15", "blue_TOP_gd@15",
    "red_ADC_gd@15", "red_JUNGLE_gd@15", "red_MID_gd@15", "red_SUPPORT_gd@15", "red_TOP_gd@15", "blue_Side", "red_Side"
]
# predict blue team result
target = "blue_Result"
y = df[target]
# only use pre-game features
feature_columns = [col for col in df.columns if col != target and col not in in_game_stats]
joblib.dump(feature_columns, "feature_columns.pkl")
data = df[feature_columns]

red_wins, blue_wins = np.bincount(y)
total = red_wins + blue_wins

print(f"Win rate analysis:")
print(f"Blue team win rate: {round(100 * blue_wins / total, 2)}%")
print(f"Red team win rate: {round(100 * red_wins / total, 2)}%")

# 80/20 split for test and training
train_val_data, test_data, train_val_labels, test_labels = train_test_split(
    data, y, 
    test_size=0.20, 
    train_size=0.80, 
    random_state=42,
    stratify=y  
)

# split again for validation set
train_data, validation_data, train_labels, validation_labels = train_test_split(
    train_val_data, train_val_labels, 
    test_size=0.20, 
    train_size=0.80, 
    random_state=42,
    stratify=train_val_labels
)

train_percentage = round((len(train_data) / len(data)) * 100, 1)
validation_percentage = round((len(validation_data) / len(data)) * 100, 1)
test_percentage = round((len(test_data) / len(data)) * 100, 1)

print("Training set:", len(train_data), "games (", train_percentage, "%)")
print("Validation set:", len(validation_data), "games (", validation_percentage, "%)")
print("Test set:", len(test_data), "games (", test_percentage, "%)")
print("Total:", len(data), "games")

print("Win rate balance check:")
print("Train blue win rate:", round(train_labels.mean(), 3))
print("Validation blue win rate:", round(validation_labels.mean(), 3))
print("Test blue win rate:", round(test_labels.mean(), 3))


models = {
    "Elastic Net": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(
            random_state=42,
            penalty="elasticnet",
            solver="saga",
            C=0.05,
            l1_ratio=0.9,
            max_iter=1000
        ))
    ]),
    
    "Voting Ensemble": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", VotingClassifier([
            ("elastic", LogisticRegression(
                random_state=42,
                penalty="elasticnet",
                solver="saga",
                C=0.05,
                l1_ratio=0.9,
                max_iter=1000
            )),
            ("xgb", xgb.XGBClassifier(
                random_state=42,
                n_estimators=50,
                learning_rate=0.05,
                max_depth=3,
                reg_alpha=0.3,
                reg_lambda=0.3,
                subsample=0.7,
                colsample_bytree=0.8,
                eval_metric="logloss"
            ))
        ], voting="soft"))
    ])
}

# Train and evaluate models
for name, model in models.items():
    print(f"\n{name}:")
    
    model.fit(train_data, train_labels)

    # get predictions on best models
    train_pred = model.predict(train_data)
    val_pred = model.predict(validation_data)
    test_pred = model.predict(test_data)

    # calculate accuracies
    train_acc = accuracy_score(train_labels, train_pred)
    val_acc = accuracy_score(validation_labels, val_pred)
    test_acc = accuracy_score(test_labels, test_pred)
    overfitting = train_acc - val_acc
    
    print("Training accuracy:", round(train_acc, 4))
    print("Validation accuracy:", round(val_acc, 4))
    print("Test accuracy:", round(test_acc, 4))
    print("Overfitting:", round(overfitting, 4))
    
    print("Test set classification report:")
    print(classification_report(test_labels, test_pred, target_names=["Red Wins", "Blue Wins"]))
    
    cm = confusion_matrix(test_labels, test_pred)
    print("Confusion Matrix (Test Set):")
    print("         Predicted")
    print("Actual Red Blue")
    print("Red ", cm[0,0], cm[0,1])
    print("Blue", cm[1,0], cm[1,1])

    # 5-fold cross validation
    cv_scores = cross_val_score(model, data, y, cv=5, scoring="accuracy")
    print("Cross-validation scores:", cv_scores)
    print("Mean CV accuracy:", round(cv_scores.mean(), 4))
    print("Standard deviation:", round(cv_scores.std(), 4))
    print("95% confidence interval:", round(cv_scores.mean(), 4), "(+/-", round(cv_scores.std()*2, 4), ")")


# save the models
joblib.dump(models["Voting Ensemble"], "voting_ensemble_model.pkl")
joblib.dump(models["Elastic Net"], "elastic_net_model.pkl")

# save the processed dataframe
df.to_csv("processed_historical_data.csv", index=False)

Win rate analysis:
Blue team win rate: 52.82%
Red team win rate: 47.18%
Training set: 1498 games ( 64.0 %)
Validation set: 375 games ( 16.0 %)
Test set: 469 games ( 20.0 %)
Total: 2342 games
Win rate balance check:
Train blue win rate: 0.528
Validation blue win rate: 0.528
Test blue win rate: 0.529

Elastic Net:
Training accuracy: 0.7817
Validation accuracy: 0.7867
Test accuracy: 0.7825
Overfitting: -0.005
Test set classification report:
              precision    recall  f1-score   support

    Red Wins       0.77      0.77      0.77       221
   Blue Wins       0.80      0.79      0.79       248

    accuracy                           0.78       469
   macro avg       0.78      0.78      0.78       469
weighted avg       0.78      0.78      0.78       469

Confusion Matrix (Test Set):
         Predicted
Actual Red Blue
Red  171 50
Blue 52 196
Cross-validation scores: [0.75692964 0.75906183 0.70940171 0.7457265  0.72435897]
Mean CV accuracy: 0.7391
Standard deviation: 0.0193
95% confi

In [25]:
class LolPredictor:
    def __init__(self):
        self.load_data()

    # load encoders, models and model inputs
    def load_data(self):
        self.encoders = {
            "champion": joblib.load("champion_encoders.pkl"),
            "player": joblib.load("player_encoders.pkl"),
            "team": joblib.load("team_encoder.pkl"),
            "region": joblib.load("region_encoder.pkl"),
            "patch": joblib.load("patch_encoder.pkl")
        }
        self.final_team_elos = joblib.load("final_team_elos.pkl")
        self.feature_columns = joblib.load("feature_columns.pkl")
        self.voting_model = joblib.load("voting_ensemble_model.pkl")
        self.elastic_model = joblib.load("elastic_net_model.pkl")
        self.df_original = pd.read_csv("processed_historical_data.csv")
    
    def get_player_historical_stats(self, player_name, role):
        stat_columns = ["kills", "deaths", "assists", "kp%", "dmg%", "gd@15"]
        historical_stats = {}
        
        # get all games for this player in this role (from both blue and red sides)
        blue_player_col = f"blue_{role}_player"
        red_player_col = f"red_{role}_player"

        encoded_player = self.encoders["player"][f"{role}_player"].transform([player_name.lower()])[0]
        blue_games = self.df_original[self.df_original[f"blue_{role}_player"] == encoded_player]
        red_games = self.df_original[self.df_original[f"red_{role}_player"] == encoded_player]
        
        for stat in stat_columns:
            blue_stat_col = f"blue_{role}_{stat}"
            red_stat_col = f"red_{role}_{stat}"
            
            # combine stats from both blue and red games
            all_stat_values = []
            
            if len(blue_games) > 0:
                all_stat_values.extend(blue_games[blue_stat_col].tolist())
            if len(red_games) > 0:
                all_stat_values.extend(red_games[red_stat_col].tolist())
            if len(all_stat_values) == 0:
                # if no historical data use historical data
                blue_avg = self.df_original[blue_stat_col].mean()
                red_avg = self.df_original[red_stat_col].mean()
                historical_stats[stat] = (blue_avg + red_avg) / 2
            else:
                # calculate player"s historical average
                historical_stats[stat] = sum(all_stat_values) / len(all_stat_values)
        
        return historical_stats

    # gets their latest elo
    def get_team_elo(self, team_name):
        encoded_team = self.encoders["team"].transform([team_name.lower()])[0]
        return self.final_team_elos[encoded_team]

    
    def predict_match(self, match_info, model):
        prediction_data = {}
        
        # one hot encode the patch number
        patch_df = pd.DataFrame({"Patch": [str(match_info["patch"])]})
        patch_encoded = self.encoders["patch"].transform(patch_df)
        patch_features = self.encoders["patch"].get_feature_names_out(["Patch"])
        for i, feature_name in enumerate(patch_features):
            prediction_data[feature_name] = patch_encoded[0][i]
        
        # one hot encode the region
        region_df = pd.DataFrame({"Region": [match_info["region"].lower()]})
        region_encoded = self.encoders["region"].transform(region_df)
        region_features = self.encoders["region"].get_feature_names_out(["Region"])
        for i, feature_name in enumerate(region_features):
            prediction_data[feature_name] = region_encoded[0][i]
        
        # add team elo per team
        prediction_data["blue_team_elo_rating"] = self.get_team_elo(match_info["blue_team"]["team_name"])
        prediction_data["red_team_elo_rating"] = self.get_team_elo(match_info["red_team"]["team_name"])
        
        # encode teams, players and champions
        for team_color in ["blue", "red"]:
            team_data = match_info[f"{team_color}_team"]
            prediction_data[f"{team_color}_Team"] = self.encoders["team"].transform([team_data["team_name"].lower()])[0]
            
            for role in ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]:
                player_name = team_data["players"][role]
                champion_name = team_data["champions"][role]

                # encode players for each role
                prediction_data[f"{team_color}_{role}_player"] = self.encoders["player"][f"{role}_player"].transform([player_name.lower()])[0]
                # encode champions for each role
                prediction_data[f"{team_color}_{role}_champion"] = self.encoders["champion"][f"{role}_champion"].transform([champion_name.lower()])[0]

                # add historical average stats
                historical_stats = self.get_player_historical_stats(player_name, role)
                for stat, avg_value in historical_stats.items():
                    prediction_data[f"{team_color}_{role}_historical_avg_{stat}"] = avg_value
        
        # create prediction dataframe and wrape prediction_data
        pred_df = pd.DataFrame([prediction_data])
        pred_df = pred_df.reindex(columns=self.feature_columns, fill_value=0.0)
        
        # make prediction and get probability of blue team winning
        blue_win_prob = model.predict_proba(pred_df)[0][1]
        # assign winner if win prob > 0.5
        predicted_winner = "Blue" if blue_win_prob > 0.5 else "Red"

        
        return {
            "predicted_winner": predicted_winner, "blue_win_probability": blue_win_prob
        }
    
   
    def predict_voting(self, match_info):
        return self.predict_match(match_info, self.voting_model)
    
    def predict_elastic(self, match_info):
        return self.predict_match(match_info, self.elastic_model)

    # get all teams
    def get_teams(self):
        return sorted(self.encoders["team"].classes_)

    # get all champions for a specific role
    def get_champions(self, role):
        return sorted(self.encoders["champion"][f"{role}_champion"].classes_)
            
    # get all players for a specific role
    def get_players(self, role):
        return sorted(self.encoders["player"][f"{role}_player"].classes_)

    # get all players who have played for a specific team
    def get_team_players(self, team_name):
        encoded_team = self.encoders["team"].transform([team_name.lower()])[0]
        team_players = {}
        
        for role in ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]:
            # combine both blue and red games
            blue_players = set(self.df_original[self.df_original["blue_Team"] == encoded_team][f"blue_{role}_player"])
            red_players = set(self.df_original[self.df_original["red_Team"] == encoded_team][f"red_{role}_player"])
            
            all_players = blue_players | red_players
            decoded_players = [self.encoders["player"][f"{role}_player"].inverse_transform([p])[0] for p in all_players]
            team_players[role] = sorted(decoded_players)
        
        return team_players
    def get_regions(self):
        # get regions from one hot encoded columns
        region_columns = [col for col in self.df_original.columns if col.startswith('Region_')]
        # default region not one hot encoded
        regions = ['cn']
        
        # add regions that have been one-hot encoded
        for col in region_columns:
            region_name = col.replace('Region_', '')
            regions.append(region_name)
        
        return sorted(regions)

    def get_patches(self):
        # get patches from one hot encoded columns
        patch_columns = [col for col in self.df_original.columns if col.startswith('Patch_')]
        # default patch not one hot encoded
        patches = ["15.1"]
    
        # add patches that have one hot columns
        for col in patch_columns:
            patch_str = col.replace('Patch_', '')
            patches.append(patch_str)
        sorted_patches = sorted(patches, key=lambda x: (int(str(x).split('.')[0]), int(str(x).split('.')[1])))
        return sorted_patches

    def create_match_info(self, patch, region, blue_team, red_team):
        return {
            "patch": patch, "region": region, "blue_team": blue_team, "red_team": red_team
            }
            


def print_prediction(result, model_name):
        winner = result["predicted_winner"]
        blue_prob = result["blue_win_probability"]
        red_prob = 1 - blue_prob
        
        print(f"{model_name} Team:")
        print(f"Winner: {winner}")
        print(f"Blue win probability:{blue_prob:.1%} || Red win probability:{red_prob:.1%}")


if __name__ == "__main__":
    # initialize predictor
    predictor = LolPredictor()

    # match data from before the game starts
    blue_team = {
        "team_name": "t1",
        "players": {"TOP": "doran", "JUNGLE": "oner", "MID": "faker", "ADC": "gumayusi", "SUPPORT": "keria"},
        "champions": {"TOP": "gwen", "JUNGLE": "jarvan iv", "MID": "azir", "ADC": "corki", "SUPPORT": "poppy"}
    }

    red_team = {
        "team_name": "gen.g esports",
        "players": {"TOP": "kiin", "JUNGLE": "canyon", "MID": "chovy", "ADC": "ruler", "SUPPORT": "duro"},
        "champions": {"TOP": "sion", "JUNGLE": "wukong", "MID": "annie", "ADC": "kaisa", "SUPPORT": "neeko"}
    }
    match_info = predictor.create_match_info(15.17, "kr",  blue_team, red_team)
    # test both models
    voting_result = predictor.predict_voting(match_info)
    elastic_result = predictor.predict_elastic(match_info)
        
    print_prediction(voting_result, "Voting Model")
    print_prediction(elastic_result, "Elastic Net Model")
    print(predictor.get_teams())
    print(predictor.get_team_players("bilibili gaming"))
    print(predictor.get_champions("TOP"))
    print(predictor.get_regions())
    print(predictor.get_patches())

Voting Model Team:
Winner: Red
Blue win probability:34.5% || Red win probability:65.5%
Elastic Net Model Team:
Winner: Red
Blue win probability:27.6% || Red win probability:72.4%
['100 thieves', 'anyones legend', 'bilibili gaming', 'bnk fearx', 'chiefs esports club', 'cloud9', 'ctbc flying oyster', 'detonation focusme', 'dignitas', 'disguised', 'dn freecs', 'dplus kia', 'drx', 'edward gaming', 'flyquest', 'fnatic', 'fukuoka softbank hawks gaming', 'funplus phoenix', 'furia', 'g2 esports', 'gam esports', 'gen.g esports', 'giantx', 'hanwha life esports', 'invictus gaming', 'isurus estral', 'jd gaming', 'karmine corp', 'kt rolster', 'leviatan', 'lgd gaming', 'lng esports', 'loud', 'lyon', 'mgn vikings esports', 'movistar koi', 'natus vincere', 'ninjas in pyjamas', 'nongshim redforce', 'ok brion', 'omg', 'pain gaming', 'psg talon', 'red canids', 'rogue', 'royal never give up', 'shopify rebellion', 'sk gaming', 't1', 'team bds', 'team heretics', 'team liquid', 'team secret whales', 'team vi