# import

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# data

In [3]:
data_train = pd.read_csv('./data/training.csv')

In [179]:
data_test = pd.read_csv('./data/testing.csv')

# explore

In [4]:
data_train.shape

(6500, 28)

In [5]:
data_train.columns

Index(['matchId', 'gameLength', 'team1Top', 'team1Jg', 'team1Mid', 'team1Adc',
       'team1Supp', 'team2Top', 'team2Jg', 'team2Mid', 'team2Adc', 'team2Supp',
       't1Dragons', 't2Dragons', 't1Rift', 't2Rift', 'topGoldDiff',
       'jgGoldDiff', 'midGoldDiff', 'adcGoldDiff', 'suppGoldDiff',
       't1TopTowerTaken', 't1MidTowerTaken', 't1BotTowerTaken',
       't2TopTowerTaken', 't2MidTowerTaken', 't2BotTowerTaken', 'winningTeam'],
      dtype='object')

In [6]:
data_train.head()

Unnamed: 0,matchId,gameLength,team1Top,team1Jg,team1Mid,team1Adc,team1Supp,team2Top,team2Jg,team2Mid,...,midGoldDiff,adcGoldDiff,suppGoldDiff,t1TopTowerTaken,t1MidTowerTaken,t1BotTowerTaken,t2TopTowerTaken,t2MidTowerTaken,t2BotTowerTaken,winningTeam
0,NA1_4243514565,1942,Garen,Sylas,Hecarim,Draven,Yuumi,Gragas,Graves,Ekko,...,-465.0,-50.0,-1315.0,0.0,0.0,1.0,0.0,0.0,0.0,Red
1,NA1_4255034369,1284,Trundle,Karthus,Syndra,Jhin,Lulu,Sion,Kayn,Heimerdinger,...,306.0,-1621.0,-419.0,1.0,0.0,1.0,0.0,0.0,0.0,Red
2,NA1_4244786483,1106,Galio,Kindred,Sylas,Zeri,Soraka,Mordekaiser,Nocturne,Pantheon,...,-4982.0,-2883.0,-2906.0,0.0,1.0,1.0,0.0,0.0,0.0,Red
3,NA1_4242168068,2110,Garen,Ekko,Qiyana,Kaisa,Thresh,Poppy,Ivern,Vladimir,...,2865.0,-1535.0,-1747.0,0.0,0.0,0.0,1.0,0.0,0.0,Red
4,NA1_4243090657,1660,Riven,Graves,Ahri,Vayne,Karma,Nasus,Brand,Xerath,...,172.0,-933.0,-596.0,0.0,0.0,0.0,0.0,0.0,0.0,Red


In [7]:
data_train.describe()

Unnamed: 0,gameLength,t1Dragons,t2Dragons,t1Rift,t2Rift,topGoldDiff,jgGoldDiff,midGoldDiff,adcGoldDiff,suppGoldDiff,t1TopTowerTaken,t1MidTowerTaken,t1BotTowerTaken,t2TopTowerTaken,t2MidTowerTaken,t2BotTowerTaken
count,6500.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0,6498.0
mean,1732.938462,0.721299,0.743613,0.414743,0.373346,34.021853,-39.087873,-0.807325,11.194829,16.100646,0.21422,0.13281,0.137889,0.234688,0.135426,0.143429
std,415.890586,0.694342,0.706928,0.492716,0.48373,1665.659707,1307.462901,1399.969521,1545.382121,1062.987108,0.467475,0.402085,0.36265,0.493938,0.410127,0.370184
min,191.0,0.0,0.0,0.0,0.0,-7594.0,-4670.0,-5737.0,-7044.0,-4429.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1480.0,0.0,0.0,0.0,0.0,-1061.75,-891.0,-908.5,-952.0,-631.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1732.0,1.0,1.0,0.0,0.0,0.0,-30.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1993.0,1.0,1.0,1.0,1.0,1108.0,814.75,893.0,991.75,680.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3161.0,2.0,2.0,1.0,1.0,6010.0,6603.0,7113.0,6580.0,4780.0,4.0,6.0,3.0,4.0,6.0,4.0


# Cleaning

In [8]:
data_train.dropna(inplace=True)

In [9]:
def unique_characters(data_train):
    character_cols = ['team1Top', 'team1Jg', 'team1Mid', 'team1Adc', 'team1Supp', 'team2Top', 'team2Jg', 'team2Mid', 'team2Adc', 'team2Supp']
    characters = set()

    for col in character_cols:
        unique = data_train[col].unique().tolist()
        #print(col, len(unique))
        for character in unique:
            characters.add(character)

    return characters


In [10]:
characters = unique_characters(data_train)

In [11]:
characters = list(characters)

In [12]:
characters_map = {character: i+1 for i, character in enumerate(characters)}

# defining input

In [13]:
numerical_cols = ['gameLength', 't1Dragons', 't2Dragons', 't1Rift', 't2Rift', 'topGoldDiff',
       'jgGoldDiff', 'midGoldDiff', 'adcGoldDiff', 'suppGoldDiff',
       't1TopTowerTaken', 't1MidTowerTaken', 't1BotTowerTaken',
       't2TopTowerTaken', 't2MidTowerTaken', 't2BotTowerTaken']
character_cols = ['team1Top', 'team1Jg', 'team1Mid', 'team1Adc',
       'team1Supp', 'team2Top', 'team2Jg', 'team2Mid', 'team2Adc', 'team2Supp']
target_col = ['winningTeam']

In [14]:
data_train.replace({ "winningTeam" : {
    "Blue": 0,
    "Red": 1
} }, inplace=True)

In [15]:
scaler = StandardScaler()

data_train[numerical_cols] = scaler.fit_transform(data_train[numerical_cols])

for col in character_cols:
    data_train[col] = data_train[col].map(characters_map)

In [16]:
data_train.head()

Unnamed: 0,matchId,gameLength,team1Top,team1Jg,team1Mid,team1Adc,team1Supp,team2Top,team2Jg,team2Mid,...,midGoldDiff,adcGoldDiff,suppGoldDiff,t1TopTowerTaken,t1MidTowerTaken,t1BotTowerTaken,t2TopTowerTaken,t2MidTowerTaken,t2BotTowerTaken,winningTeam
0,NA1_4243514565,0.496586,141,94,144,136,40,21,79,18,...,-0.330228,-0.0394,-1.251065,-0.461591,-0.331878,2.377986,-0.47876,-0.335368,-0.389294,1
1,NA1_4255034369,-1.184683,50,53,58,125,93,9,124,113,...,0.218266,-1.05453,-0.408123,1.680725,-0.331878,2.377986,-0.47876,-0.335368,-0.389294,1
2,NA1_4244786483,-1.639495,10,103,94,102,14,30,29,73,...,-3.543643,-1.869994,-2.74785,-0.461591,2.151602,2.377986,-0.47876,-0.335368,-0.389294,1
3,NA1_4242168068,0.925846,141,18,116,129,35,1,25,64,...,2.03875,-0.99896,-1.657483,-0.461591,-0.331878,-0.381663,1.547223,-0.335368,-0.389294,1
4,NA1_4243090657,-0.223958,20,79,76,16,100,92,68,87,...,0.122937,-0.609966,-0.574642,-0.461591,-0.331878,-0.381663,-0.47876,-0.335368,-0.389294,1


# X & Y

In [17]:
X = data_train.drop(columns=['matchId', 'winningTeam'])
y = data_train['winningTeam']

# Feature selection

In [175]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X, y)
mi_scores_df = pd.DataFrame({"feature": X.columns, "mutual_info": mi_scores})
mi_scores_df = mi_scores_df.sort_values(by="mutual_info", ascending=False)

print(mi_scores_df)

            feature  mutual_info
18      adcGoldDiff     0.072796
16       jgGoldDiff     0.059213
12        t2Dragons     0.052905
17      midGoldDiff     0.050599
11        t1Dragons     0.040699
15      topGoldDiff     0.040042
19     suppGoldDiff     0.033104
20  t1TopTowerTaken     0.030943
25  t2BotTowerTaken     0.026510
24  t2MidTowerTaken     0.017699
23  t2TopTowerTaken     0.014769
7           team2Jg     0.014671
22  t1BotTowerTaken     0.010881
14           t2Rift     0.009897
13           t1Rift     0.009282
21  t1MidTowerTaken     0.009217
10        team2Supp     0.006394
4          team1Adc     0.005713
6          team2Top     0.002668
5         team1Supp     0.001697
9          team2Adc     0.000426
1          team1Top     0.000000
8          team2Mid     0.000000
3          team1Mid     0.000000
2           team1Jg     0.000000
0        gameLength     0.000000


In [176]:
cols_reduced = mi_scores_df['feature'].head(16).tolist()

In [177]:
X_reduced = X[mi_scores_df['feature'].head(16).tolist()]
X_reduced

Unnamed: 0,adcGoldDiff,jgGoldDiff,t2Dragons,midGoldDiff,t1Dragons,topGoldDiff,suppGoldDiff,t1TopTowerTaken,t2BotTowerTaken,t2MidTowerTaken,t2TopTowerTaken,team2Jg,t1BotTowerTaken,t2Rift,t1Rift,t1MidTowerTaken
0,-0.039400,-0.201959,1.769075,-0.330228,-1.048777,-0.607859,-1.251065,-0.461591,-0.389294,-0.335368,-0.478760,79,2.377986,-0.777298,1.179563,-0.331878
1,-1.054530,-2.924979,-1.061445,0.218266,-1.048777,-0.786392,-0.408123,1.680725,-0.389294,-0.335368,-0.478760,124,2.377986,-0.777298,-0.847772,-0.331878
2,-1.869994,1.121022,0.353815,-3.543643,-1.048777,-1.085345,-2.747850,-0.461591,-0.389294,-0.335368,-0.478760,29,2.377986,-0.777298,1.179563,2.151602
3,-0.998960,-0.256892,1.769075,2.038750,-1.048777,0.124846,-1.657483,-0.461591,-0.389294,-0.335368,1.547223,25,-0.381663,-0.777298,1.179563,-0.331878
4,-0.609966,0.760140,0.353815,0.122937,0.392955,-0.235815,-0.574642,-0.461591,-0.389294,-0.335368,-0.478760,68,-0.381663,-0.777298,1.179563,-0.331878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,-1.408630,-0.942797,1.769075,-1.121310,-1.048777,1.453659,-1.308452,-0.461591,-0.389294,-0.335368,1.547223,50,2.377986,1.286507,-0.847772,-0.331878
6496,2.418624,1.314815,-1.061445,-0.218537,0.392955,0.528642,0.577817,-0.461591,2.307237,2.153945,-0.478760,79,-0.381663,-0.777298,1.179563,-0.331878
6497,0.432303,0.505309,-1.061445,0.998677,0.392955,0.536431,0.972005,-0.461591,-0.389294,-0.335368,1.547223,79,-0.381663,-0.777298,1.179563,-0.331878
6498,-0.195126,0.190968,1.769075,-1.065109,-1.048777,-0.292730,0.194918,-0.461591,-0.389294,-0.335368,-0.478760,56,-0.381663,1.286507,-0.847772,2.151602


# ML model

In [178]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss')

param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.05, 0.015]
}

grid_search = GridSearchCV(xgb, param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=3)

grid_search.fit(X_reduced, y)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best score:  0.7515532129911469


In [189]:
xgb_1 = grid_search.best_estimator_

# DL model

In [199]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_reduced.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.1),
    #BatchNormalization(),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

In [200]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [202]:
model.fit(X_reduced, y, epochs=25, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


<keras.callbacks.History at 0x24a021178b0>

# Test

In [180]:
def treat(data_train, numerical_cols=numerical_cols, character_cols=character_cols, scaler=scaler, characters_map=characters_map):

    data_train.replace({ "winningTeam" : {
        "Blue": 0,
        "Red": 1
    } }, inplace=True)

    data_train[numerical_cols] = scaler.transform(data_train[numerical_cols])

    for col in character_cols:
        data_train[col] = data_train[col].map(characters_map)

    data_train = data_train[cols_reduced+['matchId']]

    return data_train

In [181]:
data_test = treat(data_test)

In [182]:
X_test = data_test.drop(columns=['matchId'])

# predictions

In [183]:
ids = data_test['matchId']

In [190]:
predictions_dl = [ proba[0] for proba in model.predict(X_test) ]
predictions_ml = [ proba[1] for proba in xgb_1.predict_proba(X_test)]



In [144]:
def get_predictions(mode, predictions_dl=predictions_dl, predictions_ml=predictions_ml, threshold=0.5):
    if mode == "ml":
        predictions = ["Red" if proba > threshold else "Blue" for proba in predictions_ml]
    elif mode == "dl":
        predictions = ["Red" if proba > threshold else "Blue" for proba in predictions_dl]
    elif mode == "both":
        predictions = []
        for prediction_dl, prediction_ml in zip(predictions_dl, predictions_ml):
            predictions.append("Red" if ( ( prediction_dl + prediction_ml ) / 2 ) > 0.5 else "Blue")
    
    return predictions

In [213]:
columns = {
    'matchId': ids,
    'winningTeam': get_predictions("both")
}

In [214]:
pd.DataFrame(columns).to_csv('./submissions/merge.csv', index=False)