In [56]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.special import comb

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

import timeit

### Creating random forest model using only team hero picks

In [3]:
# Load the hero-pick indicator table, indexed by match_id.
# Per the recorded .info() output it holds a boolean radiant_win target plus
# integer radiant_pick__*/dire_pick__* columns.
team_picks_df = pd.read_csv('data/hero_pick_dummy_df.csv', index_col='match_id')

In [4]:
# Summarise row count, column count and dtypes of the picks-only frame.
team_picks_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3210 entries, 4225872520 to 3916525126
Columns: 233 entries, radiant_win to dire_pick__121
dtypes: bool(1), int64(232)
memory usage: 5.7 MB


In [5]:
# Display-only preview of the feature matrix (all columns except the
# radiant_win target); the result is not assigned, so team_picks_df is unchanged.
team_picks_df.drop(columns='radiant_win')

Unnamed: 0_level_0,radiant_pick__1,radiant_pick__2,radiant_pick__3,radiant_pick__4,radiant_pick__5,radiant_pick__6,radiant_pick__7,radiant_pick__8,radiant_pick__9,radiant_pick__10,...,dire_pick__108,dire_pick__109,dire_pick__110,dire_pick__111,dire_pick__112,dire_pick__113,dire_pick__114,dire_pick__119,dire_pick__120,dire_pick__121
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4225872520,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4225756508,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4225649985,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4225546373,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4225544531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4225454337,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4225316847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4225246100,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4224306491,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4224201762,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0


In [6]:
# Display-only preview of the target; the double brackets return a one-column DataFrame.
team_picks_df[['radiant_win']]

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
4225872520,True
4225756508,False
4225649985,False
4225546373,True
4225544531,True
4225454337,False
4225316847,True
4225246100,True
4224306491,False
4224201762,True


In [7]:
# Hold out 20% of the matches for evaluation. The seed is fixed so the same
# rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    team_picks_df.drop(columns='radiant_win'),  # features: every pick column
    team_picks_df.loc[:, 'radiant_win'],        # target: did radiant win?
    test_size=0.2,
    random_state=8675309,
)

In [8]:
# Picks-only forest: 10,000 trees, max_features=0.5 (a fraction of the feature
# count considered per split), fixed seed.
# NOTE(review): the recorded fit output shows n_jobs=1; with this many trees,
# n_jobs=-1 would be worth considering.
rf = RandomForestClassifier(
    n_estimators=10000,
    max_features=0.5,
    random_state=8675309,
)

In [37]:
# Train on the training split; the cell output is the repr of the fitted estimator.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=8675309, verbose=0,
            warm_start=False)

In [38]:
# Evaluate the picks-only forest on the held-out split.
# predict_proba walks all 10,000 trees, so compute it once and reuse the result
# instead of calling it again inside log_loss. Column 1 holds the probability of
# the second class (radiant_win == True for a boolean target).
pred = rf.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred[:, 1])}")
print(f"accuracy = {rf.score(X_test, y_test)}")

log loss = 0.6947572007591879
accuracy = 0.5358255451713395


In [9]:
# Second picks-only forest: identical to rf except that 80% of the features are
# considered at each split.
rf2 = RandomForestClassifier(
    n_estimators=10000,
    max_features=0.8,
    random_state=8675309,
)

In [40]:
# Train the max_features=0.8 variant on the same training split.
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=8675309, verbose=0,
            warm_start=False)

In [41]:
# Evaluate the max_features=0.8 forest on the held-out split.
# The class probabilities are computed once and reused for the log loss, which
# avoids a second pass over all 10,000 trees.
pred = rf2.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred[:, 1])}")
print(f"accuracy = {rf2.score(X_test, y_test)}")

log loss = 0.6965435907430655
accuracy = 0.5373831775700935


### Creating random forest model using team hero picks and team category composition

In [10]:
# Load the match table that, per the recorded .info() output, adds 28 float
# team-composition columns to the hero-pick indicators. Indexed by match_id.
team_picks_comp_df = pd.read_csv('data/full_match_df.csv', index_col='match_id')

In [11]:
# Summarise row count, column count and dtypes of the picks + composition frame.
team_picks_comp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3210 entries, 4225872520 to 3916525126
Columns: 261 entries, radiant_win to dire_heroes__Support
dtypes: bool(1), float64(28), int64(232)
memory usage: 6.4 MB


In [12]:
# Display-only preview of the feature columns (everything except radiant_win);
# nothing is assigned, so the frame itself is unchanged.
team_picks_comp_df.drop(columns='radiant_win')

Unnamed: 0_level_0,radiant_pick__1,radiant_pick__2,radiant_pick__3,radiant_pick__4,radiant_pick__5,radiant_pick__6,radiant_pick__7,radiant_pick__8,radiant_pick__9,radiant_pick__10,...,dire_heroes__str,dire_heroes__Carry,dire_heroes__Disabler,dire_heroes__Durable,dire_heroes__Escape,dire_heroes__Initiator,dire_heroes__Jungler,dire_heroes__Nuker,dire_heroes__Pusher,dire_heroes__Support
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4225872520,0,0,0,0,1,0,0,0,0,0,...,2.0,3.0,3.0,2.0,2.0,2.0,0.0,5.0,1.0,2.0
4225756508,0,0,0,0,0,0,0,0,0,1,...,1.0,3.0,4.0,1.0,0.0,3.0,0.0,5.0,3.0,2.0
4225649985,0,1,0,0,0,0,0,0,0,0,...,0.0,2.0,3.0,0.0,2.0,1.0,0.0,4.0,1.0,2.0
4225546373,0,0,0,0,0,0,0,0,0,0,...,2.0,3.0,5.0,1.0,3.0,3.0,1.0,4.0,3.0,1.0
4225544531,0,0,0,0,0,0,0,0,0,0,...,3.0,4.0,3.0,3.0,2.0,3.0,0.0,3.0,0.0,1.0
4225454337,0,0,0,0,0,0,0,0,0,0,...,3.0,2.0,5.0,2.0,3.0,5.0,1.0,4.0,1.0,1.0
4225316847,0,0,0,0,0,0,0,0,0,0,...,1.0,3.0,4.0,2.0,2.0,3.0,1.0,3.0,2.0,2.0
4225246100,0,0,1,0,0,0,0,0,0,0,...,2.0,3.0,4.0,2.0,0.0,3.0,1.0,4.0,2.0,2.0
4224306491,0,0,0,0,0,0,1,0,0,0,...,1.0,3.0,4.0,1.0,3.0,3.0,0.0,5.0,0.0,2.0
4224201762,0,0,0,0,0,0,0,0,0,1,...,0.0,3.0,4.0,2.0,2.0,2.0,0.0,5.0,0.0,2.0


In [13]:
# Display-only preview of the target as a one-column DataFrame.
team_picks_comp_df[['radiant_win']]

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
4225872520,True
4225756508,False
4225649985,False
4225546373,True
4225544531,True
4225454337,False
4225316847,True
4225246100,True
4224306491,False
4224201762,True


In [14]:
# Re-split using the picks + composition features. This rebinds X_train, X_test,
# y_train and y_test; the proportions and seed match the picks-only split.
X_train, X_test, y_train, y_test = train_test_split(
    team_picks_comp_df.drop(columns='radiant_win'),  # features
    team_picks_comp_df.loc[:, 'radiant_win'],        # target
    test_size=0.2,
    random_state=8675309,
)

In [34]:
# Forest for the picks + composition features. It rebinds `rf` and now uses the
# entropy split criterion; 10,000 trees, max_features=0.5, fixed seed.
rf = RandomForestClassifier(
    n_estimators=10000,
    criterion='entropy',
    max_features=0.5,
    random_state=8675309,
)

In [35]:
# Train on the picks + composition training split; the output is the fitted estimator's repr.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=8675309, verbose=0,
            warm_start=False)

In [79]:
# Evaluate the picks + composition forest on the held-out split.
# pred_rf is kept because the ensemble-averaging cell further down reads it.
# It is reused here for the log loss so predict_proba runs only once.
pred_rf = rf.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred_rf[:, 1])}")
print(f"accuracy = {rf.score(X_test, y_test)}")

log loss = 0.687723669537263
accuracy = 0.5654205607476636


In [43]:
# Entropy-criterion forest for the picks + composition features, with 80% of the
# features considered per split. Rebinds `rf2`.
rf2 = RandomForestClassifier(
    n_estimators=10000,
    criterion='entropy',
    max_features=0.8,
    random_state=8675309,
)

In [44]:
# Train the max_features=0.8 entropy forest on the picks + composition training split.
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=8675309, verbose=0,
            warm_start=False)

In [80]:
# Evaluate the max_features=0.8 entropy forest on the held-out split.
# The probabilities are computed once and reused for the log loss.
pred = rf2.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred[:, 1])}")
print(f"accuracy = {rf2.score(X_test, y_test)}")

log loss = 0.6869018453318196
accuracy = 0.5654205607476636


In [81]:
# Keep rf2's class probabilities under their own name and display them.
# Each row is one test match; the columns follow the estimator's class order.
# NOTE(review): pred_rf2 is not read again in the visible cells — the averaging
# step below uses pred_rf; confirm which model was intended.
pred_rf2 = rf2.predict_proba(X_test)
pred_rf2

array([[0.4855, 0.5145],
       [0.3689, 0.6311],
       [0.5813, 0.4187],
       ...,
       [0.5469, 0.4531],
       [0.3448, 0.6552],
       [0.4043, 0.5957]])

### Creating random forest model using team hero picks and bans and team category composition

In [107]:
# Load the match table intended to hold picks, bans and team composition.
# NOTE(review): this reads the same path as team_picks_comp_df, yet the recorded
# .info() outputs differ (261 columns there, 493 here). The CSV was presumably
# regenerated between runs, so a fresh Restart & Run All would make the two
# frames identical. Confirm the intended picks+bans file and give it its own path.
team_picks_bans_comp_df = pd.read_csv('data/full_match_df.csv', index_col='match_id')

In [108]:
# Summarise row count, column count and dtypes of the picks + bans + composition frame.
team_picks_bans_comp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3210 entries, 4225872520 to 3916525126
Columns: 493 entries, radiant_win to dire_heroes__Support
dtypes: bool(1), float64(28), int64(464)
memory usage: 12.1 MB


In [109]:
# Display-only preview of the feature columns (everything except radiant_win).
team_picks_bans_comp_df.drop(columns='radiant_win')

Unnamed: 0_level_0,radiant_pick__1,radiant_pick__2,radiant_pick__3,radiant_pick__4,radiant_pick__5,radiant_pick__6,radiant_pick__7,radiant_pick__8,radiant_pick__9,radiant_pick__10,...,dire_heroes__str,dire_heroes__Carry,dire_heroes__Disabler,dire_heroes__Durable,dire_heroes__Escape,dire_heroes__Initiator,dire_heroes__Jungler,dire_heroes__Nuker,dire_heroes__Pusher,dire_heroes__Support
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4225872520,0,0,0,0,1,0,0,0,0,0,...,2.0,3.0,3.0,2.0,2.0,2.0,0.0,5.0,1.0,2.0
4225756508,0,0,0,0,0,0,0,0,0,1,...,1.0,3.0,4.0,1.0,0.0,3.0,0.0,5.0,3.0,2.0
4225649985,0,1,0,0,0,0,0,0,0,0,...,0.0,2.0,3.0,0.0,2.0,1.0,0.0,4.0,1.0,2.0
4225546373,0,0,0,0,0,0,0,0,0,0,...,2.0,3.0,5.0,1.0,3.0,3.0,1.0,4.0,3.0,1.0
4225544531,0,0,0,0,0,0,0,0,0,0,...,3.0,4.0,3.0,3.0,2.0,3.0,0.0,3.0,0.0,1.0
4225454337,0,0,0,0,0,0,0,0,0,0,...,3.0,2.0,5.0,2.0,3.0,5.0,1.0,4.0,1.0,1.0
4225316847,0,0,0,0,0,0,0,0,0,0,...,1.0,3.0,4.0,2.0,2.0,3.0,1.0,3.0,2.0,2.0
4225246100,0,0,1,0,0,0,0,0,0,0,...,2.0,3.0,4.0,2.0,0.0,3.0,1.0,4.0,2.0,2.0
4224306491,0,0,0,0,0,0,1,0,0,0,...,1.0,3.0,4.0,1.0,3.0,3.0,0.0,5.0,0.0,2.0
4224201762,0,0,0,0,0,0,0,0,0,1,...,0.0,3.0,4.0,2.0,2.0,2.0,0.0,5.0,0.0,2.0


In [110]:
# Display-only preview of the target as a one-column DataFrame.
team_picks_bans_comp_df[['radiant_win']]

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
4225872520,True
4225756508,False
4225649985,False
4225546373,True
4225544531,True
4225454337,False
4225316847,True
4225246100,True
4224306491,False
4224201762,True


In [111]:
# Re-split using the picks + bans + composition features. This rebinds X_train,
# X_test, y_train and y_test with the same 80/20 proportions and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    team_picks_bans_comp_df.drop(columns='radiant_win'),  # features
    team_picks_bans_comp_df.loc[:, 'radiant_win'],        # target
    test_size=0.2,
    random_state=8675309,
)

In [122]:
# Entropy-criterion forest for the picks + bans + composition features:
# 10,000 trees, max_features=0.5, fixed seed.
rf3 = RandomForestClassifier(
    n_estimators=10000,
    criterion='entropy',
    max_features=0.5,
    random_state=8675309,
)

In [123]:
# Train on the picks + bans + composition training split; the output is the estimator's repr.
rf3.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=8675309, verbose=0,
            warm_start=False)

In [124]:
# Evaluate the picks + bans + composition forest on the held-out split.
# pred_rf3 is reused for the log loss so the 10,000-tree predict_proba pass
# runs only once.
pred_rf3 = rf3.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred_rf3[:, 1])}")
print(f"accuracy = {rf3.score(X_test, y_test)}")

log loss = 0.6766049683660874
accuracy = 0.5685358255451713


## Use Logistic Regression with the Dataset

In [67]:
# Names of the 28 team-composition columns: the same 14 suffixes for each side,
# all radiant columns first and then all dire columns.
team_comp_cols = [
    f'{side}_heroes__{suffix}'
    for side in ('radiant', 'dire')
    for suffix in (
        'Melee', 'Ranged',
        'agi', 'int', 'str',
        'Carry', 'Disabler', 'Durable', 'Escape', 'Initiator',
        'Jungler', 'Nuker', 'Pusher', 'Support',
    )
]

In [73]:
# Rebuild the picks + composition split for the logistic regression. The frame,
# proportions and seed are the same as for the entropy forests, so the test rows
# line up with pred_rf.
X_train, X_test, y_train, y_test = train_test_split(
    team_picks_comp_df.drop(columns='radiant_win'),  # features
    team_picks_comp_df.loc[:, 'radiant_win'],        # target
    test_size=0.2,
    random_state=8675309,
)

In [76]:
# Logistic regression with C=0.1 and a fixed seed, trained on the picks +
# composition split. fit() returns the estimator, so construction and training
# are chained and the fitted model is echoed as the cell output.
model = LogisticRegression(C=0.1, random_state=8675309).fit(X_train, y_train)
model

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=8675309, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [82]:
# Evaluate the logistic regression on the held-out split.
# pred_model is kept for the ensemble-averaging cell below and reused here, so
# predict_proba is called only once.
pred_model = model.predict_proba(X_test)
print(f"log loss = {log_loss(y_test, pred_model[:, 1])}")
print(f"accuracy = {model.score(X_test, y_test)}")

log loss = 0.6814289713771959
accuracy = 0.5607476635514018


In [83]:
# Show the logistic regression's class probabilities (one row per test match).
pred_model

array([[0.59254891, 0.40745109],
       [0.40685627, 0.59314373],
       [0.5319692 , 0.4680308 ],
       ...,
       [0.72207392, 0.27792608],
       [0.25999096, 0.74000904],
       [0.39116148, 0.60883852]])

In [95]:
# Soft-voting ensemble: the unweighted mean of the positive-class probabilities
# from the random forest (pred_rf) and the logistic regression (pred_model).
# Both arrays must come from the same X_test rows — they do here, because both
# splits use team_picks_comp_df with the same seed.
# Vectorised arithmetic replaces the zip -> list -> array round-trip.
pred_avg = (pred_rf[:, 1] + pred_model[:, 1]) / 2

In [98]:
# Sanity check: one averaged probability per test match.
len(pred_avg)

642

In [105]:
# Share of radiant wins in the training split (the mean of a boolean target).
np.mean(y_train)

0.5338785046728972

In [106]:
# Share of radiant wins in the test split, i.e. the accuracy of always
# predicting a radiant win.
np.mean(y_test)

0.5373831775700935

In [103]:
# Accuracy of the averaged ensemble: predict a radiant win when the mean
# probability exceeds 0.5, then compare with the boolean target.
# For probabilities in [0, 1] this matches rounding with np.rint (which sends
# exactly 0.5 to 0). It states the threshold explicitly and avoids both the
# bool-vs-float comparison and a Python-level sum over the Series.
np.mean(y_test == (pred_avg > 0.5))

0.5560747663551402