In [9]:
import pandas
import numpy as np
import time
import datetime

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [10]:
# Read the training feature table; rows are addressed by their match_id.
data = pandas.read_csv(
    'features.csv',
    index_col='match_id',
)

# 1-1. Explanation for absent data

In [40]:
# Number of missing values in every column of the training table.
data.isnull().sum()

start_time                         0
lobby_type                         0
r1_hero                            0
r1_level                           0
r1_xp                              0
r1_gold                            0
r1_lh                              0
r1_kills                           0
r1_deaths                          0
r1_items                           0
r2_hero                            0
r2_level                           0
r2_xp                              0
r2_gold                            0
r2_lh                              0
r2_kills                           0
r2_deaths                          0
r2_items                           0
r3_hero                            0
r3_level                           0
r3_xp                              0
r3_gold                            0
r3_lh                              0
r3_kills                           0
r3_deaths                          0
r3_items                           0
r4_hero                            0
r

__first_blood_time, first_blood_team, first_blood_player1, first_blood_player2__  
These values are absent because there was no "First Blood" event within the first 5 minutes of gameplay.  
  
__radiant_first_ward_time, dire_bottle_time, dire_courier_time, dire_flying_courier_time__  
These values are absent because the players did not acquire these items.

# 1-2. Target variable

In [None]:
# Show the first five rows of the training table.
data.head(n=5)

Target variable is: __radiant_win__

In [11]:
# The label to predict is the radiant_win column.
y_train = data.loc[:, 'radiant_win']

# 1-3. GBC model training and validation

In [12]:
# prepare training set: replace missing values with zeros, then remove the
# target and the other columns listed below from the feature matrix
data = data.fillna(0)
excluded_columns = [
    'start_time', 'duration', 'radiant_win',
    'tower_status_radiant', 'tower_status_dire',
    'barracks_status_radiant', 'barracks_status_dire',
]
X_train = data.drop(columns=excluded_columns)

In [6]:
# Shared 5-fold splitter used by every cross-validation run in this notebook.
# With shuffle=True and no random_state the folds are re-drawn on every call,
# so each model / hyper-parameter value would be scored on different splits.
# A fixed seed keeps the folds identical across calls and across kernel
# restarts, which makes the reported scores comparable and reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [46]:
# Cross-validated ROC AUC and wall-clock time of gradient boosting for
# 10, 20, ..., 50 trees.
for n in range(10, 60, 10):
    started_at = datetime.datetime.now()
    # Alternative with default depth / learning rate, kept for reference:
    #   GradientBoostingClassifier(n_estimators=n)
    booster = GradientBoostingClassifier(n_estimators=n, max_depth=1, learning_rate=0.3)
    fold_scores = cross_val_score(booster, X_train, y_train, cv=kf, scoring='roc_auc')
    mean_auc = round(fold_scores.mean(), 2)

    print('N: {}; Score: {}, Time: {}'.format(n, mean_auc, datetime.datetime.now() - started_at))


N: 10; Score: 0.65, Time: 0:00:07.800433
N: 20; Score: 0.68, Time: 0:00:11.602616
N: 30; Score: 0.69, Time: 0:00:17.671740
N: 40; Score: 0.69, Time: 0:00:18.645810
N: 50; Score: 0.7, Time: 0:00:22.322764


Cross validation with __30 estimators__ _without optimisation_ takes __59 seconds__ on current hardware  
ROC AUC score was __0.69__
> N: 30; Score: 0.69, Time: 0:00:59.729096

# 1-4. Recommendations

With 40+ estimators __without optimisation__ the quality continues to grow slowly, but training also takes significantly more time:  
> N: 10; Score: 0.66, Time: 0:00:22.314665  
> N: 20; Score: 0.68, Time: 0:00:38.638311  
> N: 30; Score: 0.69, Time: 0:00:59.729096  
> N: 40; Score: 0.69, Time: 0:01:21.803030  
> N: 50; Score: 0.7, Time: 0:01:31.827706  
  
To reduce learning time we can decrease max_depth and increase the learning rate (max_depth=1, learning_rate=0.3):  
> N: 50; Score: 0.7, Time: 0:00:22.322764

__Optimal parameters:__ n_estimators=50, max_depth=1, learning_rate=0.3

# 2-1. Logistic regression

In [7]:
# normalize data: learn the per-column mean and standard deviation on the
# training features and apply them in a single step
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [63]:
# Sweep the inverse regularisation strength C over 1e-5 ... 1e2 and report the
# cross-validated ROC AUC together with the elapsed time for each value.
for c in 10.0 ** np.arange(-5, 3):
    started_at = datetime.datetime.now()
    logreg = LogisticRegression(C=c, penalty='l2')
    fold_scores = cross_val_score(logreg, X_train_scaled, y_train, cv=kf, scoring='roc_auc')
    mean_auc = round(fold_scores.mean(), 2)

    print('C: {}; Score: {}, Time: {}'.format(c, mean_auc, datetime.datetime.now() - started_at))


C: 1e-05; Score: 0.7, Time: 0:00:02.599329
C: 0.0001; Score: 0.71, Time: 0:00:04.681454
C: 0.001; Score: 0.72, Time: 0:00:06.879551
C: 0.01; Score: 0.72, Time: 0:00:10.063020
C: 0.1; Score: 0.72, Time: 0:00:10.956006
C: 1.0; Score: 0.72, Time: 0:00:11.570537
C: 10.0; Score: 0.72, Time: 0:00:11.331998
C: 100.0; Score: 0.72, Time: 0:00:11.262324


Logistic regression quality stops improving after C = 0.001. The best result is:
> C: 0.001; Score: 0.72, Time: 0:00:07.986668  
  
So, this model has approximately the same quality, but cross-validation is about 3x faster.

# 2-2. Logistic regression without categorical attributes

In [13]:
# Columns treated as categorical in this notebook: the lobby type and the ten
# hero-id columns (r1..r5, d1..d5).
categorical_columns = ['lobby_type'] + [
    '%s%d_hero' % (side, slot) for side in ('r', 'd') for slot in range(1, 6)
]
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
X_train = X_train.drop(columns=categorical_columns, errors='ignore')

# Re-fit the scaler on the reduced feature set and standardise it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [8]:
# Same C sweep as before (1e-5 ... 1e2), now on whatever X_train_scaled
# currently holds; prints mean ROC AUC and elapsed time per value.
for c in 10.0 ** np.arange(-5, 3):
    tic = datetime.datetime.now()
    model = LogisticRegression(C=c, penalty='l2')
    auc_per_fold = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring='roc_auc')
    auc_mean = round(auc_per_fold.mean(), 2)

    print('C: {}; Score: {}, Time: {}'.format(c, auc_mean, datetime.datetime.now() - tic))

  after removing the cwd from sys.path.


C: 1e-05; Score: 0.7, Time: 0:00:02.624111
C: 0.0001; Score: 0.71, Time: 0:00:03.983182
C: 0.001; Score: 0.72, Time: 0:00:06.447332
C: 0.01; Score: 0.72, Time: 0:00:09.481324
C: 0.1; Score: 0.72, Time: 0:00:09.903293
C: 1.0; Score: 0.72, Time: 0:00:09.920212
C: 10.0; Score: 0.72, Time: 0:00:10.133123
C: 100.0; Score: 0.72, Time: 0:00:10.459339


After removing the categorical attributes, cross-validation time improves slightly, but quality does not change:
> C: 0.001; Score: 0.72, Time: 0:00:05.904471  

# 2-3. Heroes

In [48]:
# Gather the ten hero-id columns (r1..r5 followed by d1..d5) and report the
# largest id together with the number of distinct ids that occur.
hero_columns = ['%s%d_hero' % (side, slot) for side in ('r', 'd') for slot in range(1, 6)]
heroes = data[hero_columns].copy()
unique_heroes = np.unique(heroes.values)
print (unique_heroes.max(), unique_heroes.shape)

(112, (108,))


Hero IDs in this game go up to __112__, but only __108__ distinct heroes appear in the current dataset.  
The other 4 heroes may be very unpopular, or may have been blocked from selection.

# 2-4. Bag of words

In [15]:
# "Bag of words" over heroes: column h-1 holds +1 when hero h sits in one of
# the r* slots of a match, -1 when it sits in one of the d* slots, 0 otherwise.
N_HEROES = 112  # largest hero id observed in section 2-3

row_idx = np.arange(data.shape[0])
X_pick = np.zeros((data.shape[0], N_HEROES))

# Vectorised over matches: one fancy-indexed assignment per player slot
# instead of a .loc lookup per (match, slot) pair. The write order per row
# (r1, d1, r2, d2, ...) is the same as in the row-by-row loop.
for p in range(1, 6):
    X_pick[row_idx, data['r%d_hero' % p].values - 1] = 1
    X_pick[row_idx, data['d%d_hero' % p].values - 1] = -1

# Rebuild the numeric part from the fitted scaler rather than appending to
# X_train_scaled itself, so re-running this cell cannot stack the hero columns
# a second time.
X_train_scaled = np.concatenate((X_pick, scaler.transform(X_train)), axis=1)

In [14]:
# C sweep (1e-5 ... 1e2) on the hero-augmented feature matrix; prints mean
# ROC AUC and elapsed time per value.
for c in 10.0 ** np.arange(-5, 3):
    t0 = datetime.datetime.now()
    estimator = LogisticRegression(C=c, penalty='l2')
    cv_auc = cross_val_score(estimator, X_train_scaled, y_train, cv=kf, scoring='roc_auc')
    cv_auc_mean = round(cv_auc.mean(), 2)

    print('C: {}; Score: {}, Time: {}'.format(c, cv_auc_mean, datetime.datetime.now() - t0))

C: 1e-05; Score: 0.7, Time: 0:00:03.268058
C: 0.0001; Score: 0.72, Time: 0:00:04.539747
C: 0.001; Score: 0.75, Time: 0:00:11.135644
C: 0.01; Score: 0.75, Time: 0:00:18.631219
C: 0.1; Score: 0.75, Time: 0:00:22.087059
C: 1.0; Score: 0.75, Time: 0:00:26.460977
C: 10.0; Score: 0.75, Time: 0:00:26.161372
C: 100.0; Score: 0.75, Time: 0:00:26.263226


Usage of "Bag of words" improved quality up to __0.75__

# 2-5. Prediction

In [16]:
# Load the test matches, discard start_time and fill missing values with
# zeros, mirroring the preparation of the training table.
data_test = (
    pandas.read_csv('features_test.csv', index_col='match_id')
    .drop(columns='start_time')
    .fillna(0)
)

In [17]:
# Build the test feature matrix with the same layout as the training one:
# hero bag-of-words columns first, standardised numeric columns after them.
N_HEROES = 112  # must match the width used for the training matrix

test_rows = np.arange(data_test.shape[0])
X_pick = np.zeros((data_test.shape[0], N_HEROES))

# +1 for heroes in r* slots, -1 for heroes in d* slots (vectorised per slot).
for p in range(1, 6):
    X_pick[test_rows, data_test['r%d_hero' % p].values - 1] = 1
    X_pick[test_rows, data_test['d%d_hero' % p].values - 1] = -1

X_test = data_test.drop(labels=['lobby_type','r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                         'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

# Reuse the scaler that was fitted on the training features. Fitting a new
# scaler on the test set would standardise it with different means/variances
# than the ones the model was trained on (and would overwrite the training
# scaler). Assumes X_test has the same columns, in the same order, as the
# frame the scaler was fitted on.
X_test_scaled = scaler.transform(X_test)
X_test_scaled = np.concatenate((X_pick, X_test_scaled),axis=1)

In [19]:
# Final model: L2-regularised logistic regression with C=1e-3, trained on the
# full training matrix.
clf = LogisticRegression(C=1e-3, penalty='l2').fit(X_train_scaled, y_train)
# Class-membership probabilities for the test matches, one column per class.
pred = clf.predict_proba(X_test_scaled)

In [26]:
# Smallest and largest predicted probability in column 1 of predict_proba.
# Both values are rounded to two decimals; the maximum was previously rounded
# to a whole number, which hid its actual value.
positive_proba = pred[:, 1]
print (round(positive_proba.min(), 2), round(positive_proba.max(), 2))


0.01 1.0


Min and max prediction values are: __0.01 1.0__