In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
#1
#loading data
dataTrain = pd.read_csv('./features.csv', index_col='match_id')
dataTest = pd.read_csv('./features_test.csv', index_col='match_id')

#4
#target variable
yTrain = dataTrain['radiant_win']

In [3]:
#deleting features conected to end of match
difference = set(dataTrain.columns.values) - set(dataTest.columns.values)
dataTrain.drop(difference, axis=1, inplace=True)

In [4]:
#2
#checking for missing variables
totalSize = len(dataTrain)
print('Total size = ', totalSize)

print('\n Column |  Number of missing values \n')

for col in dataTrain:
    colSize = dataTrain[col].count()
    if colSize != totalSize:
        print(col, ' | ' , totalSize - colSize)

Total size =  97230

 Column |  Number of missing values 

first_blood_time  |  19553
first_blood_team  |  19553
first_blood_player1  |  19553
first_blood_player2  |  43987
radiant_bottle_time  |  15691
radiant_courier_time  |  692
radiant_flying_courier_time  |  27479
radiant_first_ward_time  |  1836
dire_bottle_time  |  16143
dire_courier_time  |  676
dire_flying_courier_time  |  26098
dire_first_ward_time  |  1826


In [5]:
#3
#replacing missing variables with zeroes
dataTrain.fillna(0, method=None, axis=1, inplace=True)

## Experiment 1: Gradient Boosting ("naive")

In [6]:
#5
#GB trees
kf = KFold(n_splits=5,shuffle=True)

nsTrees = [10, 20, 30, 40, 50, 60, 70]
qualities = []

for nTrees in nsTrees:
    clf = GradientBoostingClassifier(n_estimators=nTrees, random_state=42)
    
    start_time = datetime.datetime.now()
    scores = cross_val_score(clf, dataTrain, yTrain, scoring='roc_auc', cv=kf)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    
    quality = scores.mean() * 100
    qualities.append(quality)
    
    print("Number of trees(n_estimators):", nTrees, 'Quality assessment: ', quality)


Time elapsed: 0:00:44.910946
Number of trees(n_estimators): 10 Quality assessment:  66.3867612978
Time elapsed: 0:01:15.082123
Number of trees(n_estimators): 20 Quality assessment:  68.204986854
Time elapsed: 0:01:44.978186
Number of trees(n_estimators): 30 Quality assessment:  69.017613343
Time elapsed: 0:02:18.634746
Number of trees(n_estimators): 40 Quality assessment:  69.46467938
Time elapsed: 0:02:52.281933
Number of trees(n_estimators): 50 Quality assessment:  69.7650533861
Time elapsed: 0:03:25.689071
Number of trees(n_estimators): 60 Quality assessment:  70.0232288054
Time elapsed: 0:04:02.081581
Number of trees(n_estimators): 70 Quality assessment:  70.2432088261


## Experimant 2: Logistic regression

In [7]:
#1
#LR (naive)
grid = {'C': np.logspace(-3, -1, 10)}

def scoreLR(data):
    #looking for best C
    clf_grid = GridSearchCV(LogisticRegression(random_state=42,n_jobs=-1), grid, cv=kf, n_jobs=1, verbose=1, scoring='roc_auc')
    clf_grid.fit(data, yTrain)
    
    #creating LR with best C
    lr = LogisticRegression(n_jobs=-1,random_state=42,**clf_grid.best_params_)
    lr.fit(data, yTrain)
    
    #Cross Validation
    start_time = datetime.datetime.now()
    scores = cross_val_score(lr, data, yTrain, scoring='roc_auc', cv=kf)
    print('Time elapsed:', datetime.datetime.now() - start_time, end=' ')
    quality = scores.mean()*100
    print("Quality assessment:", quality)
    
print("Without scaling:")
scoreLR(dataTrain)

print("\nWith scaling:")
scoreLR(StandardScaler().fit_transform(dataTrain))

Without scaling:
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   36.0s finished


Time elapsed: 0:00:03.129821 Quality assessment: 51.3444570606

With scaling:
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.4min finished


Time elapsed: 0:00:18.729711 Quality assessment: 71.6366215074


In [25]:
#2
#Removing categoracal features
colsToDrop = ['r%s_hero' % i for i in range(1, 6)] + ['d%s_hero' % i for i in range(1, 6)]
colsToDrop.append('lobby_type')

dataTrainNorm_NoCateg = pd.DataFrame(data=StandardScaler().fit_transform(dataTrain.drop(colsToDrop, axis=1)))

In [28]:
scoreLR(dataTrainNorm_NoCateg)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.1min finished


Time elapsed: 0:00:22.585292 Quality assessment: 71.6590480143


In [39]:
#3
#number of types of heroes
colsToDrop.remove('lobby_type')
N = len(set(dataTrain[colsToDrop].values.flatten()))
print(N)
m = max(set(dataTrain[colsToDrop].values.flatten()))

108


In [30]:
#4
#Bag of words to code info obout type of hero
X_pick = np.zeros((dataTrain.shape[0], m))

for i, match_id in enumerate(dataTrain.index):
    for p in range(5):
        X_pick[i, dataTrain.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, dataTrain.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  import sys


In [37]:
#5
#LR (after BoW)

dataTrainBW = dataTrainNorm_NoCateg.join(pd.DataFrame(X_pick),rsuffix='_',how='inner')
scoreLR(dataTrainBW)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  5.4min finished


Time elapsed: 0:00:42.937456 Quality assessment: 75.1834640228


In [42]:
#6
#Predictions for test data

#looking for best C
clf_grid = GridSearchCV(LogisticRegression(random_state=42,n_jobs=-1), grid, cv=kf, n_jobs=1, verbose=1, scoring='roc_auc')
clf_grid.fit(dataTrainBW, yTrain)
    
#creating LR with best C
lr = LogisticRegression(n_jobs=-1,random_state=42,**clf_grid.best_params_)
lr.fit(dataTrainBW, yTrain)  

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  5.5min finished


LogisticRegression(C=0.059948425031894091, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [65]:
#Preparing testing data
dataTest.fillna(0, method=None, axis=1, inplace=True)

#colsToDrop.append('lobby_type')

dataTestDrop = pd.DataFrame(data=StandardScaler().fit_transform(dataTest.drop(colsToDrop, axis=1)))

X_pick = np.zeros((dataTest.shape[0], m))

for i, match_id in enumerate(dataTest.index):
    for p in range(5):
        X_pick[i, dataTest.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, dataTest.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

dataTestBW = dataTestDrop.join(pd.DataFrame(X_pick),rsuffix='_',how='inner')

#testing
yPred = lr.predict_proba(dataTestBW)

print("Max:" , yPred.max(), "Min:", yPred.min())

Max: 0.996420513116 Min: 0.00357948688392
