In [157]:
import pandas as pd
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
import numpy as np
from random import randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from pprint import pprint

rawDataTrain = pd.read_csv("Downloads/features.csv", index_col="match_id")
rawDataTest = pd.read_csv("Downloads/features_test.csv", index_col="match_id")

trainX = rawDataTrain.ix[:,:"dire_first_ward_time"]
trainy = rawDataTrain.ix[:,"radiant_win"]

columnsWithMissingData = trainX.columns[trainX.count() != 97230] #magic number antipattern don't do that, I'm a bad person.
print(columnsWithMissingData)
trainX.fillna(0, inplace=True)

cv = KFold(n=len(trainX), n_folds=5, shuffle=True)
gbc = GradientBoostingClassifier(verbose=True)
grid = {"n_estimators": [10, 20, 30]}
gs = GridSearchCV(estimator = gbc, param_grid = grid, cv = cv, scoring = "roc_auc", n_jobs=-1) #n_jobs sets number of threads. 
                                                                                            #Verbose=True doesn't work when 
                                                                                            #n_jobs != 1 for obvious reasons.
start_time = datetime.datetime.now()
gs.fit(trainX, trainy)
print('Time elapsed:', datetime.datetime.now() - start_time)
pprint(gs.grid_scores_)

gbc = GradientBoostingClassifier(n_estimators=30)
gbc.fit(trainX, trainy)
pred = gbc.predict_proba(trainX)[:, 1]
print(roc_auc_score(y_true = trainy, y_score = pred))

sc = StandardScaler()
trainX_scaled = pd.DataFrame(sc.fit_transform(trainX))
trainX_scaled.columns = trainX.columns

lrm = LogisticRegression(penalty = "l2")
grid = {'C': np.power(10.0, np.arange(-5, 6))}
gs = GridSearchCV(estimator = lrm, param_grid = grid, cv = cv, scoring = "roc_auc", n_jobs=-1)

gs.fit(trainX_scaled, trainy)
print("scaled")
pprint(gs.grid_scores_)


cols = [col for col in trainX_scaled.columns if col not in ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
trainX_scaled_Removed = trainX_scaled[cols]

lrm = LogisticRegression(penalty = "l2")
gs.fit(trainX_scaled_Removed, trainy)
print("removed")
pprint(gs.grid_scores_)

allHeroes = []
allHeroes = trainX.r1_hero.unique()
allHeroes = np.concatenate((allHeroes, trainX.r2_hero.unique(), trainX.r3_hero.unique(), trainX.r4_hero.unique(), 
trainX.r5_hero.unique(), trainX.d1_hero.unique(), trainX.d2_hero.unique(), trainX.d3_hero.unique(), trainX.d4_hero.unique(), 
trainX.d5_hero.unique())) #I am a python master

costylDF = pd.DataFrame(allHeroes)

costylColumn = costylDF[0]
uniqueHeroes = np.sort(costylColumn.unique())
print("number of unique heroes", len(uniqueHeroes))

X_pick = np.zeros((trainX.shape[0], np.max(uniqueHeroes)))

for i, match_id in enumerate(trainX.index):
    for p in [0, 1, 2, 3, 4]:
        X_pick[i, trainX.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, trainX.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
trainX_scaled_meshok_slov_dobavled = np.hstack((trainX_scaled_Removed, X_pick))

lrm = LogisticRegression(penalty="l2")

gs.fit(trainX_scaled_meshok_slov_dobavled, trainy)
print("word bag")
pprint(gs.grid_scores_)

testX = rawDataTest.fillna(0)
testX_scaled = pd.DataFrame(sc.transform(testX))
testX_scaled.columns = rawDataTest.columns
cols = [col for col in testX_scaled.columns if col not in ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
testX_scaled_Removed = testX_scaled[cols]

X_pick = np.zeros((testX.shape[0], np.max(uniqueHeroes)))

for i, match_id in enumerate(testX.index):
    for p in [0, 1, 2, 3, 4]:
        X_pick[i, testX.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, testX.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
testX_scaled_with_wordbag = np.hstack((testX_scaled_Removed, X_pick))
y_pred = gs.predict_proba(testX_scaled_with_wordbag)
print(np.min(y_pred), np.max(y_pred))

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')
      Iter       Train Loss   Remaining Time 
         1           1.3786           29.36s
         2           1.3732           28.94s
         3           1.3681           27.63s
         4           1.3636           26.54s
         5           1.3589           25.20s
         6           1.3547           24.27s
         7           1.3502           23.18s
         8           1.3461           21.97s
         9           1.3422           20.88s
        10           1.3385           19.91s
        20           1.3092           10.03s
        30           1.2892            0.00s
Time elapsed: 0:02:20.638795
[mean: 0.66471, std: 0.00471, params: {'n_e

0.00362911441158 0.996370885588


In [158]:
print("number of unique heroes", len(uniqueHeroes))

number of unique heroes 108
