In [99]:
import numpy as np
import pandas
data = pandas.read_csv('./features.csv',index_col='match_id')
cols = [col for col in data.columns if col not in ['duration', 'radiant_win','tower_status_radiant','tower_status_dire',
                                                   'barracks_status_radiant','barracks_status_dire']]
X_train = data[cols]
y_train = data['radiant_win']

In [100]:
#находим признаки с пропусками и заполняем их нулями
c_total=len(X_train)
for s in list(X_train.columns.values):
    if X_train[s].count()<c_total:
        print (s)
        X_train[s]=X_train[s].fillna(0)

first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [101]:
#градиентный бустинг с 30 деревьями
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import time
import datetime

kf=KFold(len(X_train), n_folds=5, shuffle=True)
clf = GradientBoostingClassifier(n_estimators=30)
scores=[]
start_time = datetime.datetime.now()
for train, test in kf:
    clf.fit(X_train.iloc[train],y_train.iloc[train])
    pred = clf.predict_proba(X_train.iloc[test])[:, 1]
    scores.append(roc_auc_score(y_train.iloc[test],pred))
print ('Time elapsed:', datetime.datetime.now() - start_time)
print ('Roc_auc_score:',np.mean(scores))

Time elapsed: 0:02:06.695564
Roc_auc_score: 0.689317616006


In [102]:
#тесты для 10 и 20 деревьев
clf = GradientBoostingClassifier(n_estimators=10)
scores=[]
start_time = datetime.datetime.now()
for train, test in kf:
    clf.fit(X_train.iloc[train],y_train.iloc[train])
    pred = clf.predict_proba(X_train.iloc[test])[:, 1]
    scores.append(roc_auc_score(y_train.iloc[test],pred))
print ('Time elapsed 10:', datetime.datetime.now() - start_time)
print ('Roc_auc_score 10:',np.mean(scores))

clf = GradientBoostingClassifier(n_estimators=20)
scores=[]
start_time = datetime.datetime.now()
for train, test in kf:
    clf.fit(X_train.iloc[train],y_train.iloc[train])
    pred = clf.predict_proba(X_train.iloc[test])[:, 1]
    scores.append(roc_auc_score(y_train.iloc[test],pred))
print ('Time elapsed 20:', datetime.datetime.now() - start_time)
print ('Roc_auc_score 20:',np.mean(scores))

Time elapsed 10: 0:00:43.149494
Roc_auc_score 10: 0.663843012008
Time elapsed 20: 0:01:24.645728
Roc_auc_score 20: 0.681949655856


In [103]:
#тесты для большего числа деревьев
for n in [40,50,70]:
    clf = GradientBoostingClassifier(n_estimators=n)
    scores=[]
    start_time = datetime.datetime.now()
    for train, test in kf:
        clf.fit(X_train.iloc[train],y_train.iloc[train])
        pred = clf.predict_proba(X_train.iloc[test])[:, 1]
        scores.append(roc_auc_score(y_train.iloc[test],pred))
    print ('Time elapsed',n,': ', datetime.datetime.now() - start_time)
    print ('Roc_auc_score',n,': ',np.mean(scores))

Time elapsed 40 :  0:02:51.870963
Roc_auc_score 40 :  0.694169927929
Time elapsed 50 :  0:03:36.205800
Roc_auc_score 50 :  0.69720137533
Time elapsed 70 :  0:05:02.147261
Roc_auc_score 70 :  0.701689204291


In [62]:
#масштабирование признаков
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tr_scaled=scaler.fit_transform(X_train)

In [66]:
#логистическая регрессия с поиском оптимального параметра С
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
grid = {'C': np.power(10.0, np.arange(-5, 5))}
lr = LogisticRegression(penalty='l2')
gs = GridSearchCV(lr, grid, scoring='roc_auc', cv=kf)
gs.fit(X_tr_scaled, y_train)
print ('C: ',gs.best_params_['C'])
print ('Roc_auc_score: ',gs.best_score_)

C:  0.01
Roc_auc_score:  0.716349032459


In [67]:
#для сравнения времени работы с градиентным бустингом
lr = LogisticRegression(penalty='l2',C=0.01)
scores=[]
start_time = datetime.datetime.now()
for train, test in kf:
    lr.fit(X_tr_scaled[train],y_train.iloc[train])
    pred = lr.predict_proba(X_tr_scaled[test])[:, 1]
    scores.append(roc_auc_score(y_train.iloc[test],pred))
print ('Time elapsed:', datetime.datetime.now() - start_time)
print ('Roc_auc_score:',np.mean(scores))

Time elapsed: 0:00:10.915144
Roc_auc_score: 0.716349034577


In [68]:
#убираем категориальные признаки
cols = [col for col in X_train.columns if col not in ['lobby_type','r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                                                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
X_train_new = X_train[cols]

In [70]:
#заново масштабируем
X_train_new_scaled=scaler.fit_transform(X_train_new)

In [71]:
#логистическая регрессия с поиском оптимального параметра С для новой выборки
grid = {'C': np.power(10.0, np.arange(-5, 5))}
lr = LogisticRegression(penalty='l2')
gs = GridSearchCV(lr, grid, scoring='roc_auc', cv=kf)
gs.fit(X_train_new_scaled, y_train)
print ('C: ',gs.best_params_['C'])
print ('Roc_auc_score: ',gs.best_score_)

C:  0.01
Roc_auc_score:  0.716362219745


In [74]:
#считаем число различных героев
heroes=data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].values
print (len(np.unique(heroes.reshape(-1))))

108


In [90]:
#мешок слов по героям
N=max(np.unique(heroes))
X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
X_h = pandas.DataFrame(np.hstack((X_train_new.values, X_pick)))

In [84]:
#заново масштабируем
X_h_scaled=scaler.fit_transform(X_h)

In [85]:
#логистическая регрессия с поиском оптимального параметра С для выборки с героями
grid = {'C': np.power(10.0, np.arange(-5, 5))}
lr = LogisticRegression(penalty='l2')
gs = GridSearchCV(lr, grid, scoring='roc_auc', cv=kf)
gs.fit(X_h_scaled, y_train)
print ('C: ',gs.best_params_['C'])
print ('Roc_auc_score: ',gs.best_score_)

C:  0.01
Roc_auc_score:  0.751907874923


In [94]:
#загружаем тестовую выборку и трансформируем
test=pandas.read_csv('./features_test.csv',index_col='match_id')
c_total=len(test)
for s in list(test.columns.values):
    if test[s].count()<c_total:
        test[s]=test[s].fillna(0)
cols = [col for col in test.columns if col not in ['lobby_type','r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                                                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
test_new = test[cols]

test_pick = np.zeros((test.shape[0], N))

for i, match_id in enumerate(test.index):
    for p in range(5):
        test_pick[i, test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        test_pick[i, test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
X_test = pandas.DataFrame(np.hstack((test_new.values, test_pick)))
X_test_scaled=scaler.transform(X_test)

In [98]:
#строим прогноз для тестовой выборки по последнему алгоритму, т.к. у него максимальное качество из всех
print ('Prediction: ',gs.predict_proba(X_test_scaled)[:, 1])
print ('Proba min: ',min(gs.predict_proba(X_test_scaled)[:, 1]))
print ('Proba max: ',max(gs.predict_proba(X_test_scaled)[:, 1]))

Prediction:  [ 0.82270839  0.75210182  0.18893925 ...,  0.23787704  0.62823222
  0.42755372]
Proba min:  0.00849095194724
Proba max:  0.996277624036
