In [32]:
import pandas as pd
import numpy as np
import time 
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
from sklearn.cross_validation import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## 1. Load data

In [34]:
df = pd.read_csv('features.csv')
df_test = pd.read_csv('features_test.csv')

In [35]:
print('shape of data', df.shape)
print('shape of test data', df_test.shape)

shape of data (97230, 109)
shape of test data (17177, 103)


Выделим целевую переменную

In [36]:
y = df['radiant_win']

## 2. Preprocessing

#### Удаление признаков из выбоки
Которые не влияют на выигрыш

In [6]:
del df['start_time']
del df['match_id']
del df_test['start_time']
del df_test['match_id']

Которые отсутствуют в тестовых данных


In [7]:
df.columns

Index(['lobby_type', 'r1_hero', 'r1_level', 'r1_xp', 'r1_gold', 'r1_lh',
       'r1_kills', 'r1_deaths', 'r1_items', 'r2_hero',
       ...
       'dire_boots_count', 'dire_ward_observer_count',
       'dire_ward_sentry_count', 'dire_first_ward_time', 'duration',
       'radiant_win', 'tower_status_radiant', 'tower_status_dire',
       'barracks_status_radiant', 'barracks_status_dire'],
      dtype='object', length=107)

In [8]:
df = df.drop(['duration', 'tower_status_radiant', 'tower_status_dire', 
              'barracks_status_radiant', 'barracks_status_dire', 
              'radiant_win'], axis=1)

#### Работаем с NaN признаками
Вывод NaN призаков

In [9]:
features_nan = []
for column in df.columns:
    if df[column].count() != len(df):
        features_nan.append(column)

In [10]:
features_nan

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

**Пояснение:** если не было убийства, то нет и команды (соответсвенно, игроков), его совершившего. Остальные признаки означают отсутсвие покупки bottle, courier, flying_courier и установки первого "наблюдателя" за первые 5 минут матча.

Заполним все значения NaN одним из значений: **0**, **1**, **median** или **mean**:
1. **'first_blood_time'** - median
2. **'first_blood_team'** - 1, если в этой команде больше убийств, 0 - в других случаях
3. **'first_blood_player1'** - median
4. **'first_blood_player2'** - median
5. **'radiant_bottle_time'** - median
6. **'radiant_courier_time'** - median
7. **'radiant_flying_courier_time'** - median
8. **'radiant_first_ward_time'** - mean
9. **'dire_bottle_time'** - median
10. **'dire_courier_time'** - median
11. **'dire_flying_courier_time'** - median
12. **'dire_first_ward_time'** - mean

In [11]:
# выводим list кол-ва убийств обеих команд
# для сравнения числа убийств Radiand и Dire
def get_kills(df):
    d_kills = []
    r_kills = []
    for i in range(df.shape[0]):
        r_kills.append(sum(np.array((df.r1_kills[i], df.r2_kills[i], df.r3_kills[i], 
                                     df.r4_kills[i], df.r5_kills[i]))))
        d_kills.append(sum(np.array((df.d1_kills[i], df.d2_kills[i], df.d3_kills[i], 
                                     df.d4_kills[i], df.d5_kills[i]))))
    kills = list(zip(r_kills, d_kills))
    return kills

# выводим list опыта обеих команд
# для сравнения опыта Radiand и Dire
def get_xp(df):
    d_xp = []
    r_xp = []
    for i in range(df.shape[0]):
        r_xp.append(sum(np.array((df.r1_xp[i], df.r2_xp[i], df.r3_xp[i], 
                                  df.r4_xp[i], df.r5_xp[i]))))
        d_xp.append(sum(np.array((df.d1_xp[i], df.d2_xp[i], df.d3_xp[i], 
                                  df.d4_xp[i], df.d5_xp[i]))))
    xp = list(zip(r_xp, d_xp))
    return xp

In [12]:
kills = get_kills(df)

In [13]:
def change_nan(df):
    # medians
    df['first_blood_time'] = df['first_blood_time'].fillna(np.median(df[column].dropna()))
    df['first_blood_player1'] = df['first_blood_player1'].fillna(np.median(df[column].dropna()))
    df['first_blood_player2'] = df['first_blood_player2'].fillna(np.median(df[column].dropna()))
    df['radiant_bottle_time'] = df['radiant_bottle_time'].fillna(np.median(df[column].dropna()))
    df['radiant_courier_time'] = df['radiant_courier_time'].fillna(np.median(df[column].dropna()))
    df['radiant_flying_courier_time'] = df['radiant_flying_courier_time'].fillna(np.median(df[column].dropna()))
    df['dire_bottle_time'] = df['dire_bottle_time'].fillna(np.median(df[column].dropna()))
    df['dire_courier_time'] = df['dire_courier_time'].fillna(np.median(df[column].dropna()))
    df['dire_flying_courier_time'] = df['dire_flying_courier_time'].fillna(np.median(df[column].dropna()))
    # means
    df['radiant_first_ward_time'] = df['radiant_first_ward_time'].fillna(np.mean(df[column].dropna()))
    df['dire_first_ward_time'] = df['dire_first_ward_time'].fillna(np.mean(df[column].dropna())) 
    # 0 or 1
    for i in range(df.shape[0]):
        if kills[i][0] > kills[i][1]: # убийств Radiant больше
            df['first_blood_team'] = df['first_blood_team'].fillna(value=1) # пишем 1
        else: 
            df['first_blood_team'] = df['first_blood_team'].fillna(value=0) # пишем 0

In [14]:
change_nan(df)

Добавление новых признаков (?)

## 3. Classification pipeline

In [15]:
X = df

Разделим выборку

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

**StratifiedKFold**: зафиксируем генератор разбиений для кросс-валидации c n_folds=5:

In [17]:
skf = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=0)
# kf = KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=0)

## 3.1 Gradient Boosting "head-on"

In [None]:
# clf_gb = GradientBoostingClassifier(random_state=0)
# for n in [10, 20, 30]:
#     starttime = datetime.datetime.now()
#     clf_gb.n_estimators = n
#     scores = cross_val_score(clf_gb, X_train.values, y_train.values, cv=skf, scoring='roc_auc')
#     print(n, 'estimators')
#     print('time:', datetime.datetime.now() - starttime)    
#     print('ROC AUC:', scores.mean())

Можно получить ROC AUC больше, если использовать > 30 деревьев. Для ускорения процесса обучения можно уменьшить глубину деревьев (по умолчанию 3).

In [None]:
# clf_gb.n_estimators = 40
# starttime = datetime.datetime.now()
# scores = cross_val_score(clf_gb, X_train, y_train, cv=skf, scoring='roc_auc')
# print(n, 'estimators')
# print('time:', datetime.datetime.now() - starttime)   
# print('ROC AUC:', scores.mean())

## 3.2 Logistic regression

Для начала выполним масштабирование признаков (**StandardScaler** or MinMaxScaler?)

In [18]:
SS = StandardScaler()
MMS = MinMaxScaler()

df_scaled = SS.fit_transform(df)
X_scaled = pd.DataFrame(SS.fit_transform(X))

Разделим замаштабированные данные 

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=0)

Проведем grid search по коэффициентам регуляризации C. 

In [20]:
grid_lr = {'C': np.power(10.0, np.arange(-3, 3))}
clf_lr = LogisticRegression(random_state=0)

clf_lr_grid = GridSearchCV(clf_lr, grid_lr, cv=skf, scoring='roc_auc')
clf_lr_grid.fit(X_train, y_train)

y_train_score_lr = clf_lr_grid.decision_function(X_train)
y_test_score_lr = clf_lr_grid.decision_function(X_test)

print('the best parameter C:', clf_lr_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train, y_train_score_lr))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_lr))

the best parameter C: 0.01
ROC AUC for train 0.718080283786
ROC AUC for test 0.716533155897


predict_proba

In [None]:
y_train_score_lr = clf_lr_grid.decision_function(X_train)
y_test_score_lr = clf_lr_grid.decision_function(X_test)

print('the best parameter C:', clf_lr_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train, y_train_score_lr))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_lr))

###  Categorical features

Исключим из датафрейма категориальные признаки и посмотрим, как меняется  ROC AUC

In [21]:
df_scaled = pd.DataFrame(df_scaled, columns = df.columns)
df_scaled_no_cat = df_scaled.drop(['lobby_type', 
                'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'],
                 axis=1)

Исключим из выборки целевую переменную

In [22]:
X_no_cat = df_scaled_no_cat
X_no_cat.head()

Unnamed: 0,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,r2_gold,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
0,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,-0.255162,...,-0.987486,1.070083,-0.005601,-0.263972,0.640074,0.018054,0.562864,-0.551154,1.846004,-1.118582
1,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,-0.250795,...,-0.987486,-0.335357,0.583775,-0.263972,0.395694,1.066668,0.562864,0.67817,0.437788,0.047174
2,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,-0.726779,...,0.391203,-0.820873,-0.748729,0.170355,0.640074,0.018054,0.562864,0.67817,0.437788,0.493634
3,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,-1.242065,...,-0.987486,-0.590892,0.263462,-0.015785,0.292303,-1.554868,0.562864,-0.551154,-0.970428,0.84088
4,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,-1.21368,...,-0.987486,1.351171,1.006589,-0.015785,0.677671,1.590976,-0.302485,0.67817,-0.970428,-0.225663


Повторим обучениe на данных без категориальных признаков

In [23]:
X_train_no_cat, X_test_no_cat, y_train_no_cat, y_test_no_cat = train_test_split(
    X_no_cat, y, test_size=0.3, random_state=0)
clf_lr_grid.fit(X_train_no_cat, y_train_no_cat)

y_train_score_lr = clf_lr_grid.decision_function(X_train_no_cat)
y_test_score_lr = clf_lr_grid.decision_function(X_test_no_cat)

print('the best parameter C:', clf_lr_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train_no_cat, y_train_score_lr))
print('ROC AUC for test', roc_auc_score(y_test_no_cat, y_test_score_lr))

the best parameter C: 0.01
ROC AUC for train 0.717865324779
ROC AUC for test 0.716677625768


Как и следовало ожидать, значение ROC AUC практически не изменилось, т.е категориальные признаки не дают дополнительной информации о победителе (номера героев не связаны с характеристиками героев).

### Bag of words

Лист категориальных признаков (герои), нуждающихся в представлении bag_of_words

In [24]:
df_heroes = df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

heroes = X[df_heroes.columns].stack()

Создадим **"мешок слов"** для героев и добавим его к отмасштабированным данным. Сформируем N признаков, при этом:
1. i-й будет равен нулю, если i-й герой не участвовал в матче; 
2. единице, если i-й герой играл за команду Radiant; 
3. минус единице, если i-й герой играл за команду Dire.

In [25]:
# в качестве аргументов будем подавать давафрейм и лист, 
# нуждающийся в представлении bag_of_words
def bag_of_words(df, heroes):
    n = heroes.max() # кол-во различных идентификаторов героев (n = 112)
        
    X_pick = np.zeros((len(df), n))
    for i, match_id in enumerate(df.index):
        for p in range(5):
            X_pick[i, df.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, df.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

    X_pick_df = pd.DataFrame(X_pick)
    X_pick_df.columns = range(1, 113)
    cols = [col for col in X_pick_df.columns if col in heroes.unique()]
    
    X_pick_df = X_pick_df[cols]
    return X_pick_df


In [26]:
# bag_of_words(df, heroes)[:3]

In [27]:
X_pick_df = bag_of_words(df, heroes) 

Конкатинация

In [28]:
X_fin = pd.concat([X_no_cat, X_pick_df], axis = 1)

Разбиение на train и test

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X_fin, y, test_size=0.3, random_state=0)

In [30]:
clf_lr_grid.fit(X_train, y_train)

y_train_score_lr = clf_lr_grid.decision_function(X_train)
y_test_score_lr = clf_lr_grid.decision_function(X_test)

print('the best parameter C:', clf_lr_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train, y_train_score_lr))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_lr))

the best parameter C: 0.1
ROC AUC for train 0.755165194278
ROC AUC for test 0.751048310798


## 3.3 k-Neighbours

In [31]:
knn = KNeighborsClassifier(n_neighbors=50)
scores = cross_val_score(knn, X_train, y_train, cv=skf).mean()
print(scores)

0.62260327185


## 3.4 LinearSVC

С grid search

In [44]:
grid_lsvc = {'C': np.logspace(-4, 2, 7)}
clf_lsvc = LinearSVC(random_state=0)

clf_lsvc_grid = GridSearchCV(clf_lsvc, grid_lsvc, cv=skf, scoring='roc_auc', n_jobs=4)
clf_lsvc_grid.fit(X_train, y_train)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=True, random_state=0),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'C': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00,   1.00000e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [46]:
y_train_score_lsvc = clf_lsvc_grid.decision_function(X_train)
y_test_score_lsvc = clf_lsvc_grid.decision_function(X_test)

print('the best parameter C:', clf_lsvc_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train, y_train_score_lsvc))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_lsvc))

the best parameter C: 0.01
ROC AUC for train 0.755154861963
ROC AUC for test 0.751053100401


Без grid search

In [49]:
clf_lsvc.fit(X_train, y_train)

y_train_score_lsvc = clf_lsvc.decision_function(X_train)
y_test_score_lsvc = clf_lsvc.decision_function(X_test)

print('ROC AUC for train', roc_auc_score(y_train, y_train_score_lsvc))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_lsvc))

ROC AUC for train 0.755045730725
ROC AUC for test 0.751063741609


## 3.5 SVM with the rbf kernel

Без сетки

In [50]:
clf_svc = SVC(kernel='rbf', random_state=0)
clf_svc.fit(X_train, y_train)

y_train_score_svc = clf_svc.decision_function(X_train)
y_test_score_svc = clf_svc.decision_function(X_test)

print('ROC AUC for train', roc_auc_score(y_train, y_train_score_svc))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_svc))

ROC AUC for train 0.80969568486
ROC AUC for test 0.745948728266


probability=True

In [38]:
clf_svc1 = SVC(kernel='rbf', random_state=0, probability=True)
clf_svc1.fit(X_train, y_train)

y_train_score_svc = clf_svc1.decision_function(X_train)
y_test_score_svc = clf_svc1.decision_function(X_test)

print('ROC AUC for train', roc_auc_score(y_train, y_train_score_svc))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_svc))

ROC AUC for train 0.80969568486
ROC AUC for test 0.745948728266


NOT BAD

 Добавим сетку

In [None]:
grid_svc = {'C': np.logspace(-4, 2, 7)}
clf_svc = SVC(kernel='rbf', random_state=0, probability=True)
clf_svc_grid = GridSearchCV(clf_svc, grid_svc, cv=skf, scoring='roc_auc', n_jobs=4)
clf_svc_grid.fit(X_train, y_train)

In [None]:
y_train_score_svc = clf_svc_grid.predict_proba(X_train)
y_test_score_svc = clf_svc_grid.predict_proba(X_test)

print('the best parameter C:', clf_svc_grid.best_params_['C'])
print('ROC AUC for train', roc_auc_score(y_train, y_train_score_svc))
print('ROC AUC for test', roc_auc_score(y_test, y_test_score_svc))

## 3.6 RF

In [None]:
clf_rf = RandomForestClassifier(random_state=0)
for n in [10, 20, 30]:
    starttime = datetime.datetime.now()
    clf_rf.n_estimators = n
    scores = cross_val_score(clf_rf, X_train.values, y_train.values, cv=skf, scoring='roc_auc')
    print(n, 'estimators')
    print('time:', datetime.datetime.now() - starttime)    
    print('ROC AUC:', scores.mean())


## ADABOOST and XGBOOST

# Test data + KAGGLE

##### Работа с NaN

In [None]:
kills = get_kills(df_test)

In [None]:
change_nan(df_test)

##### Создание мешка слов из героев

In [None]:
df_heroes = df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

heroes = X[df_heroes.columns].stack()

X_pick_df_1 = bag_of_words(df_test, heroes)
# X_pick_df_1

##### Масштабирование

In [None]:
cols = [col for col in df_test.columns 
        if col not in ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                       'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
                       'lobby_type']]

X_scaled_1 = pd.DataFrame(SS.fit_transform(df_test[cols]))
X_scaled_1.columns = df_test[cols].columns

##### Конкатинация

In [None]:
X_fin_1 = pd.concat([X_scaled_1[cols], X_pick_df_1], axis=1)

In [None]:
logistic = clf_lr_grid.best_estimator_.predict_proba(X_fin_1)
result = pd.DataFrame(list(zip(df_test.index, logistic[:, 1])))
result.columns = ['match_id', 'radiant_win']

In [None]:
print('Min probability:', min(result['radiant_win']))
print('Max probability:', max(result['radiant_win']))

Запишем результаты для Kaggle

In [57]:
result.to_csv('result.csv', index = False)

NameError: name 'result' is not defined