### Part 1 - Gradient Boosting

In [32]:
from __future__ import print_function
from datetime import datetime
from pandas import read_csv, DataFrame
from sklearn.cross_validation import KFold, cross_val_predict
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale
from numpy import logspace, zeros, hstack

In [33]:
run_gradient_boosting = True
run_logistic_regression_for_all_features = True
run_logistic_regression_for_truncated_features = True

In [34]:
features = read_csv("features.csv", index_col="match_id")
features_data_columns = ['start_time',
    'duration',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire']
features.drop(features_data_columns, axis=1, inplace=True)
# The data ready

In [35]:
f_counts = features.count()
na_features = f_counts[f_counts != features.shape[0]].index.values
print ("features with null:")
for f in na_features:
    print ("-" + f)

features with null:
-first_blood_time
-first_blood_team
-first_blood_player1
-first_blood_player2
-radiant_bottle_time
-radiant_courier_time
-radiant_flying_courier_time
-radiant_first_ward_time
-dire_bottle_time
-dire_courier_time
-dire_flying_courier_time
-dire_first_ward_time


In [36]:
for name in na_features: features[name].fillna(0, inplace=True) # заполнение нулями ... и отбор эталонов выборки

In [37]:
target = 'radiant_win'

y = features[target]
features.drop(target, axis=1, inplace=True)
X = features

# Predict class probabilities for X gradient boosting

In [43]:
folds = KFold(features.shape[0], n_folds=5, shuffle=True)

for num in range(10,31)[::10]:
    start_time = datetime.now()
    probas = cross_val_predict(GradientBoostingClassifier(n_estimators=num), X=X, y=y, cv=folds, n_jobs=-1)
    score = roc_auc_score(y, probas)
    
    elapsed_time = datetime.now() - start_time
    print('{0} trees, {1} to fit, ROC-AUC score: {2:.2f}'.format(num, elapsed_time, score))


10 trees, 0:01:04.943582 to fit, ROC-AUC score: 0.61
20 trees, 0:01:50.147437 to fit, ROC-AUC score: 0.63
30 trees, 0:02:38.960601 to fit, ROC-AUC score: 0.63
