# Библиотеки

In [401]:
import json
import bz2
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
import time
import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Откроем и прочитаем файл

In [402]:
abilities = pd.read_csv('data/dictionaries/abilities.csv')
heroes = pd.read_csv('data/dictionaries/heroes.csv')
items = pd.read_csv('data/dictionaries/items.csv')
lobbies = pd.read_csv('data/dictionaries/lobbies.csv')
mods = pd.read_csv('data/dictionaries/mods.csv')
regions = pd.read_csv('data/dictionaries/regions.csv')
features = pd.read_csv('data/features.csv')

In [403]:
features

Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
0,0,1430198770,7,11,5,2098,1489,20,0,0,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1,1430220345,0,42,4,1188,1033,9,0,1,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,2,1430227081,7,33,4,1319,1270,22,0,0,...,4,3,1,13.0,2130,0,0,1830,0,63
3,3,1430263531,1,29,4,1779,1056,14,0,0,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,4,1430282290,7,13,4,1431,1090,8,1,0,...,3,3,0,-16.0,2449,0,4,1974,3,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97225,114402,1450265551,1,47,4,1706,1198,17,0,1,...,4,3,0,-24.0,2032,0,1792,1975,48,63
97226,114403,1450277704,0,43,4,1793,1416,17,0,1,...,3,2,0,-17.0,1734,1,2038,6,63,3
97227,114404,1450291848,1,98,4,1399,540,1,0,0,...,1,3,1,-15.0,2906,0,1796,1846,51,63
97228,114405,1450292986,1,100,3,1135,766,6,0,2,...,3,3,1,-42.0,951,0,2039,2047,63,63


### Удалим ненужные признаки

In [404]:
X = features.drop(['match_id',# я УДАЛИЛ ЕЩЁ MATCH_ID
                   'duration' , 
                   'radiant_win',
                   'tower_status_radiant',
                   'tower_status_dire',
                   'barracks_status_radiant',
                   'barracks_status_dire'], axis = 1)

## Названия признаков имеющих пропуски

In [405]:
include_nan = X.columns[X.count() != 97230]
include_nan

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')

### Больше всего пропусков у событий связанных с first_blood и radiant coureier, т.к. эти события иногда происходят после 5 минуты

In [406]:
X[include_nan].count()

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64

In [407]:
# Заменяем пропущенные значения на 0
for i in include_nan:
    X[i].fillna(value = 0, inplace=True)

In [408]:
# Целевая переменная в столбце 'radiant_win', так как надо предсказать победит radiant или dire(не победит radiant)
Y = features['radiant_win'].values

# Генератор разбиений

In [409]:
kf = KFold(random_state=1,shuffle=True)

## Обучим модель градиентного бустинга

In [410]:
for i in [10,20,30]:
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=i)
    print(i, np.mean(cross_val_score(clf, X, Y, cv=kf, scoring='roc_auc')))
    print('Time elapsed:', datetime.datetime.now() - start_time)

10 0.66483292280491
Time elapsed: 0:00:44.367515
20 0.6821140369500348
Time elapsed: 0:01:25.230522
30 0.6896947542059906
Time elapsed: 0:02:07.375144


### Посмотрим, увиличиться ли качество при увеличении количества деревьев

In [411]:
i = 31
clf = GradientBoostingClassifier(n_estimators=i)
print(i, np.mean(cross_val_score(clf, X, Y, cv=kf, scoring='roc_auc')))

31 0.6902468267043145


Ответ: да, алгоритм ещё не переобучился, поэтому увеличение количества деревьев имеет смысл

## Как уменьшить время на обучение модели?

1. Как и было сказано в рекомедациях для того чтоб ускорить обучение можно уменьшить глубину деревьев

2. Так же можно избавиться от некоторых признаков

# Логистическая регрессия

Стандартизируем признаки

In [412]:
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

In [413]:
grid = {'C': np.power(10.0, np.arange(-5, 6))}
log = LogisticRegression(penalty = 'l2')
gs = GridSearchCV(log, grid, scoring='accuracy', cv=kf)
gs.fit(X_scaler, Y)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy')

In [414]:
print(gs.best_params_)

{'C': 0.1}


In [415]:
log = LogisticRegression(random_state=1 ,penalty = 'l2', C = 0.1)
start_time = datetime.datetime.now()
print(np.mean(cross_val_score(log, X_scaler, Y, cv=kf, scoring='roc_auc')))
print('Time elapsed:', datetime.datetime.now() - start_time)

0.7163509484793968
Time elapsed: 0:00:05.103469


## Логистическая регрессия работает быстрее градиентного бустига, её точность выше

In [416]:
New_X = X.drop(['lobby_type',
                'r1_hero',
                'r2_hero',
                'r3_hero',
                'r4_hero',
                'r5_hero',
                'd1_hero',
                'd2_hero',
                'd3_hero',
                'd4_hero',
                'd5_hero'
               ],axis = 1
              )

In [417]:
New_X_scaler = scaler.fit_transform(New_X)

In [418]:
log = LogisticRegression(random_state=1 ,penalty = 'l2', C = 0.1)
start_time = datetime.datetime.now()
print(np.mean(cross_val_score(log, New_X_scaler, Y, cv=kf, scoring='roc_auc')))
print('Time elapsed:', datetime.datetime.now() - start_time)

0.7163838498885824
Time elapsed: 0:00:05.025543


# Модель имеет, почти такую же точность, но быстрее обучается

Количество героев 

In [419]:
N = len(heroes) # Я решил просто взять героив из файла с героями
N

112

# Мешок слов

In [420]:
# N — количество различных героев в выборке
X_pick = np.zeros((X.shape[0], N))

for i, match_id in enumerate(X.index):
    for p in range(5):
        X_pick[i, X.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, X.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [421]:
x_pick = pd.DataFrame(X_pick)
x_pick

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97225,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97226,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
97228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Добавим его к столбцу признаков

In [422]:
for i in range(0,112):
    column = str(i)
    New_X[column] = x_pick[i]

### Обучим заново логистическую регрессию

In [423]:
New_X_scaler = scaler.fit_transform(New_X)

In [424]:
log2 = LogisticRegression(penalty = 'l2', C = 0.1)
start_time = datetime.datetime.now()
print(np.mean(cross_val_score(log2, New_X_scaler, Y, cv=kf, scoring='roc_auc')))
print('Time elapsed:', datetime.datetime.now() - start_time)

0.7518611029875298
Time elapsed: 0:00:09.156308


Точность стала выше на 3,5%

## Логистическая регрессия показывает лучшие результаты на тренировоных данных, используем её модель для тестовых данных

In [425]:
features_test = pd.read_csv('data/features_test.csv')
features_test

Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
0,6,1430287923,0,93,4,1103,1089,8,0,1,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
1,7,1430293357,1,20,2,556,570,1,0,0,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
2,10,1430301774,1,112,2,751,808,1,0,0,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
3,13,1430323933,1,27,3,708,903,1,1,1,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
4,16,1430331112,1,39,4,1259,661,4,0,0,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17172,114369,1450212780,7,11,5,2054,1941,27,0,1,...,1,8.0,253.0,-87.0,,4,3,2,1,-33.0
17173,114377,1450222875,1,3,3,748,605,1,0,0,...,1,-1.0,133.0,-85.0,184.0,2,3,4,1,-18.0
17174,114378,1450223593,1,85,2,575,499,0,0,0,...,0,20.0,133.0,-88.0,239.0,4,4,4,0,-36.0
17175,114393,1450244771,0,7,4,1844,1176,8,1,2,...,0,-28.0,,-83.0,,1,4,1,0,


Проделаем все те же действия признаков, что и выше

In [426]:
X_test = features_test.drop('match_id', axis = 1)
New_X_test = X_test.drop(['lobby_type',
                'r1_hero',
                'r2_hero',
                'r3_hero',
                'r4_hero',
                'r5_hero',
                'd1_hero',
                'd2_hero',
                'd3_hero',
                'd4_hero',
                'd5_hero'
               ],axis = 1
              )
include_nan = New_X_test.columns[New_X_test.count() != 17177]
# Заменяем пропущенные значения на 0
for i in include_nan:
    New_X_test[i].fillna(value = 0, inplace=True)
# N — количество различных героев в выборке
N = 112
X_pick_test = np.zeros((X_test.shape[0], N))

for i, match_id in enumerate(X_test.index):
    for p in range(5):
        X_pick_test[i, X_test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, X_test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
x_pick_test = pd.DataFrame(X_pick_test)
for i in range(0,112):
    column = str(i)
    New_X_test[column] = x_pick_test[i]

In [427]:
New_X_test = scaler.fit_transform(New_X_test)

Модель для предсказания

In [428]:
log2 = LogisticRegression(penalty = 'l2').fit(New_X_scaler, Y)

Делаем предсказания

In [429]:
y_pred_log_reg = log2.predict_proba(New_X_test)

## Минимальное и максимальное значение прогноза

In [433]:
print(np.min(y_pred_log_reg[:,0]))
print(np.max(y_pred_log_reg[:,1]))

0.003514991626024755
0.9964850083739752
