In [1]:
# Numerical / tabular stack.
import numpy as np
import pandas as pd
# Cross-validation helpers, preprocessing and dimensionality reduction.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.decomposition import PCA

# Plotting defaults applied to every figure created in this notebook.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

# Classifiers compared below.
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Exhaustive hyper-parameter search.
from sklearn.model_selection import GridSearchCV

# NOTE(review): this suppresses every warning for the rest of the session,
# which also hides library warnings (e.g. solver convergence or deprecation
# notices) that could explain weak scores below.
import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
# Read the match table and discard best-of-5 rows whose fifth map is
# recorded as 'did not play'.
df = pd.read_csv('../data/df_games.csv')
unplayed_bo5 = (df['best_of'] == 5) & (df['5_map'] == 'did not play')
df = df.drop(index=df.index[unplayed_bo5.values])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 844
Columns: 1029 entries, version to 5_map
dtypes: float64(339), int64(679), object(11)
memory usage: 6.6+ MB


In [3]:
# Columns removed before modelling; everything left (except `result`)
# becomes a model input further down.
columns_to_drop = [
    'version', 'url', 'number_of_confrontation', 'datetime1',
    'ratio1', 'ratio2',
    'name1', 'number1', 'name2', 'number2',
    'team1_stat_url', 'team2_stat_url',
]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,1_map,2_map,3_map,4_map,5_map
0,28,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,-1,3,Nuke,Ancient,Inferno,did not play,did not play
1,1,223,2,2,0,0,54,32,1.0,4,...,0.5,0.448,0.552,1,3,Mirage,Ancient,Inferno,did not play,did not play
2,14,101,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Overpass,Dust2,Vertigo,did not play,did not play
3,3,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,1,3,Dust2,Inferno,Nuke,did not play,did not play
4,7,1,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Nuke,Overpass,Mirage,did not play,did not play


In [4]:
# Encode the five map-name columns with one shared integer code.
# The encoder is fit on the labels of ALL five columns: fitting on '3_map'
# alone makes transform() raise ValueError as soon as another column holds a
# map that never occurs as the third map.
map_columns = [f'{num_map}_map' for num_map in range(1, 6)]
maps_decoder = LabelEncoder()
maps_decoder.fit(df[map_columns].values.ravel())
for map_column in map_columns:
    df[map_column] = maps_decoder.transform(df[map_column])


In [5]:
# Shared evaluation setup: target column, feature matrix and a seeded,
# shuffled 5-fold splitter reused by every experiment below.
y = df.loc[:, 'result']
X = df.drop(columns=['result'])
cv = KFold(n_splits=5, shuffle=True, random_state=1)

<h3>PCA</h3>
Подберем такое количество компонент, чтобы они объясняли 99 процентов изменчивости данных; в ходе исследований это число можно менять.

In [6]:
# Project the features onto 50 principal components and report the share of
# variance they retain.
# random_state is fixed because, for a matrix of this size, PCA's default
# 'auto' solver may pick the randomized SVD, which would make the components
# (and every score computed from them) change between runs.
# NOTE(review): PCA is fit on the full data set before cross-validation, so
# the validation folds influence the projection — consider moving it into a
# Pipeline evaluated inside the CV loop.
transform = PCA(n_components=50, random_state=1)
X_pca = pd.DataFrame(transform.fit_transform(X))
transform.explained_variance_ratio_.sum()

0.9920327061834668

In [7]:
# Per-component summary statistics, one row per principal component.
X_pca.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,842.0,2.16032e-14,492.421734,-724.338402,-438.412625,-45.41039,361.725083,1552.266777
1,842.0,-6.428218e-15,303.719362,-1100.155949,-163.548987,-22.520345,176.101093,1110.610566
2,842.0,-5.210928e-15,268.333777,-651.114271,-186.939257,-23.344049,160.345933,953.157246
3,842.0,-8.620183e-15,186.793392,-808.667866,-68.872657,14.488943,68.790069,844.12506
4,842.0,1.181847e-14,167.297117,-567.389253,-94.798129,-6.665606,60.506018,663.356569
5,842.0,7.198254e-15,162.371922,-474.677659,-88.52717,1.463945,72.549942,833.187387
6,842.0,-3.259045e-14,160.271418,-627.648246,-75.709649,-0.54673,77.336618,685.944058
7,842.0,-1.186699e-14,146.346628,-525.821768,-64.053072,-12.523822,70.471115,552.335869
8,842.0,1.046405e-15,141.520641,-476.011442,-86.944526,1.970237,81.41566,496.497312
9,842.0,-1.849774e-14,139.002936,-435.436353,-67.114658,7.727244,62.778241,513.89493


Очень большой разброс. Давайте стандартизируем.

In [8]:
# Standardize every principal component to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows before cross-validation —
# confirm this is acceptable for the comparison below.
scaler = StandardScaler().fit(X_pca)
X_pca_scaled = scaler.transform(X_pca)

In [10]:
# Export the first 5 rows x 10 columns of the PCA table to an Excel file.
# NOTE(review): this writes X_pca, not X_pca_scaled, although the file name
# suggests standardized data — confirm which one was intended.
X_pca.iloc[:5, :10].to_excel('standart.xlsx')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-440.123499,-134.30071,35.155989,72.356797,20.239936,-93.186579,-141.534898,66.090362,-42.643316,-76.824591
1,262.586561,-18.824813,575.088506,-341.585038,-31.82841,-300.333827,-51.879588,228.379963,-35.245972,92.777063
2,-56.467931,442.484628,416.711355,-103.748906,-157.424444,-82.762536,-170.413862,82.920085,-96.734546,-18.608072
3,158.495153,236.349291,391.157137,-293.570464,-208.453806,-204.540333,-342.431121,143.849038,-205.930876,-160.427214
4,365.533394,-163.568859,578.731583,-415.111346,156.741297,-194.845959,126.349197,195.727069,-31.055166,-110.829676


<h2>Обучение моделей. Сравнение результатов.</h2>

In [19]:
# Baseline: always predict the most frequent class.
model_dummy = DummyClassifier(strategy='most_frequent')
result_dummy = cross_val_score(estimator=model_dummy, X=X_pca, y=y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.59763314 0.55621302 0.5297619  0.58333333 0.56547619], mean: 0.5664835164835165


<h3>Ошибка перцептрона</h3> 

In [33]:
# Default perceptron, cross-validated on the standardized components.
model_perceptron = Perceptron()
result_perceptron = cross_val_score(
    estimator=model_perceptron,
    X=X_pca_scaled,
    y=y,
    cv=cv,
)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.56804734 0.65680473 0.48214286 0.58928571 0.54761905], mean: 0.5687799380107073


In [34]:
# Perceptron grid: regularization type and strength (1e-6 ... 1e2).
param = dict(
    penalty=['l2', 'l1'],
    alpha=[10**exponent for exponent in range(-6, 3)],
)

In [37]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 84.1 ms


({'alpha': 1e-05, 'penalty': 'l2'}, 0.5927021696252466)

In [38]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 124 ms


({'alpha': 1, 'penalty': 'l1'}, 0.5737109044801353)

Как мы видим, на стандартизированном множестве перцептрон обучается лучше.

<h3>Метод опорных векторов</h3>

In [39]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.60946746 0.62130178 0.58333333 0.61904762 0.5952381 ], mean: 0.6056776556776557
Wall time: 48 ms


In [40]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_pca_scaled, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.59763314 0.61538462 0.5952381  0.60119048 0.58333333], mean: 0.598555931248239
Wall time: 47 ms


А вот SVM показывает почти одинаковые результаты

In [42]:
# SVC grid: penalty strength C (1e-3 ... 1e1) and non-linear kernels.
param = dict(
    C=[10**exponent for exponent in range(-3, 2)],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [43]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 374 ms


({'C': 1, 'kernel': 'rbf'}, 0.6056776556776557)

In [44]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 346 ms


({'C': 1, 'kernel': 'sigmoid'}, 0.6056987883910961)

И снова почти одинаково, но при этом немного быстрее (346 мс против 374 мс). Можно попробовать еще поподбирать параметры.

In [46]:
# Polynomial-kernel SVC grid: fixed C, polynomial degree 1..10.
param = dict(
    C=[1],
    kernel=['poly'],
    degree=range(1, 11),
)

In [48]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 227 ms


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.6175965060580445)

In [49]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 353 ms


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.6032896590588898)

Такое себе.

<h3>Дерево решений.</h3>

In [50]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.52071006 0.49112426 0.49404762 0.5297619  0.5952381 ], mean: 0.5261763877148493
Wall time: 63 ms


In [51]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_pca_scaled, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.52662722 0.50295858 0.46428571 0.52380952 0.5952381 ], mean: 0.5225838264299802
Wall time: 57 ms


In [52]:
# Decision-tree grid: split criterion and maximum depth 2..49.
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
)

In [53]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.64 s


({'criterion': 'entropy', 'max_depth': 3}, 0.5985488870104254)

In [54]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.5 s


({'criterion': 'entropy', 'max_depth': 3}, 0.5997323189630882)

In [55]:
# Wider decision-tree grid: criterion, depth and the two minimum-sample
# constraints.
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
    min_samples_split=range(2, 20),
    min_samples_leaf=range(1, 10),
)

In [56]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 3min 51s


({'criterion': 'entropy',
  'max_depth': 3,
  'min_samples_leaf': 1,
  'min_samples_split': 6},
 0.5997393632009016)

In [57]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 3min 30s


({'criterion': 'entropy',
  'max_depth': 3,
  'min_samples_leaf': 1,
  'min_samples_split': 9},
 0.5997323189630882)

Все печально, деревья не справились

<h3>Логистическая регрессия</h3>

In [58]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.62130178 0.62130178 0.60714286 0.6547619  0.58333333], mean: 0.6175683291067906
Wall time: 202 ms


In [59]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_pca_scaled, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.62130178 0.62130178 0.60714286 0.6547619  0.58928571], mean: 0.6187588052972668
Wall time: 652 ms


Неплохой результат. Подберем параметры.

In [60]:
# Logistic-regression grid: inverse regularization strength 1e-3 ... 1e1.
param = dict(C=[10**exponent for exponent in range(-3, 2)])

In [63]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 32 ms


({'C': 0.1}, 0.6235207100591716)

Я думал, будет лучше.

<h3>Случайный лес</h3>

In [64]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.60946746 0.58579882 0.61309524 0.6547619  0.58333333], mean: 0.6092913496759651
Wall time: 365 ms


In [65]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_pca_scaled, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.63905325 0.59171598 0.60119048 0.63690476 0.54166667], mean: 0.6021062271062271
Wall time: 359 ms


In [66]:
# Random-forest grid: forest size, split criterion and depth 1, 11, ..., 41.
param = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    criterion=['gini', 'entropy'],
    max_depth=range(1, 51, 10),
)

In [67]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 14s


({'criterion': 'gini', 'max_depth': 41, 'n_estimators': 100},
 0.6270780501549733)

In [68]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 14s


({'criterion': 'entropy', 'max_depth': 41, 'n_estimators': 200},
 0.6187588052972669)

Еще поподбираем параметры для случайного леса с pca без стандартизации

In [69]:
# Finer random-forest grid (gini only): forest size, depth 10..40 step 5 and
# the two minimum-sample constraints.
param = dict(
    n_estimators=[100, 200, 500, 1000],
    criterion=['gini'],
    max_depth=range(10, 41, 5),
    min_samples_split=range(2, 20, 2),
    min_samples_leaf=range(1, 10, 2),
)

In [70]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 14min 21s


({'criterion': 'gini',
  'max_depth': 15,
  'min_samples_leaf': 5,
  'min_samples_split': 18,
  'n_estimators': 100},
 0.6294096928712314)

Как мы видим, PCA не улучшил случайный лес.

<h3>Градиентный бустинг</h3>

In [71]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.56213018 0.56804734 0.57142857 0.58333333 0.60119048], mean: 0.5772259791490562
Wall time: 1.96 s


In [72]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_pca_scaled, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.56213018 0.56804734 0.57142857 0.58333333 0.60119048], mean: 0.5772259791490562
Wall time: 1.53 s


В градиентном бустинге вообще очень грустно

In [74]:
# XGBoost grid: number of boosting rounds and tree depth 10, 12, ..., 50.
param = dict(
    n_estimators=[100, 200],
    max_depth=range(10, 51, 2),
)

In [75]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 20.9 s


({'max_depth': 16, 'n_estimators': 200}, 0.599774584389969)

In [76]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 20.3 s


({'max_depth': 16, 'n_estimators': 200}, 0.599774584389969)

Все печально. PCA только ухудшил градиентный бустинг