In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: figure size as [width, height] and rendering resolution (dpi).
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides library diagnostics (e.g. solver
# convergence or deprecation warnings) — consider ignoring specific categories only.
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
# Read the raw dataset from disk and print its structural summary
# (row count, column count, dtypes, memory footprint).
csv_path = '../data/df_split_by_map_all_info.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1813 entries, 0 to 1812
Columns: 1025 entries, version to map
dtypes: float64(339), int64(679), object(7)
memory usage: 14.2+ MB


In [3]:
# Columns removed before modelling.
# NOTE(review): presumably identifiers / metadata that must not be used as features — confirm.
columns_to_drop = [
    'version', 'url', 'number_of_confrontation', 'datetime1',
    'ratio1', 'ratio2',
    'name1', 'number1', 'name2', 'number2',
    'team1_stat_url', 'team2_stat_url',
]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Rounds won,2_46_topAll_Win percent,2_46_topAll_Pistol rounds,2_46_topAll_Pistol rounds won,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,map
0,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Nuke
1,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Ancient
2,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Mirage
3,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Ancient
4,14,101,0,0,0,0,0,0,0.0,0,...,0,0.0,0,0,0.0,0.0,0.0,-1,3,Overpass


In [4]:
# Replace the textual `map` column with integer codes; the fitted encoder is kept
# so the codes can be mapped back to the original labels later.
maps_decoder = LabelEncoder().fit(df['map'])
df['map'] = maps_decoder.transform(df['map'])

In [5]:
# Target vector and feature matrix (every column except `result`).
y = df['result']
X = df.drop(columns='result')

# Shuffled K-fold splitter with a fixed seed, reused by every evaluation below.
cv = KFold(shuffle=True, random_state=1)

<h3>PCA</h3>
Подберем такое количество компонент, чтобы они объясняли 99 процентов изменчивости данных; в ходе исследований это число можно менять.

In [6]:
# Project the features onto the first 50 principal components.
# random_state is fixed because, for this matrix size and component count, PCA's
# default 'auto' solver selects the randomized SVD, which otherwise yields slightly
# different components (and therefore different downstream scores) on every run.
# NOTE(review): PCA is fitted on all rows before cross-validation, so rows that later
# serve as validation folds influence the projection; wrapping PCA and the model in a
# Pipeline would give leak-free scores.
transform = PCA(n_components=50, random_state=1)
X_pca = pd.DataFrame(transform.fit_transform(X))

# Share of total variance retained by the kept components.
transform.explained_variance_ratio_.sum()

0.9921383420297364

In [7]:
# Per-component summary statistics, one row per principal component.
X_pca.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1813.0,-1.116175e-14,483.530792,-737.782266,-444.657587,-27.226583,350.021211,1543.100387
1,1813.0,2.014837e-14,312.478929,-1065.871158,-174.346783,-14.233293,188.434233,1114.60037
2,1813.0,-3.323443e-15,274.819497,-624.892456,-197.531442,-36.51228,164.446578,946.110976
3,1813.0,-3.629137e-14,185.650877,-792.545726,-66.121102,16.101377,72.170466,855.790232
4,1813.0,5.00105e-14,171.623681,-555.459685,-102.475212,-10.419311,65.880455,668.201735
5,1813.0,-4.070434e-14,164.22129,-549.894512,-87.491031,14.01282,67.965713,609.816582
6,1813.0,3.918371e-14,155.031985,-455.723889,-82.737301,-20.134646,83.923241,800.996455
7,1813.0,-3.572701e-14,151.570186,-609.938953,-89.736066,5.761029,72.5338,698.548152
8,1813.0,3.377136e-14,144.586502,-544.701085,-86.843489,5.249595,88.042743,641.714598
9,1813.0,8.316446e-15,140.613143,-436.745798,-73.679502,7.152507,70.747152,465.930728


Очень большой разброс. Давайте стандартизируем.

In [8]:
# Save a small preview (first 5 rows, first 10 components) to an Excel file.
X_pca.iloc[:5, :10].to_excel('pca.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [9]:
# Baseline: always predict the most frequent class; every other model should beat this.
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(estimator=model_dummy, X=X_pca, y=y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [10]:
# Perceptron with default hyper-parameters, scored per fold on the PCA features.
model_perceptron = Perceptron()
result_perceptron = cross_val_score(estimator=model_perceptron, X=X_pca, y=y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.52066116 0.53168044 0.55096419 0.53314917 0.50828729], mean: 0.528948449842473


In [11]:
# Perceptron search space: regularisation type and strength (powers of ten, 1e-6 .. 1e2).
param = {
    'penalty': ['l2', 'l1'],
    'alpha': [10**exponent for exponent in range(-6, 3)],
}

In [12]:
%%time
# Exhaustive search over the perceptron grid using the shared CV splitter;
# candidates are evaluated in parallel on all cores.
grid_search = GridSearchCV(estimator=Perceptron(), param_grid=param, cv=cv, n_jobs=-1).fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
grid_search.best_params_, grid_search.best_score_

Wall time: 1.75 s


({'alpha': 0.1, 'penalty': 'l1'}, 0.5510296333500754)

<h3>Метод опорных векторов</h3>

In [13]:
%%time
# Support vector classifier with default settings, folds evaluated in parallel.
model_svm = SVC()
result_svm = cross_val_score(estimator=model_svm, X=X_pca, y=y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.65013774 0.61707989 0.59504132 0.52486188 0.56629834], mean: 0.5906838348325039
Wall time: 183 ms


In [15]:
# SVC search space: regularisation strength C (1e-3 .. 1e1) with the RBF kernel only.
param = {
    'C': [10**exponent for exponent in range(-3, 2)],
    'kernel': ['rbf'],
}

In [16]:
%%time
# Exhaustive search over the SVC grid using the shared CV splitter.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param, cv=cv, n_jobs=-1).fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
grid_search.best_params_, grid_search.best_score_

Wall time: 827 ms


({'C': 10, 'kernel': 'rbf'}, 0.5923504253991446)

Такое себе.

<h3>Дерево решений.</h3>

In [21]:
%%time
# Single decision tree with default hyper-parameters.
# random_state is fixed because the tree permutes candidate features at every split;
# without a seed the fold scores change from run to run.
model_tree = DecisionTreeClassifier(random_state=1)
result_tree = cross_val_score(model_tree, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.61707989 0.61432507 0.61983471 0.5441989  0.57734807], mean: 0.5945573261494909
Wall time: 104 ms


In [22]:
# Decision-tree search space: split criterion and maximum depth (2 .. 49).
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
)

In [23]:
%%time
# Exhaustive search over the decision-tree grid using the shared CV splitter.
# The tree is seeded so that the selected parameters and best score are reproducible;
# an unseeded tree permutes features randomly at each split.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=1), param, cv=cv, n_jobs=-1)
grid_search.fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
(grid_search.best_params_, grid_search.best_score_)

Wall time: 4.45 s


({'criterion': 'entropy', 'max_depth': 39}, 0.606707456280535)

Все печально, деревья не справились

<h3>Логистическая регрессия</h3>

In [17]:
%%time
# Logistic regression with default settings, folds evaluated in parallel.
# NOTE(review): warnings are silenced notebook-wide, so a solver convergence warning
# on these unscaled components would not be visible here — verify convergence.
model_log = LogisticRegression()
result_log = cross_val_score(estimator=model_log, X=X_pca, y=y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.55096419 0.62534435 0.62534435 0.55248619 0.58839779], mean: 0.5885073740925073
Wall time: 44 ms


Неплохо оно выдает, конечно. Подберем параметры.

In [19]:
# Logistic-regression search space: inverse regularisation strength C (1e-3 .. 1e1).
param = {
    'C': [10**exponent for exponent in range(-3, 2)],
}

In [20]:
%%time
# Exhaustive search over the logistic-regression grid using the shared CV splitter.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param, cv=cv, n_jobs=-1).fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
grid_search.best_params_, grid_search.best_score_

Wall time: 167 ms


({'C': 1}, 0.5885073740925073)

Я думал будет лучше.

<h3>Случайный лес</h3>

In [24]:
%%time
# Random forest with default hyper-parameters.
# random_state is fixed because bootstrap sampling and per-split feature sampling are
# random; without a seed the fold scores differ on every run.
model_rf = RandomForestClassifier(random_state=1)
result_fr = cross_val_score(model_rf, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.62258953 0.60330579 0.61983471 0.5718232  0.61049724], mean: 0.6056100939074319
Wall time: 702 ms


In [25]:
# Random-forest search space: number of trees, split criterion and depth cap (1, 11, ..., 41).
param = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    criterion=['gini', 'entropy'],
    max_depth=range(1, 51, 10),
)

In [26]:
%%time
# Exhaustive search over the random-forest grid using the shared CV splitter.
# Parallelism is applied at one level only: GridSearchCV already spreads the
# candidate/fold fits over all cores (n_jobs=-1), so each forest is kept
# single-threaded; requesting all cores at both levels can oversubscribe the CPU.
# The forest is seeded so that the selected parameters and best score are reproducible.
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=1, random_state=1), param, cv=cv, n_jobs=-1)
grid_search.fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2min 38s


({'criterion': 'gini', 'max_depth': 41, 'n_estimators': 1000},
 0.6182807482154544)

Еще поподбираем параметры для случайного леса с pca без стандартизации

<h3>Градиентный бустинг</h3>

In [27]:
%%time
# Gradient-boosted trees (XGBoost) with default settings, folds evaluated in parallel.
model_xgb = XGBClassifier()
result_xgb = cross_val_score(estimator=model_xgb, X=X_pca, y=y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.63360882 0.61432507 0.61983471 0.55801105 0.60220994], mean: 0.6055979179032921
Wall time: 1.65 s


В градиентном бустинге вообще очень грустно

In [28]:
# XGBoost search space: number of boosting rounds and tree depth (10, 12, ..., 50).
param = dict(
    n_estimators=[100, 200],
    max_depth=range(10, 51, 2),
)

In [29]:
%%time
# Exhaustive search over the XGBoost grid using the shared CV splitter.
# Parallelism is applied at one level only: GridSearchCV already spreads the
# candidate/fold fits over all cores (n_jobs=-1), so each booster is limited to a
# single thread; requesting all cores at both levels can oversubscribe the CPU.
grid_search = GridSearchCV(XGBClassifier(n_jobs=1), param, cv=cv, n_jobs=-1)
grid_search.fit(X_pca, y)

# Best hyper-parameter combination and its mean cross-validated score.
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2min 28s


({'max_depth': 12, 'n_estimators': 100}, 0.6144331309072645)

Все печально. PCA только ухудшил градиентный бустинг