In [1]:
# Core numerical / tabular stack.
import numpy as np
import pandas as pd
# Cross-validation helpers, preprocessing and dimensionality reduction.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.decomposition import PCA

# Plotting defaults applied to every figure created afterwards.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

# Classifiers compared in this notebook.
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Hyper-parameter search and scoring utilities.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings raised
# while fitting the models, so failed or non-converged fits can go unnoticed.
import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
# Read the per-map match statistics.
df = pd.read_csv('../data/df_split_by_map_part_info.csv')

# Replace the `map` column with integer codes; the fitted encoder is kept so
# the codes can be mapped back to the original values later.
maps_decoder = LabelEncoder()
df = df.assign(map=maps_decoder.fit_transform(df['map']))

df.head()

Unnamed: 0,map,result,team1_world_rank,team2_world_rank,1_top_Times played,1_top_wins,1_top_draws,1_top_losses,1_top_Total rounds played,1_top_Rounds won,...,2_All_draws,2_All_losses,2_All_Total rounds played,2_All_Rounds won,2_All_Win percent,2_All_Pistol rounds,2_All_Pistol rounds won,2_All_Pistol round win percent,2_All_CT round win percent,2_All_T round win percent
0,4,-1,28,46,3,0,0,3,80,32,...,0,2,140,82,0.667,12,7,0.583,0.774,0.436
1,0,-1,28,46,1,0,0,1,27,11,...,0,2,135,86,0.667,12,8,0.667,0.756,0.578
2,3,1,1,223,7,4,0,3,182,106,...,0,2,151,80,0.667,12,7,0.583,0.556,0.511
3,0,1,1,223,2,2,0,0,54,32,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0
4,5,-1,14,101,4,2,0,2,106,60,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [3]:
# Target is the `result` column; every remaining column is used as a feature.
y = df['result']
X = df.drop(columns='result')

# Shuffled K-fold splitter with a fixed seed, shared by the evaluations so that
# all models are scored on the same folds.
cv = KFold(shuffle=True, random_state=1)

In [4]:
# Project the feature matrix onto its first 7 principal components.
# NOTE(review): PCA is fitted on the raw (unscaled) features and on all rows
# before cross-validation; the leading components may therefore be dominated by
# large-magnitude columns -- confirm this is intended before relying on it.
transform = PCA(n_components=7)
X_pca = pd.DataFrame(data=transform.fit_transform(X))

# Share of the total variance retained by the kept components.
np.sum(transform.explained_variance_ratio_)

0.9984858559604698

In [5]:
# Summary statistics of the components, one row per component.
X_pca.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1813.0,-4.554058e-15,216.876379,-353.239265,-160.919944,-23.895762,128.153592,817.622956
1,1813.0,9.719504e-15,192.329384,-395.43082,-148.087887,-32.40888,106.859297,730.467192
2,1813.0,-1.187112e-14,89.752206,-265.860488,-62.157475,-18.37494,48.64419,347.776248
3,1813.0,1.521906e-14,69.411648,-223.226782,-46.947616,-9.132695,38.831643,327.249998
4,1813.0,1.840631e-14,53.390029,-113.898587,-34.590452,-7.481863,30.990469,292.355667
5,1813.0,1.084822e-14,30.228456,-137.503874,-14.194324,-0.979217,13.420253,143.039846
6,1813.0,1.663191e-15,11.77551,-47.627658,-6.179987,-0.947828,5.937188,59.045546


In [6]:
# Export a small preview (first 5 rows, at most the first 10 columns) to Excel.
X_pca.iloc[:5, :10].to_excel('pca.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [10]:
# Majority-class baseline: the score any useful model has to beat.
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(estimator=model_dummy, X=X_pca, y=y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 


In [12]:
# Perceptron with default hyper-parameters, scored on the shared CV folds.
model_perceptron = Perceptron()
result_perceptron = cross_val_score(estimator=model_perceptron, X=X_pca, y=y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.60330579 0.5399449  0.54820937 0.54696133 0.55524862], mean: 0.55873399996956


In [13]:
# Perceptron search grid: regularisation type and its strength `alpha`.
# "No penalty" is spelled as the Python object None: current scikit-learn
# releases reject the string 'none', and with warnings silenced those
# candidates would quietly score NaN instead of being evaluated.
param = {'penalty': ['l2', 'l1', None],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [14]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.57 s


({'alpha': 1e-06, 'penalty': 'l1'}, 0.5631538894723225, 0.023823703697290505)

<h3>Метод опорных векторов</h3>

In [15]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.61432507 0.58126722 0.59504132 0.56077348 0.5718232 ], mean: 0.58464605877966
Wall time: 661 ms


In [16]:
# SVC search grid: C over the powers of ten 10**-3 .. 10**1, three kernels.
param = dict(
    C=[10**exponent for exponent in range(-3, 2)],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [17]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.95 s


({'C': 1, 'kernel': 'rbf'}, 0.58464605877966, 0.018626042474452584)

In [18]:
# Polynomial-kernel SVC at C=1: search only over the polynomial degree (1..10).
param = dict(
    C=[1],
    kernel=['poly'],
    degree=range(1, 11),
)

In [19]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 6.85 s


({'C': 1, 'degree': 1, 'kernel': 'poly'},
 0.5862898193385385,
 0.024883772901611846)

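Полиномиальное ядро степени 1 (0.586) и RBF-ядро (0.585) дают почти одинаковое качество — результаты почти не отличаются.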

<h3>Логистическая регрессия</h3>

In [20]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.61432507 0.60055096 0.61707989 0.58563536 0.58839779], mean: 0.6011978144072568
Wall time: 22 ms


In [21]:
# Logistic-regression search grid: C over the powers of ten 10**-3 .. 10**1.
param = dict(C=[10**exponent for exponent in range(-3, 2)])

In [23]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 76.1 ms


({'C': 0.001}, 0.6022997427819126, 0.014141796538373476)

<h3>Дерево решений</h3>

In [24]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.52892562 0.53168044 0.5399449  0.53314917 0.53867403], mean: 0.5344748337214436
Wall time: 49.8 ms


In [25]:
# Decision-tree search grid: split criterion and maximum depth (2..49).
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
)

In [26]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 825 ms


({'criterion': 'entropy', 'max_depth': 2},
 0.5658980564053391,
 0.020826670795880027)

In [27]:
# Wider decision-tree grid: coarser depth steps plus split / leaf size limits.
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50, 3),
    min_samples_split=range(2, 20, 2),
    min_samples_leaf=range(1, 12, 2),
)

In [28]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 11.7 s


({'criterion': 'entropy',
  'max_depth': 2,
  'min_samples_leaf': 1,
  'min_samples_split': 2},
 0.5658980564053391)

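Всё грустно: даже после подбора гиперпараметров дерево решений даёт точность около 0.566 — лишь немного выше константного классификатора (0.550).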

<h3>Случайный лес</h3>


In [31]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.58402204 0.5785124  0.57024793 0.54696133 0.58287293], mean: 0.5725233246579304
Wall time: 311 ms


In [32]:
# Random-forest search grid: forest size, split criterion and maximum depth.
param = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    criterion=['gini', 'entropy'],
    max_depth=range(1, 51, 10),
)

In [34]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1min 11s


({'criterion': 'gini', 'max_depth': 11, 'n_estimators': 1000},
 0.5874176217219913,
 0.011859176055623718)

Я хочу плакать, всё очень плохо: лучший случайный лес даёт около 0.587, что по-прежнему хуже логистической регрессии (0.602).

<h3>Градиентный бустинг</h3>

In [36]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_pca, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.56198347 0.58953168 0.58402204 0.56077348 0.59392265], mean: 0.578046664535866
Wall time: 769 ms


In [37]:
# XGBoost search grid: number of boosting rounds and maximum tree depth.
param = dict(
    n_estimators=[100, 200],
    max_depth=range(10, 51, 2),
)

In [38]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_pca,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 23.6 s


({'max_depth': 24, 'n_estimators': 200},
 0.5752735795930171,
 0.024863465671416386)