In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.preprocessing import PowerTransformer

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
df = pd.read_csv('../data/df_split_by_map_part_info.csv')
df.head()

Unnamed: 0,map,result,team1_world_rank,team2_world_rank,1_top_Times played,1_top_wins,1_top_draws,1_top_losses,1_top_Total rounds played,1_top_Rounds won,...,2_All_draws,2_All_losses,2_All_Total rounds played,2_All_Rounds won,2_All_Win percent,2_All_Pistol rounds,2_All_Pistol rounds won,2_All_Pistol round win percent,2_All_CT round win percent,2_All_T round win percent
0,Nuke,-1,28,46,3,0,0,3,80,32,...,0,2,140,82,0.667,12,7,0.583,0.774,0.436
1,Ancient,-1,28,46,1,0,0,1,27,11,...,0,2,135,86,0.667,12,8,0.667,0.756,0.578
2,Mirage,1,1,223,7,4,0,3,182,106,...,0,2,151,80,0.667,12,7,0.583,0.556,0.511
3,Ancient,1,1,223,2,2,0,0,54,32,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0
4,Overpass,-1,14,101,4,2,0,2,106,60,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [3]:
maps_decoder = LabelEncoder()
df['map'] = maps_decoder.fit_transform(df['map'])
cv = KFold(shuffle=True, random_state=1)
X = df.drop('result', axis=1)
y = df['result']

In [4]:
X_norm = pd.DataFrame()
transform = PowerTransformer()
for i in X.columns:
    X_norm[i] = transform.fit_transform(np.array(X[i]).reshape(-1, 1)).reshape(1, -1)[0]
X_norm.head()

Unnamed: 0,map,team1_world_rank,team2_world_rank,1_top_Times played,1_top_wins,1_top_draws,1_top_losses,1_top_Total rounds played,1_top_Rounds won,1_top_Win percent,...,2_All_draws,2_All_losses,2_All_Total rounds played,2_All_Rounds won,2_All_Win percent,2_All_Pistol rounds,2_All_Pistol rounds won,2_All_Pistol round win percent,2_All_CT round win percent,2_All_T round win percent
0,0.546561,-0.461564,-0.325356,-0.329416,-1.372699,0.0,0.461334,-0.227559,-0.390533,-1.401007,...,0.0,-0.168699,-0.015431,0.131736,0.628533,0.106223,0.289221,0.509074,2.046168,0.021071
1,-1.697826,-0.461564,-0.325356,-1.014705,-1.372699,0.0,-0.511795,-0.857022,-0.95448,-1.401007,...,0.0,-0.168699,-0.051636,0.182263,0.628533,0.106223,0.448382,0.927165,1.854781,1.103045
2,0.048231,-2.183894,1.700456,0.505794,0.61737,0.0,0.461334,0.478438,0.616456,0.441944,...,0.0,-0.168699,0.061945,0.10598,0.628533,0.106223,0.289221,0.509074,0.121955,0.562669
3,-1.697826,-2.183894,1.700456,-0.630374,-0.017951,0.0,-1.416822,-0.489042,-0.390533,1.769659,...,0.0,-1.556556,-1.922741,-1.871938,-1.544899,-1.789572,-1.678101,-1.858974,-2.047743,-2.024563
4,1.022167,-0.99209,0.552079,-0.078349,-0.017951,0.0,0.048862,-0.012298,0.077822,0.21809,...,0.0,-1.556556,-1.922741,-1.871938,-1.544899,-1.789572,-1.678101,-1.858974,-2.047743,-2.024563


<h2>Обучение моделей. Сравнение результатов.</h2>

In [5]:
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(model_dummy, X_norm, y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [6]:
model_perceptron = Perceptron()
result_perceptron = cross_val_score(model_perceptron, X_norm, y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.57575758 0.44352617 0.53443526 0.56629834 0.52486188], mean: 0.5289758458517877


In [7]:
param = {'penalty': ['l2', 'l1','none'],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [8]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.65 s


({'alpha': 0.001, 'penalty': 'l1'}, 0.5763937719738824, 0.0094157717855229)

+2.5%

<h3>Метод опорных векторов</h3>

In [9]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.61432507 0.60881543 0.58953168 0.56353591 0.55524862], mean: 0.586291341339056
Wall time: 178 ms


In [10]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1],
         'kernel': ['rbf', 'poly', 'sigmoid']}

In [11]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.52 s


({'C': 1, 'kernel': 'rbf'}, 0.586291341339056, 0.023601628408420876)

In [12]:
param = {'C': [1],
         'kernel': ['poly'],
         'degree': range(1,11)}

In [13]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.19 s


({'C': 1, 'degree': 3, 'kernel': 'poly'},
 0.5785900187206063,
 0.009197126108774066)

+3.5%

<h3>Логистическая регрессия</h3>

In [14]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.60881543 0.61707989 0.5922865  0.58563536 0.56353591], mean: 0.5934706177800101
Wall time: 43.6 ms


In [15]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1]}

In [16]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 118 ms


({'C': 0.1}, 0.5973395430954446, 0.019435847683895772)

не плохо

<h3>Дерево решений.</h3>

In [18]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.52341598 0.54820937 0.5261708  0.5359116  0.53867403], mean: 0.534476355721961
Wall time: 67.1 ms


In [19]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50)}

In [20]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.91 s


({'criterion': 'gini', 'max_depth': 2},
 0.5907386268511331,
 0.030794618679773296)

In [21]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50,3),
         'min_samples_split': range(2,20,2),
         'min_samples_leaf': range(1,12,2)}

In [23]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 26.1 s


({'criterion': 'gini',
  'max_depth': 2,
  'min_samples_leaf': 1,
  'min_samples_split': 2},
 0.5907386268511331)

Это все печально

<h3>Случайный лес</h3>

In [24]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.56473829 0.59779614 0.54269972 0.59392265 0.54696133], mean: 0.5692236275360334
Wall time: 405 ms


In [25]:
param = {'n_estimators': [100,200,500,1000,2000],
         'criterion': ['gini', 'entropy'],
         'max_depth': range(1,51,10)}

In [27]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1min 23s


({'criterion': 'entropy', 'max_depth': 11, 'n_estimators': 1000},
 0.5857510311553507,
 0.018978187768778926)

Еле еле душа в теле, короче так не работает

<h3>Градиентный бустинг</h3>

In [28]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.58677686 0.58677686 0.57024793 0.56629834 0.52486188], mean: 0.5669923747774075
Wall time: 945 ms


In [29]:
param = {'n_estimators': [100,200],
         'max_depth': range(10,51,2)}

In [31]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 37.9 s


({'max_depth': 16, 'n_estimators': 100},
 0.5830053422218164,
 0.01047082000614565)

In [32]:
model_xgb = XGBClassifier(max_depth=16, n_estimators=2000)
result_xgb = cross_val_score(model_xgb, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.57024793 0.58677686 0.59504132 0.6160221  0.52762431], mean: 0.5791425049084516
