In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [6]:
df = pd.read_csv('../data/df_split_by_map_part_info.csv')
df.head()

Unnamed: 0,map,result,team1_world_rank,team2_world_rank,1_top_Times played,1_top_wins,1_top_draws,1_top_losses,1_top_Total rounds played,1_top_Rounds won,...,2_All_draws,2_All_losses,2_All_Total rounds played,2_All_Rounds won,2_All_Win percent,2_All_Pistol rounds,2_All_Pistol rounds won,2_All_Pistol round win percent,2_All_CT round win percent,2_All_T round win percent
0,Nuke,-1,28,46,3,0,0,3,80,32,...,0,2,140,82,0.667,12,7,0.583,0.774,0.436
1,Ancient,-1,28,46,1,0,0,1,27,11,...,0,2,135,86,0.667,12,8,0.667,0.756,0.578
2,Mirage,1,1,223,7,4,0,3,182,106,...,0,2,151,80,0.667,12,7,0.583,0.556,0.511
3,Ancient,1,1,223,2,2,0,0,54,32,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0
4,Overpass,-1,14,101,4,2,0,2,106,60,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1813 entries, 0 to 1812
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   map                             1813 non-null   object 
 1   result                          1813 non-null   int64  
 2   team1_world_rank                1813 non-null   int64  
 3   team2_world_rank                1813 non-null   int64  
 4   1_top_Times played              1813 non-null   int64  
 5   1_top_wins                      1813 non-null   int64  
 6   1_top_draws                     1813 non-null   int64  
 7   1_top_losses                    1813 non-null   int64  
 8   1_top_Total rounds played       1813 non-null   int64  
 9   1_top_Rounds won                1813 non-null   int64  
 10  1_top_Win percent               1813 non-null   float64
 11  1_top_Pistol rounds             1813 non-null   int64  
 12  1_top_Pistol rounds won         18

In [8]:
maps_decoder = LabelEncoder()
df['map'] = maps_decoder.fit_transform(df['map'])

In [9]:
cv = KFold(shuffle=True, random_state=1)
X = df.drop('result', axis=1)
y = df['result']

<h2>Обучение моделей. Сравнение результатов.</h2>

In [10]:
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(model_dummy, X, y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [11]:
model_perceptron = Perceptron()
result_perceptron = cross_val_score(model_perceptron, X, y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.56749311 0.46280992 0.42975207 0.58839779 0.54143646], mean: 0.5179778701124759


In [12]:
param = {'penalty': ['l2', 'l1'],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [13]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.65 s


({'alpha': 0.1, 'penalty': 'l1'}, 0.5499048749676575)

Еле-еле перешагнули порог

<h3>Метод опорных векторов</h3>

In [14]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.63636364 0.57300275 0.58402204 0.55801105 0.56629834], mean: 0.583539564403452
Wall time: 187 ms


И сразу хорошо

In [15]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1],
         'kernel': ['rbf', 'poly', 'sigmoid']}

In [16]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.72 s


({'C': 10, 'kernel': 'rbf'}, 0.5835502184070742)

In [17]:
param = {'C': [1],
         'kernel': ['poly'],
         'degree': range(1,11)}

In [18]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 20.7 s


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.5857373331506932)

In [19]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1,1,poly,"{'C': 1, 'degree': 1, 'kernel': 'poly'}",0.61157,0.61157,0.592287,0.549724,0.563536,0.585737,0.025169,1
1,1,2,poly,"{'C': 1, 'degree': 2, 'kernel': 'poly'}",0.589532,0.600551,0.595041,0.524862,0.552486,0.572494,0.029172,2
2,1,3,poly,"{'C': 1, 'degree': 3, 'kernel': 'poly'}",0.573003,0.586777,0.573003,0.519337,0.549724,0.560369,0.023721,3
3,1,4,poly,"{'C': 1, 'degree': 4, 'kernel': 'poly'}",0.567493,0.592287,0.570248,0.513812,0.549724,0.558713,0.026208,4
9,1,10,poly,"{'C': 1, 'degree': 10, 'kernel': 'poly'}",0.573003,0.597796,0.575758,0.497238,0.546961,0.558151,0.034461,5


Пока что не очень хорошо

<h3>Логистическая регрессия</h3>

In [20]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.63636364 0.6446281  0.5922865  0.56629834 0.58563536], mean: 0.6050423877144118
Wall time: 45.6 ms


Супер быстро и не плохой счет

In [21]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1]}

In [24]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 167 ms


({'C': 1}, 0.6050423877144118)

Как жаль, что тут нечего больше улучшать

<h3>Дерево решений.</h3>

In [25]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.51515152 0.54269972 0.5399449  0.55524862 0.50552486], mean: 0.5317139247827345
Wall time: 76.6 ms


Плохо, но зато быстро считается, что радует

In [27]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50)}

In [28]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.91 s


({'criterion': 'gini', 'max_depth': 2}, 0.5912895910384609)

уже лучше, как мы видим важна малая глубина => будут хорошо работать асамблеи из деревьев

In [30]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50),
         'min_samples_split': range(2,20),
         'min_samples_leaf': range(1,10)}

In [31]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 4min 12s


({'criterion': 'gini',
  'max_depth': 2,
  'min_samples_leaf': 1,
  'min_samples_split': 2},
 0.5912895910384609)

Все равно не очень хорошо

<h3>Случайный лес</h3>

In [32]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.56473829 0.58126722 0.55922865 0.59392265 0.50828729], mean: 0.5614888209061991
Wall time: 1.64 s


In [35]:
param = {'n_estimators': [100,200,500,1000,2000],
         'criterion': ['gini', 'entropy'],
         'max_depth': range(1,51,10)}

In [36]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 26s


({'criterion': 'entropy', 'max_depth': 11, 'n_estimators': 200},
 0.5857540751563856)

In [37]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head(20)

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
31,entropy,11,200,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.595041,0.603306,0.586777,0.588398,0.555249,0.585754,0.016327,1
32,entropy,11,500,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.603306,0.597796,0.592287,0.571823,0.552486,0.58354,0.018826,2
34,entropy,11,2000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.589532,0.608815,0.581267,0.582873,0.552486,0.582995,0.018128,3
7,gini,11,500,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.600551,0.608815,0.578512,0.571823,0.549724,0.581885,0.021074,4
40,entropy,31,100,"{'criterion': 'entropy', 'max_depth': 31, 'n_e...",0.614325,0.61157,0.53719,0.60221,0.538674,0.580794,0.035229,5
38,entropy,21,1000,"{'criterion': 'entropy', 'max_depth': 21, 'n_e...",0.589532,0.608815,0.570248,0.59116,0.533149,0.578581,0.025791,6
0,gini,1,100,"{'criterion': 'gini', 'max_depth': 1, 'n_estim...",0.600551,0.61157,0.550964,0.574586,0.552486,0.578031,0.024613,7
6,gini,11,200,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.581267,0.595041,0.592287,0.566298,0.555249,0.578028,0.01523,8
9,gini,11,2000,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.586777,0.603306,0.575758,0.577348,0.541436,0.576925,0.020268,9
8,gini,11,1000,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.586777,0.606061,0.581267,0.569061,0.541436,0.57692,0.021383,10


Все не очень хорошо

<h3>Градиентный бустинг</h3>

In [38]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.58402204 0.58402204 0.57024793 0.56629834 0.52486188], mean: 0.5658904464027519
Wall time: 940 ms


In [39]:
param = {'n_estimators': [100,200],
         'max_depth': range(10,21)}

In [40]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 20.8 s


({'max_depth': 13, 'n_estimators': 200}, 0.585761685158973)

все грустно и печально, придется переходить к следующей обработке данных