In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
# Load the match table and discard best-of-5 series whose fifth map
# is recorded as 'did not play'.
df = pd.read_csv('../data/df_games.csv')
df = df.drop(
    index=df.index[(df['best_of'] == 5) & (df['5_map'] == 'did not play')]
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 844
Columns: 1029 entries, version to 5_map
dtypes: float64(339), int64(679), object(11)
memory usage: 6.6+ MB


In [3]:
# Remove identifier / bookkeeping columns that are not used as model features.
df = df.drop(
    columns=[
        'version', 'url', 'number_of_confrontation', 'datetime1',
        'ratio1', 'ratio2',
        'name1', 'number1', 'name2', 'number2',
        'team1_stat_url', 'team2_stat_url',
    ]
)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,1_map,2_map,3_map,4_map,5_map
0,28,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,-1,3,Nuke,Ancient,Inferno,did not play,did not play
1,1,223,2,2,0,0,54,32,1.0,4,...,0.5,0.448,0.552,1,3,Mirage,Ancient,Inferno,did not play,did not play
2,14,101,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Overpass,Dust2,Vertigo,did not play,did not play
3,3,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,1,3,Dust2,Inferno,Nuke,did not play,did not play
4,7,1,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Nuke,Overpass,Mirage,did not play,did not play


In [4]:
# Encode map names as integers with a single encoder shared by all five map columns.
# The encoder is fitted on the values of every map column (not only '3_map'), so
# transform() cannot raise on a map name that is missing from one particular column.
# When '3_map' already contains every name, the resulting codes are unchanged.
map_columns = [f'{i}_map' for i in range(1, 6)]
maps_decoder = LabelEncoder()
maps_decoder.fit(df[map_columns].to_numpy().ravel())
for num_map in range(1, 6):
    df[f'{num_map}_map'] = maps_decoder.transform(df[f'{num_map}_map'])


In [5]:
# Target vector and feature matrix, plus the shuffled K-fold splitter that is
# reused by every cross-validation call below (5 folds, as the score arrays show).
y = df['result']
X = df.drop(columns='result')
cv = KFold(n_splits=5, shuffle=True, random_state=1)

In [6]:
# Standardise every feature (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole data set before cross-validation,
# so each validation fold contributes to the scaling statistics; wrapping the scaler
# and the model in a Pipeline would avoid that leakage.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [12]:
# Export a 5-row x 10-column preview of the standardised features to Excel.
pd.DataFrame(X_scaled).iloc[:5, :10].to_excel('standart.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [9]:
# Majority-class baseline: the accuracy any real model has to beat.
model_dummy = DummyClassifier(strategy='most_frequent')
result_dummy = cross_val_score(estimator=model_dummy, X=X_scaled, y=y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.59763314 0.55621302 0.5297619  0.58333333 0.56547619], mean: 0.5664835164835165


<h3>Ошибка перцептрона</h3> 

In [10]:
# Perceptron with default hyper-parameters, scored with the shared `cv` splitter.
model_perceptron = Perceptron()
result_perceptron = cross_val_score(
    estimator=model_perceptron, X=X_scaled, y=y, cv=cv
)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.56804734 0.56804734 0.56547619 0.61309524 0.58333333], mean: 0.579599887292195


In [11]:
# Perceptron search space: penalty type and regularisation strength
# (alpha spans the decades 1e-6 ... 1e2).
param = dict(
    penalty=['l2', 'l1'],
    alpha=[10**exponent for exponent in range(-6, 3)],
)

In [13]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 659 ms


({'alpha': 0.0001, 'penalty': 'l2'}, 0.5973161453930685)

In [17]:
# Five best parameter combinations by mean test score; the four leading
# columns of cv_results_ are left out of the display.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.0001,l2,"{'alpha': 0.0001, 'penalty': 'l2'}",0.615385,0.639053,0.583333,0.607143,0.541667,0.597316,0.033038,1
2,1e-05,l2,"{'alpha': 1e-05, 'penalty': 'l2'}",0.609467,0.579882,0.625,0.559524,0.565476,0.58787,0.025353,2
3,1e-05,l1,"{'alpha': 1e-05, 'penalty': 'l1'}",0.621302,0.579882,0.559524,0.607143,0.571429,0.587856,0.022921,3
5,0.0001,l1,"{'alpha': 0.0001, 'penalty': 'l1'}",0.532544,0.538462,0.619048,0.64881,0.529762,0.573725,0.050128,4
6,0.001,l2,"{'alpha': 0.001, 'penalty': 'l2'}",0.621302,0.502959,0.577381,0.595238,0.565476,0.572471,0.039517,5


Линейная модель, построенная на ошибке перцептрона, дала +1%, что не может не радовать, так как другие модели, скорее всего, дадут лучшие результаты.

<h3>Метод опорных векторов</h3>

In [18]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.65088757 0.60946746 0.56547619 0.61904762 0.5952381 ], mean: 0.6080233868695408
Wall time: 305 ms


In [19]:
# SVC search space: regularisation constant C (1e-3 ... 1e1) and kernel type.
param = dict(
    C=[10**exponent for exponent in range(-3, 2)],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [20]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 8.52 s


({'C': 1, 'kernel': 'sigmoid'}, 0.6151451112989574)

In [21]:
# Top five SVC configurations ranked by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,1.0,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.650888,0.627219,0.553571,0.613095,0.630952,0.615145,0.033072,1
9,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.650888,0.609467,0.565476,0.619048,0.595238,0.608023,0.028052,2
12,10.0,rbf,"{'C': 10, 'kernel': 'rbf'}",0.60355,0.60355,0.607143,0.583333,0.589286,0.597372,0.00932,3
8,0.1,sigmoid,"{'C': 0.1, 'kernel': 'sigmoid'}",0.609467,0.591716,0.547619,0.625,0.571429,0.589046,0.027362,4
10,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.621302,0.579882,0.559524,0.60119,0.559524,0.584284,0.02409,5


In [22]:
# Follow-up SVC search: polynomial kernel at C=1, varying only the degree (1..10).
param = dict(
    C=[1],
    kernel=['poly'],
    degree=range(1, 11),
)

In [23]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 6.25 s


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.6127712031558186)

In [24]:
# Top five polynomial degrees by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1,1,poly,"{'C': 1, 'degree': 1, 'kernel': 'poly'}",0.662722,0.609467,0.577381,0.613095,0.60119,0.612771,0.027902,1
1,1,2,poly,"{'C': 1, 'degree': 2, 'kernel': 'poly'}",0.609467,0.597633,0.583333,0.613095,0.571429,0.594992,0.015719,2
2,1,3,poly,"{'C': 1, 'degree': 3, 'kernel': 'poly'}",0.621302,0.579882,0.559524,0.60119,0.559524,0.584284,0.02409,3
6,1,7,poly,"{'C': 1, 'degree': 7, 'kernel': 'poly'}",0.60355,0.568047,0.559524,0.595238,0.571429,0.579558,0.016861,4
3,1,4,poly,"{'C': 1, 'degree': 4, 'kernel': 'poly'}",0.60355,0.573964,0.565476,0.60119,0.553571,0.579551,0.019741,5


SVM ведёт себя неплохо, но показывает большой разброс на разных сплитах, что печально.

<h3>Дерево решений.</h3>

In [27]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.56213018 0.4852071  0.55357143 0.54761905 0.54166667], mean: 0.5380388841927304
Wall time: 264 ms


In [26]:
# Decision-tree search space: split criterion and maximum depth (2..49).
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
)

In [28]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 6.06 s


({'criterion': 'entropy', 'max_depth': 3}, 0.579613975767822)

In [29]:
# Top five tree configurations by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,entropy,3,"{'criterion': 'entropy', 'max_depth': 3}",0.579882,0.544379,0.589286,0.595238,0.589286,0.579614,0.018291,1
1,gini,3,"{'criterion': 'gini', 'max_depth': 3}",0.597633,0.550296,0.571429,0.595238,0.583333,0.579586,0.017373,2
51,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.621302,0.538462,0.553571,0.583333,0.595238,0.578381,0.02953,3
80,entropy,34,"{'criterion': 'entropy', 'max_depth': 34}",0.597633,0.538462,0.559524,0.60119,0.583333,0.576028,0.02382,4
86,entropy,40,"{'criterion': 'entropy', 'max_depth': 40}",0.585799,0.579882,0.559524,0.589286,0.565476,0.575993,0.011574,5


Интересно, что здесь результаты хуже, чем в прошлый раз с деревьями.

<h3>Логистическая регрессия</h3>

In [31]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.57988166 0.59171598 0.60119048 0.58333333 0.53571429], mean: 0.5783671456748379
Wall time: 194 ms


In [32]:
# Logistic-regression search space: inverse regularisation strength C (1e-3 ... 1e1).
param = dict(C=[10**exponent for exponent in range(-3, 2)])

In [34]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 856 ms


({'C': 0.01}, 0.6222879684418146)

In [35]:
# All logistic-regression candidates (up to ten rows) ordered by mean test score;
# the four leading cv_results_ columns are not shown.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(10)
)

Unnamed: 0,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.01,{'C': 0.01},0.650888,0.627219,0.613095,0.625,0.595238,0.622288,0.018253,1
2,0.1,{'C': 0.1},0.627219,0.597633,0.630952,0.642857,0.577381,0.615209,0.024068,2
0,0.001,{'C': 0.001},0.633136,0.615385,0.595238,0.607143,0.613095,0.612799,0.012335,3
3,1.0,{'C': 1},0.579882,0.591716,0.60119,0.583333,0.535714,0.578367,0.022562,4
4,10.0,{'C': 10},0.56213,0.579882,0.595238,0.571429,0.52381,0.566498,0.023962,5


На целый процент прибавили

<h3>Случайный лес</h3>

In [36]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.63905325 0.60946746 0.5952381  0.625      0.58928571], mean: 0.6116089039165963
Wall time: 479 ms


In [37]:
# Coarse random-forest search space: forest size, split criterion and
# maximum depth (1, 11, 21, 31, 41).
param = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    criterion=['gini', 'entropy'],
    max_depth=range(1, 51, 10),
)

In [38]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 14s


({'criterion': 'gini', 'max_depth': 31, 'n_estimators': 100},
 0.6246900535362074)

In [40]:
# Top five forest configurations by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head()
)

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,gini,31,100,"{'criterion': 'gini', 'max_depth': 31, 'n_esti...",0.633136,0.627219,0.642857,0.613095,0.607143,0.62469,0.013039,1
11,gini,21,200,"{'criterion': 'gini', 'max_depth': 21, 'n_esti...",0.64497,0.64497,0.619048,0.613095,0.60119,0.624655,0.017556,2
36,entropy,21,200,"{'criterion': 'entropy', 'max_depth': 21, 'n_e...",0.662722,0.639053,0.625,0.60119,0.595238,0.624641,0.024789,3
13,gini,21,1000,"{'criterion': 'gini', 'max_depth': 21, 'n_esti...",0.650888,0.639053,0.625,0.613095,0.589286,0.623464,0.021322,4
46,entropy,41,200,"{'criterion': 'entropy', 'max_depth': 41, 'n_e...",0.650888,0.639053,0.607143,0.630952,0.583333,0.622274,0.02417,5


Присутствуют незначительные улучшения

In [41]:
# Finer random-forest search space (gini only): forest size, depth 10..40 in
# steps of 5, and the minimum sample counts for splitting a node / forming a leaf.
param = dict(
    n_estimators=[100, 200, 500, 1000],
    criterion=['gini'],
    max_depth=range(10, 41, 5),
    min_samples_split=range(2, 20, 2),
    min_samples_leaf=range(1, 10, 2),
)

In [42]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 16min 31s


({'criterion': 'gini',
  'max_depth': 35,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.6294237813468583)

In [43]:
# Top five configurations of the fine forest search by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head()
)

Unnamed: 0,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
900,gini,35,1,2,100,"{'criterion': 'gini', 'max_depth': 35, 'min_sa...",0.650888,0.633136,0.64881,0.613095,0.60119,0.629424,0.019565,1
453,gini,20,5,12,200,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",0.650888,0.615385,0.630952,0.630952,0.607143,0.627064,0.01504,2
1105,gini,40,1,14,200,"{'criterion': 'gini', 'max_depth': 40, 'min_sa...",0.656805,0.639053,0.613095,0.619048,0.60119,0.625838,0.019746,3
736,gini,30,1,10,100,"{'criterion': 'gini', 'max_depth': 30, 'min_sa...",0.650888,0.615385,0.613095,0.619048,0.625,0.624683,0.013706,4
1083,gini,40,1,2,1000,"{'criterion': 'gini', 'max_depth': 40, 'min_sa...",0.633136,0.639053,0.630952,0.625,0.595238,0.624676,0.015391,5


Не сильно улучшилось, ну и ладно.

<h3>Градиентный бустинг</h3>

In [45]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.61538462 0.63905325 0.60714286 0.63095238 0.54761905], mean: 0.6080304311073542
Wall time: 3.69 s


In [46]:
# XGBoost search space: number of boosting rounds and tree depth (10..50, step 2).
param = dict(
    n_estimators=[100, 200],
    max_depth=range(10, 51, 2),
)

In [47]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 53s


({'max_depth': 16, 'n_estimators': 100}, 0.6234643561566638)

In [48]:
# Top five XGBoost configurations by mean test score
# (the four leading cv_results_ columns are not shown).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head()
)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
34,44,100,"{'max_depth': 44, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
12,22,100,"{'max_depth': 22, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
28,38,100,"{'max_depth': 38, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
24,34,100,"{'max_depth': 34, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
22,32,100,"{'max_depth': 32, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1


Тоже не улучшилось. Вывод — стандартизация данных помогает только линейным методам, а результаты моделей на основе деревьев скорее ухудшает.