In [1]:
# Data handling, cross-validation helpers and preprocessing transformers.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler

# Plotting defaults: 12x8 inch figures rendered at 100 dpi.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

# Classifiers compared in this notebook (baseline, linear, kernel, tree-based, boosting).
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Exhaustive hyper-parameter search with cross-validation.
from sklearn.model_selection import GridSearchCV

# NOTE(review): this silences every warning raised in the kernel, including
# convergence and deprecation warnings from the models fitted below.
import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
# Read the per-map match table from disk and print its structural summary
# (row range, number of columns, dtype counts, memory usage).
df = pd.read_csv(filepath_or_buffer='../data/df_split_by_map_all_info.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1813 entries, 0 to 1812
Columns: 1025 entries, version to map
dtypes: float64(339), int64(679), object(7)
memory usage: 14.2+ MB


In [3]:
# Drop the columns that are not used as model inputs, then preview the frame.
# NOTE(review): the cell rebinds `df`, so running it twice raises KeyError
# because the columns are already gone.
df = df.drop(
    columns=[
        'version', 'url', 'number_of_confrontation', 'datetime1',
        'ratio1', 'ratio2',
        'name1', 'number1', 'name2', 'number2',
        'team1_stat_url', 'team2_stat_url',
    ]
)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Rounds won,2_46_topAll_Win percent,2_46_topAll_Pistol rounds,2_46_topAll_Pistol rounds won,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,map
0,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Nuke
1,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Ancient
2,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Mirage
3,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Ancient
4,14,101,0,0,0,0,0,0,0.0,0,...,0,0.0,0,0,0.0,0.0,0.0,-1,3,Overpass


In [5]:
# Replace the textual `map` column with integer codes. The fitted encoder is
# kept in `maps_decoder` so the codes can be translated back to map names.
# NOTE(review): the codes are plain integers in sorted-label order; the scaled
# linear/kernel models below will treat them as an ordered numeric feature.
# Consider one-hot encoding if that ordering is not intended.
maps_decoder = LabelEncoder().fit(df['map'])
df['map'] = maps_decoder.transform(df['map'])

In [6]:
# Shared 5-fold shuffled splitter (seeded, so every model sees the same folds),
# plus the target vector and the feature matrix.
# NOTE(review): the df.head() preview shows consecutive rows whose features
# are identical except for `map` and that share the same `result`. A shuffled
# KFold can place such rows in different folds; consider a group-aware
# splitter keyed on the match — TODO confirm.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
y = df['result']
X = df.drop(columns='result')

In [7]:
# Standardise every feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before any CV split, so each
# validation fold contributes to the mean/std used to scale its training
# folds. Wrapping scaler + model in a Pipeline would keep the statistics
# fold-local.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [8]:
# Save a 5-row x 10-column sample of the standardised matrix to Excel.
# The frame is built from `X_scaled` alone, so no column names from `X` are passed.
pd.DataFrame(X_scaled).iloc[:5, :10].to_excel('standart.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [9]:
# Baseline: always predict the most frequent class; its CV accuracy is the
# floor every real model has to beat.
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(estimator=model_dummy, X=X_scaled, y=y, cv=cv)
print(
    f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}'
)

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [10]:
# Perceptron with default hyper-parameters, scored on the shared CV folds.
model_perceptron = Perceptron()
result_perceptron = cross_val_score(estimator=model_perceptron, X=X_scaled, y=y, cv=cv)
print(
    f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}'
)

name: Perceptron, score: [0.56749311 0.53443526 0.55647383 0.54143646 0.59944751], mean: 0.5598572363514603


In [11]:
# Perceptron search space: regularisation type and strength (1e-6 ... 1e2,
# one value per decade).
param = {
    'penalty': ['l2', 'l1'],
    'alpha': [10**exponent for exponent in range(-6, 3)],
}

In [12]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2.78 s


({'alpha': 1e-06, 'penalty': 'l1'}, 0.5609454667214587)

In [13]:
# Five best parameter combinations by mean CV score; the first four
# cv_results_ columns are skipped to keep the table narrow.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,1e-06,l1,"{'alpha': 1e-06, 'penalty': 'l1'}",0.567493,0.570248,0.550964,0.549724,0.566298,0.560945,0.008759,1
3,1e-05,l1,"{'alpha': 1e-05, 'penalty': 'l1'}",0.545455,0.592287,0.517906,0.549724,0.596685,0.560411,0.029921,2
4,0.0001,l2,"{'alpha': 0.0001, 'penalty': 'l2'}",0.482094,0.592287,0.589532,0.535912,0.577348,0.555434,0.041859,3
7,0.001,l1,"{'alpha': 0.001, 'penalty': 'l1'}",0.600551,0.561983,0.545455,0.519337,0.533149,0.552095,0.028007,4
0,1e-06,l2,"{'alpha': 1e-06, 'penalty': 'l2'}",0.581267,0.575758,0.473829,0.541436,0.582873,0.551033,0.041453,5


Линейная модель, построенная на ошибке перцептрона, дала прирост около 1 процентного пункта относительно константной модели (0.550 → 0.561), что не может не радовать, т.к. другие модели, скорее всего, дадут лучшие результаты.

<h3>Метод опорных векторов</h3>

In [14]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.62534435 0.61432507 0.61432507 0.5359116  0.56906077], mean: 0.5917933732097469
Wall time: 2.66 s


In [15]:
# SVC search space: regularisation strength C (1e-3 ... 1e1) crossed with
# three kernel types.
param = {
    'C': [10**exponent for exponent in range(-3, 2)],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

In [16]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 49.8 s


({'C': 10, 'kernel': 'rbf'}, 0.6100010654003623)

In [17]:
# Five best SVC configurations by mean CV score (first four cv_results_
# columns omitted).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.641873,0.644628,0.61157,0.558011,0.593923,0.610001,0.032183,1
13,10,poly,"{'C': 10, 'kernel': 'poly'}",0.62259,0.62259,0.600551,0.555249,0.566298,0.593455,0.02809,2
9,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.625344,0.614325,0.614325,0.535912,0.569061,0.591793,0.034002,3
11,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.592287,0.61157,0.559229,0.530387,0.569061,0.572507,0.027855,4
10,1,poly,"{'C': 1, 'kernel': 'poly'}",0.597796,0.589532,0.561983,0.533149,0.549724,0.566437,0.024183,5


In [18]:
# Follow-up search: polynomial kernel only, degree 1..10, with C held at 1.
# NOTE(review): the preceding search ranked C=10 above C=1 for the poly
# kernel; confirm that fixing C=1 here is intentional.
param = dict(
    C=[1],
    kernel=['poly'],
    degree=range(1, 11),
)

In [19]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 34.2 s


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.5835410864039694)

In [20]:
# Five best polynomial degrees by mean CV score (first four cv_results_
# columns omitted).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1,1,poly,"{'C': 1, 'degree': 1, 'kernel': 'poly'}",0.608815,0.597796,0.584022,0.530387,0.596685,0.583541,0.027714,1
1,1,2,poly,"{'C': 1, 'degree': 2, 'kernel': 'poly'}",0.625344,0.608815,0.570248,0.527624,0.563536,0.579114,0.034618,2
2,1,3,poly,"{'C': 1, 'degree': 3, 'kernel': 'poly'}",0.597796,0.589532,0.561983,0.533149,0.549724,0.566437,0.024183,3
7,1,8,poly,"{'C': 1, 'degree': 8, 'kernel': 'poly'}",0.597796,0.570248,0.559229,0.535912,0.552486,0.563134,0.0206,4
6,1,7,poly,"{'C': 1, 'degree': 7, 'kernel': 'poly'}",0.600551,0.575758,0.550964,0.530387,0.552486,0.562029,0.02403,5


<h3>Дерево решений.</h3>

In [21]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.56749311 0.60606061 0.61983471 0.51933702 0.58287293], mean: 0.5791196749006894
Wall time: 531 ms


In [22]:
# Decision-tree search space: split criterion crossed with maximum depth 2..49.
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 50),
)

In [23]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 14.6 s


({'criterion': 'entropy', 'max_depth': 47}, 0.6033971051550158)

In [24]:
# Five best tree configurations by mean CV score (first four cv_results_
# columns omitted).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(5)
)

Unnamed: 0,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
93,entropy,47,"{'criterion': 'entropy', 'max_depth': 47}",0.586777,0.606061,0.658402,0.563536,0.60221,0.603397,0.031302,1
69,entropy,23,"{'criterion': 'entropy', 'max_depth': 23}",0.581267,0.592287,0.636364,0.555249,0.596685,0.59237,0.026288,2
72,entropy,26,"{'criterion': 'entropy', 'max_depth': 26}",0.581267,0.603306,0.625344,0.555249,0.596685,0.59237,0.023353,2
90,entropy,44,"{'criterion': 'entropy', 'max_depth': 44}",0.586777,0.614325,0.62259,0.560773,0.571823,0.591258,0.023836,4
85,entropy,39,"{'criterion': 'entropy', 'max_depth': 39}",0.573003,0.597796,0.61708,0.558011,0.604972,0.590172,0.021589,5


<h3>Логистическая регрессия</h3>

In [25]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.60881543 0.61707989 0.60330579 0.53314917 0.5718232 ], mean: 0.5868346955237965
Wall time: 582 ms


In [26]:
# Logistic-regression search space: inverse regularisation strength C,
# one value per decade from 1e-3 to 1e1.
param = {
    'C': [10**exponent for exponent in range(-3, 2)],
}

In [27]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2.71 s


({'C': 0.1}, 0.6077865546474286)

In [28]:
# Up to ten best C values by mean CV score (first four cv_results_ columns
# omitted).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head(10)
)

Unnamed: 0,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.1,{'C': 0.1},0.630854,0.633609,0.641873,0.552486,0.58011,0.607787,0.035171,1
1,0.01,{'C': 0.01},0.61157,0.628099,0.619835,0.535912,0.588398,0.596763,0.033185,2
0,0.001,{'C': 0.001},0.606061,0.62259,0.592287,0.546961,0.60221,0.594022,0.025479,3
3,1.0,{'C': 1},0.608815,0.61708,0.603306,0.533149,0.571823,0.586835,0.030909,4
4,10.0,{'C': 10},0.606061,0.614325,0.578512,0.530387,0.560773,0.578012,0.030565,5


Подбор параметра C прибавил около 2 процентных пунктов точности (0.587 → 0.608).

<h3>Случайный лес</h3>

In [29]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.60881543 0.63085399 0.62534435 0.56629834 0.60773481], mean: 0.6078093846551907
Wall time: 1.19 s


In [30]:
# Random-forest search space: ensemble size, split criterion and maximum
# depth (1, 11, 21, 31, 41).
param = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    criterion=['gini', 'entropy'],
    max_depth=range(1, 51, 10),
)

In [31]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2min 44s


({'criterion': 'entropy', 'max_depth': 11, 'n_estimators': 100},
 0.6326179930901177)

In [32]:
# Five best forest configurations by mean CV score (first four cv_results_
# columns omitted).
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head()
)

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,entropy,11,100,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.652893,0.669421,0.639118,0.588398,0.61326,0.632618,0.028768,1
31,entropy,11,200,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.647383,0.68595,0.630854,0.566298,0.621547,0.630407,0.038889,2
32,entropy,11,500,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.630854,0.680441,0.641873,0.563536,0.624309,0.628203,0.037735,3
33,entropy,11,1000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.633609,0.680441,0.641873,0.566298,0.61326,0.627096,0.037401,4
34,entropy,11,2000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.625344,0.680441,0.636364,0.560773,0.621547,0.624894,0.038314,5


Неплохо: подбор параметров поднял точность случайного леса с 0.608 до 0.633.

<h3>Градиентный бустинг</h3>

In [33]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.61432507 0.60881543 0.63636364 0.54972376 0.57734807], mean: 0.597315191087165
Wall time: 4.94 s


In [34]:
# XGBoost search space: number of boosting rounds crossed with maximum tree
# depth (even values 10..50).
param = dict(
    n_estimators=[100, 200],
    max_depth=range(10, 51, 2),
)

In [35]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 6min 17s


({'max_depth': 12, 'n_estimators': 100}, 0.6088945710241541)

In [48]:
# Five best XGBoost configurations by mean CV score (first four cv_results_
# columns omitted).
# NOTE(review): the saved output of this cell (execution count 48, best mean
# 0.623) does not match the best score reported by the grid-search cell
# above (0.6089). Re-run the notebook top to bottom so the table reflects
# the same search.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .iloc[:, 4:]
    .head()
)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
34,44,100,"{'max_depth': 44, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
12,22,100,"{'max_depth': 22, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
28,38,100,"{'max_depth': 38, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
24,34,100,"{'max_depth': 34, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1
22,32,100,"{'max_depth': 32, 'n_estimators': 100}",0.64497,0.64497,0.607143,0.636905,0.583333,0.623464,0.024424,1


Тоже не улучшилось. Вывод: стандартизация данных помогает только линейным методам, а результаты методов на основе деревьев решений скорее ухудшает.