In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [46]:
df = pd.read_csv('../data/df_split_by_map_all_info.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1813 entries, 0 to 1812
Columns: 1025 entries, version to map
dtypes: float64(339), int64(679), object(7)
memory usage: 14.2+ MB


In [47]:
df = df.drop(['version', 'url', 'number_of_confrontation', 'datetime1', 'ratio1', 'ratio2', 'name1', 'number1', 'name2', 'number2',
         'team1_stat_url', 'team2_stat_url'], axis=1)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Rounds won,2_46_topAll_Win percent,2_46_topAll_Pistol rounds,2_46_topAll_Pistol rounds won,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,map
0,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Nuke
1,28,46,0,0,0,0,0,0,0.0,0,...,53,0.75,8,4,0.5,0.59,0.526,-1,3,Ancient
2,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Mirage
3,1,223,2,2,0,0,54,32,1.0,4,...,29,0.5,4,2,0.5,0.448,0.552,1,3,Ancient
4,14,101,0,0,0,0,0,0,0.0,0,...,0,0.0,0,0,0.0,0.0,0.0,-1,3,Overpass


In [48]:
maps_decoder = LabelEncoder()
df['map'] = maps_decoder.fit_transform(df['map'])

In [49]:
df['map'][:5]

0    4
1    0
2    3
3    0
4    5
Name: map, dtype: int32

In [50]:
cv = KFold(shuffle=True, random_state=1)
X = df.drop('result', axis=1)
y = df['result']

In [56]:
pd.DataFrame(X).head().iloc[:,:10].to_excel('without.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [51]:
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(model_dummy, X, y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [52]:
model_perceptron = Perceptron()
result_perceptron = cross_val_score(model_perceptron, X, y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.49035813 0.5922865  0.58677686 0.54972376 0.53867403], mean: 0.5515638555317108


In [53]:
param = {'penalty': ['l2', 'l1'],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [54]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 3.13 s


({'alpha': 0.1, 'penalty': 'l1'}, 0.5675311629605954)

Вообще не понимает перцептрон

<h3>Метод опорных векторов</h3>

In [55]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.63636364 0.61983471 0.5922865  0.53867403 0.56629834], mean: 0.5906914448350913
Wall time: 2.45 s


In [12]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1],
         'kernel': ['rbf', 'poly', 'sigmoid']}

In [13]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 49 s


({'C': 1, 'kernel': 'rbf'}, 0.5906914448350913)

нет улучшений

In [14]:
param = {'C': [1],
         'kernel': ['poly'],
         'degree': range(1,11)}

In [15]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 32.4 s


({'C': 1, 'degree': 3, 'kernel': 'poly'}, 0.5796752050895698)

Все плохо, обычные линейные методы показали себя не лучшим образом

<h3>Логистическая регрессия</h3>

In [16]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.56473829 0.59504132 0.56749311 0.53867403 0.5718232 ], mean: 0.5675539929683576
Wall time: 432 ms


In [17]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1]}

In [18]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2.71 s


({'C': 0.001}, 0.5774789583428459)

Понимает плохо

<h3>Дерево решений.</h3>

In [19]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.58126722 0.59779614 0.61983471 0.53314917 0.59116022], mean: 0.5846414927781075
Wall time: 435 ms


На старте дерево решений дает хороший результат

In [20]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50)}

In [21]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 15.6 s


({'criterion': 'entropy', 'max_depth': 46}, 0.5945755901557007)

не плохо, но не очень хорошо

In [23]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,51,2),
         'min_samples_split': range(2,20,2),
         'min_samples_leaf': range(1,10)}

In [24]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=10)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 10min 34s


({'criterion': 'entropy',
  'max_depth': 50,
  'min_samples_leaf': 6,
  'min_samples_split': 12},
 0.6006164102095795)

<h3>Случайный лес</h3>

In [25]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.62534435 0.61432507 0.63636364 0.56077348 0.59668508], mean: 0.6066983242774302
Wall time: 1.26 s


In [26]:
param = {'n_estimators': [100,200,500,1000,2000],
         'criterion': ['gini', 'entropy'],
         'max_depth': range(1,51,10)}

In [27]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(), param,cv=cv, n_jobs=10)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 3min 51s


({'criterion': 'entropy', 'max_depth': 11, 'n_estimators': 500},
 0.6309590125260642)

In [35]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
32,entropy,11,500,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.652893,0.677686,0.633609,0.569061,0.621547,0.630959,0.036308,1
34,entropy,11,2000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.644628,0.680441,0.633609,0.566298,0.616022,0.6282,0.037439,2
31,entropy,11,200,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.641873,0.669421,0.641873,0.571823,0.610497,0.627098,0.033343,3
33,entropy,11,1000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.647383,0.677686,0.639118,0.555249,0.616022,0.627092,0.040988,4
7,gini,11,500,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.641873,0.663912,0.628099,0.558011,0.61326,0.621031,0.035651,5


In [36]:
param = {'n_estimators': [100,200,500,1000],
         'criterion': ['gini', 'entropy'],
         'max_depth': [10,11,12]}

In [37]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(), param,cv=cv, n_jobs=10)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 4s


({'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 1000},
 0.630406526338219)

In [39]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:]

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,entropy,10,1000,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.644628,0.672176,0.647383,0.569061,0.618785,0.630407,0.035024,1
19,entropy,11,1000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.650138,0.677686,0.639118,0.558011,0.624309,0.629853,0.039943,2
4,gini,11,100,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.658402,0.674931,0.62259,0.577348,0.61326,0.629306,0.03444,3
22,entropy,12,500,"{'criterion': 'entropy', 'max_depth': 12, 'n_e...",0.641873,0.674931,0.636364,0.566298,0.621547,0.628203,0.035534,4
3,gini,10,1000,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.663912,0.674931,0.633609,0.563536,0.604972,0.628192,0.04052,5
8,gini,12,100,"{'criterion': 'gini', 'max_depth': 12, 'n_esti...",0.644628,0.677686,0.652893,0.555249,0.610497,0.62819,0.042341,6
12,entropy,10,100,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.644628,0.672176,0.636364,0.571823,0.610497,0.627098,0.03393,7
17,entropy,11,200,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.641873,0.688705,0.619835,0.555249,0.627072,0.626547,0.042959,8
7,gini,11,1000,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.647383,0.666667,0.636364,0.569061,0.61326,0.626547,0.03353,9
16,entropy,11,100,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.641873,0.683196,0.636364,0.555249,0.616022,0.626541,0.041796,10


и снова не плохо.

<h3>Градиентный бустинг</h3>

In [40]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.61432507 0.60606061 0.63636364 0.54972376 0.57734807], mean: 0.5967642268998372
Wall time: 5.26 s


In [43]:
param = {'n_estimators': [100,200],
         'max_depth': range(10,21)}

In [44]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 3min 33s


({'max_depth': 12, 'n_estimators': 100}, 0.6088945710241541)

могло бы быть и получше