In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.preprocessing import PowerTransformer

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [3]:
df = pd.read_csv('../data/df_games.csv')
df = df.drop(df[(df.best_of == 5) & (df['5_map'] == 'did not play')].index.values, axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 844
Columns: 1029 entries, version to 5_map
dtypes: float64(339), int64(679), object(11)
memory usage: 6.6+ MB


In [4]:
df = df.drop(['version', 'url', 'number_of_confrontation', 'datetime1', 'ratio1', 'ratio2', 'name1', 'number1', 'name2', 'number2',
         'team1_stat_url', 'team2_stat_url'], axis=1)
df.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,result,best_of,1_map,2_map,3_map,4_map,5_map
0,28,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,-1,3,Nuke,Ancient,Inferno,did not play,did not play
1,1,223,2,2,0,0,54,32,1.0,4,...,0.5,0.448,0.552,1,3,Mirage,Ancient,Inferno,did not play,did not play
2,14,101,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Overpass,Dust2,Vertigo,did not play,did not play
3,3,46,0,0,0,0,0,0,0.0,0,...,0.5,0.59,0.526,1,3,Dust2,Inferno,Nuke,did not play,did not play
4,7,1,0,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1,3,Nuke,Overpass,Mirage,did not play,did not play


In [5]:
maps_decoder = LabelEncoder()
maps_decoder.fit(df['3_map'])
for num_map in range(1,6):
    df[f'{num_map}_map'] = maps_decoder.transform(df[f'{num_map}_map'])

In [6]:
cv = KFold(shuffle=True, random_state=1)
X = df.drop('result', axis=1)
y = df['result']

In [7]:
X_norm = pd.DataFrame()
transform = PowerTransformer()
for i in X.columns:
    X_norm[i] = transform.fit_transform(np.array(X[i]).reshape(-1, 1)).reshape(1, -1)[0]
X_norm.head()

Unnamed: 0,team1_world_rank,team2_world_rank,1_47_top5_Times played,1_47_top5_wins,1_47_top5_draws,1_47_top5_losses,1_47_top5_Total rounds played,1_47_top5_Rounds won,1_47_top5_Win percent,1_47_top5_Pistol rounds,...,2_46_topAll_Pistol rounds won,2_46_topAll_Pistol round win percent,2_46_topAll_CT round win percent,2_46_topAll_T round win percent,best_of,1_map,2_map,3_map,4_map,5_map
0,-0.456001,-0.353612,-0.201968,-0.147799,0.0,-0.147799,-0.201968,-0.201968,-0.147799,-0.201968,...,0.254303,0.328579,0.714532,0.616862,0.404572,0.536579,-1.739089,-0.743002,0.09156,0.09156
1,-2.144222,1.645044,4.951278,6.765928,0.0,-0.147799,4.951278,4.951278,6.765928,4.951278,...,-0.285971,0.328579,-0.033227,0.768101,0.404572,0.031579,-1.739089,-0.743002,0.09156,0.09156
2,-0.97216,0.509862,-0.201968,-0.147799,0.0,-0.147799,-0.201968,-0.201968,-0.147799,-0.201968,...,-1.374558,-1.44801,-1.554966,-1.560098,0.404572,1.020583,-1.203108,0.905083,0.09156,0.09156
3,-1.800114,-0.353612,-0.201968,-0.147799,0.0,-0.147799,-0.201968,-0.201968,-0.147799,-0.201968,...,0.254303,0.328579,0.714532,0.616862,0.404572,-1.073353,-0.725354,0.107335,0.09156,0.09156
4,-1.397067,-2.268794,-0.201968,-0.147799,0.0,-0.147799,-0.201968,-0.201968,-0.147799,-0.201968,...,-1.374558,-1.44801,-1.554966,-1.560098,0.404572,0.536579,0.539275,-0.309511,0.09156,0.09156


In [9]:
X_norm.head().iloc[:,:10].to_excel('norm.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [39]:
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(model_dummy, X_norm, y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.59763314 0.55621302 0.5297619  0.58333333 0.56547619], mean: 0.5664835164835165


<h3>Ошибка перцептрона</h3> 

In [41]:
model_perceptron = Perceptron()
result_perceptron = cross_val_score(model_perceptron, X_norm, y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.58579882 0.60946746 0.60714286 0.5952381  0.61904762], mean: 0.6033389687235842


In [44]:
param = {'penalty': ['l2', 'l1','none'],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [45]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1.11 s


({'alpha': 1e-06, 'penalty': 'none'}, 0.6033389687235842)

In [46]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
26,100.0,none,"{'alpha': 100, 'penalty': 'none'}",0.585799,0.609467,0.607143,0.595238,0.619048,0.603339,0.011594,1
11,0.001,none,"{'alpha': 0.001, 'penalty': 'none'}",0.585799,0.609467,0.607143,0.595238,0.619048,0.603339,0.011594,1
2,1e-06,none,"{'alpha': 1e-06, 'penalty': 'none'}",0.585799,0.609467,0.607143,0.595238,0.619048,0.603339,0.011594,1
23,10.0,none,"{'alpha': 10, 'penalty': 'none'}",0.585799,0.609467,0.607143,0.595238,0.619048,0.603339,0.011594,1
20,1.0,none,"{'alpha': 1, 'penalty': 'none'}",0.585799,0.609467,0.607143,0.595238,0.619048,0.603339,0.011594,1


Как мы видим лучше всего без регурялизации, а так очень хорошо для перцептрона.

<h3>Метод опорных векторов</h3>

In [47]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.67455621 0.62721893 0.58333333 0.58928571 0.58928571], mean: 0.6127359819667513
Wall time: 313 ms


Очень не плохо. Попробуем пообучать

In [48]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1],
         'kernel': ['rbf', 'poly', 'sigmoid']}

In [49]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 7.79 s


({'C': 1, 'kernel': 'rbf'}, 0.6127359819667513)

In [50]:
param = {'C': [1],
         'kernel': ['poly'],
         'degree': range(1,11)}

In [51]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 6.35 s


({'C': 1, 'degree': 1, 'kernel': 'poly'}, 0.618716539870386)

Почти 62 процента, что очень не плохо для SVM

<h3>Дерево решений.</h3>

In [52]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.58579882 0.49704142 0.56547619 0.56547619 0.55952381], mean: 0.5546632854325162
Wall time: 277 ms


In [53]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50)}

In [54]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 6.88 s


({'criterion': 'entropy', 'max_depth': 28}, 0.5831431389123697)

Ну, как и предполагалось, дерево решений не становится лучше от таких манипуляций с данными.

<h3>Логистическая регрессия</h3>

In [56]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.61538462 0.56804734 0.58928571 0.61904762 0.54166667], mean: 0.5866863905325443
Wall time: 452 ms


In [57]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1]}

In [58]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 988 ms


({'C': 0.01}, 0.6187094956325726)

In [59]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head(10)

Unnamed: 0,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.01,{'C': 0.01},0.639053,0.64497,0.613095,0.613095,0.583333,0.618709,0.021991,1
0,0.001,{'C': 0.001},0.615385,0.609467,0.619048,0.630952,0.613095,0.617589,0.007372,2
2,0.1,{'C': 0.1},0.621302,0.591716,0.625,0.60119,0.52381,0.592604,0.036552,3
3,1.0,{'C': 1},0.615385,0.568047,0.589286,0.619048,0.541667,0.586686,0.029161,4
4,10.0,{'C': 10},0.591716,0.568047,0.583333,0.577381,0.52381,0.568857,0.02381,5


Нет никаких заметных улучшений, что печально.

<h3>Случайный лес</h3>

In [60]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.65088757 0.62130178 0.60714286 0.6547619  0.58333333], mean: 0.6234854888701042
Wall time: 521 ms


Из коробки уже хорошо

In [61]:
param = {'n_estimators': [100,200,500,1000,2000],
         'criterion': ['gini', 'entropy'],
         'max_depth': range(1,51,10)}

In [63]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 1min 20s


({'criterion': 'entropy', 'max_depth': 31, 'n_estimators': 100},
 0.6258593970132431)

In [64]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head(5)

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
40,entropy,31,100,"{'criterion': 'entropy', 'max_depth': 31, 'n_e...",0.615385,0.662722,0.636905,0.630952,0.583333,0.625859,0.026172,1
23,gini,41,1000,"{'criterion': 'gini', 'max_depth': 41, 'n_esti...",0.621302,0.639053,0.642857,0.625,0.577381,0.621119,0.023334,2
24,gini,41,2000,"{'criterion': 'gini', 'max_depth': 41, 'n_esti...",0.627219,0.633136,0.625,0.619048,0.589286,0.618738,0.015401,3
19,gini,31,2000,"{'criterion': 'gini', 'max_depth': 31, 'n_esti...",0.633136,0.639053,0.613095,0.613095,0.595238,0.618724,0.015722,4
46,entropy,41,200,"{'criterion': 'entropy', 'max_depth': 41, 'n_e...",0.650888,0.639053,0.583333,0.619048,0.60119,0.618702,0.024527,5


In [65]:
param = {'n_estimators': [100,200,500,1000],
         'criterion': ['gini'],
         'max_depth': range(10,41,5),
         'min_samples_split': range(2,20,2),
         'min_samples_leaf': range(1,10,2)}

In [66]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 16min 19s


({'criterion': 'gini',
  'max_depth': 25,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.6341434206818823)

In [67]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head(5)

Unnamed: 0,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
540,gini,25,1,2,100,"{'criterion': 'gini', 'max_depth': 25, 'min_sa...",0.680473,0.639053,0.654762,0.607143,0.589286,0.634143,0.032671,1
904,gini,35,1,4,100,"{'criterion': 'gini', 'max_depth': 35, 'min_sa...",0.668639,0.650888,0.607143,0.625,0.60119,0.630572,0.02571,2
956,gini,35,3,12,100,"{'criterion': 'gini', 'max_depth': 35, 'min_sa...",0.64497,0.639053,0.607143,0.642857,0.607143,0.628233,0.017324,3
249,gini,15,3,18,200,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.674556,0.627219,0.60119,0.630952,0.60119,0.627022,0.026868,4
905,gini,35,1,4,200,"{'criterion': 'gini', 'max_depth': 35, 'min_sa...",0.633136,0.633136,0.64881,0.642857,0.571429,0.625873,0.027872,5


Есть небольшие улучшения, но это все равно не очень.

<h3>Градиентный бустинг</h3>

In [69]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_norm, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.61538462 0.63905325 0.5952381  0.625      0.54761905], mean: 0.6044590025359258
Wall time: 3.9 s


In [70]:
param = {'n_estimators': [100,200],
         'max_depth': range(10,51,2)}

In [72]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_norm,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 2min 5s


({'max_depth': 16, 'n_estimators': 100}, 0.6198999718230488)

In [73]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head(5)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,24,100,"{'max_depth': 24, 'n_estimators': 100}",0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684,1
12,22,100,"{'max_depth': 22, 'n_estimators': 100}",0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684,1
30,40,100,"{'max_depth': 40, 'n_estimators': 100}",0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684,1
26,36,100,"{'max_depth': 36, 'n_estimators': 100}",0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684,1
24,34,100,"{'max_depth': 34, 'n_estimators': 100}",0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684,1


In [74]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].groupby('rank_test_score').mean()

Unnamed: 0_level_0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.639053,0.64497,0.630952,0.60119,0.583333,0.6199,0.023684
19,0.621302,0.64497,0.630952,0.589286,0.595238,0.61635,0.021143
20,0.615385,0.633136,0.630952,0.60119,0.595238,0.61518,0.015262
21,0.621302,0.621302,0.636905,0.60119,0.589286,0.613997,0.01677
22,0.639053,0.615385,0.630952,0.571429,0.607143,0.612792,0.023533
23,0.633136,0.633136,0.630952,0.589286,0.565476,0.610397,0.028001
41,0.627219,0.60355,0.607143,0.619048,0.571429,0.605678,0.019092
42,0.60355,0.615385,0.60119,0.619048,0.577381,0.603311,0.014631


Практически не улучшилось. Самое время перейти к PCA.