In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

<h2>Предобработка данных</h2>

In [2]:
df = pd.read_csv('../data/df_split_by_map_part_info.csv')
df.head()

Unnamed: 0,map,result,team1_world_rank,team2_world_rank,1_top_Times played,1_top_wins,1_top_draws,1_top_losses,1_top_Total rounds played,1_top_Rounds won,...,2_All_draws,2_All_losses,2_All_Total rounds played,2_All_Rounds won,2_All_Win percent,2_All_Pistol rounds,2_All_Pistol rounds won,2_All_Pistol round win percent,2_All_CT round win percent,2_All_T round win percent
0,Nuke,-1,28,46,3,0,0,3,80,32,...,0,2,140,82,0.667,12,7,0.583,0.774,0.436
1,Ancient,-1,28,46,1,0,0,1,27,11,...,0,2,135,86,0.667,12,8,0.667,0.756,0.578
2,Mirage,1,1,223,7,4,0,3,182,106,...,0,2,151,80,0.667,12,7,0.583,0.556,0.511
3,Ancient,1,1,223,2,2,0,0,54,32,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0
4,Overpass,-1,14,101,4,2,0,2,106,60,...,0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [3]:
maps_decoder = LabelEncoder()
df['map'] = maps_decoder.fit_transform(df['map'])

In [4]:
cv = KFold(shuffle=True, random_state=1)
X = df.drop('result', axis=1)
y = df['result']

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
pd.DataFrame(X_scaled).head().iloc[:,:10].to_excel('standart.xlsx')

<h2>Обучение моделей. Сравнение результатов.</h2>

In [16]:
model_dummy = DummyClassifier(strategy="most_frequent")
result_dummy = cross_val_score(model_dummy, X_scaled, y, cv=cv)
print(f'name: Dummy, score: {result_dummy}, mean: {result_dummy.mean()}')

name: Dummy, score: [0.56473829 0.5785124  0.56749311 0.50552486 0.53314917], mean: 0.5498835669604129


<h3>Ошибка перцептрона</h3> 

In [7]:
model_perceptron = Perceptron()
result_perceptron = cross_val_score(model_perceptron, X_scaled, y, cv=cv)
print(f'name: Perceptron, score: {result_perceptron}, mean: {result_perceptron.mean()}')

name: Perceptron, score: [0.57300275 0.44077135 0.53168044 0.59944751 0.54143646], mean: 0.5372677046710196


In [8]:
param = {'penalty': ['l2', 'l1'],
         'alpha': [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]}

In [12]:
%%time
grid_search = GridSearchCV(Perceptron(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 131 ms


({'alpha': 0.0001, 'penalty': 'l1'}, 0.5603884145320609, 0.035134331034612706)

+2.5%

In [13]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,0.0001,l1,"{'alpha': 0.0001, 'penalty': 'l1'}",0.597796,0.548209,0.550964,0.599448,0.505525,0.560388,0.035134,1
10,0.1,l2,"{'alpha': 0.1, 'penalty': 'l2'}",0.550964,0.589532,0.509642,0.566298,0.494475,0.542182,0.035314,2
9,0.01,l1,"{'alpha': 0.01, 'penalty': 'l1'}",0.564738,0.550964,0.534435,0.494475,0.555249,0.539972,0.024772,3
6,0.001,l2,"{'alpha': 0.001, 'penalty': 'l2'}",0.53719,0.415978,0.550964,0.588398,0.558011,0.530108,0.059476,4
0,1e-06,l2,"{'alpha': 1e-06, 'penalty': 'l2'}",0.479339,0.460055,0.539945,0.593923,0.552486,0.52515,0.049055,5


<h3>Метод опорных векторов</h3>

In [14]:
%%time
model_svm = SVC()
result_svm = cross_val_score(model_svm, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: SVM, score: {result_svm}, mean: {result_svm.mean()}')

name: SVM, score: [0.62809917 0.58953168 0.58126722 0.56353591 0.56353591], mean: 0.5851939789659528
Wall time: 174 ms


+3.5%

In [15]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1],
         'kernel': ['rbf', 'poly', 'sigmoid']}

In [17]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.54 s


({'C': 0.1, 'kernel': 'rbf'}, 0.5890461622756952, 0.0259228870675407)

In [18]:
param = {'C': [1],
         'kernel': ['poly'],
         'degree': range(1,11)}

In [20]:
%%time
grid_search = GridSearchCV(SVC(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.1 s


({'C': 1, 'degree': 2, 'kernel': 'poly'},
 0.5818836278404336,
 0.020885142658681428)

Дисперсия поменьше, а так хуже

<h3>Логистическая регрессия</h3>

In [17]:
%%time
model_log = LogisticRegression()
result_log = cross_val_score(model_log, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: LogisticRegression, score: {result_log}, mean: {result_log.mean()}')

name: LogisticRegression, score: [0.58677686 0.62809917 0.57300275 0.5801105  0.5718232 ], mean: 0.5879624979072493
Wall time: 1.15 s


In [18]:
model_log.fit(X_train,y_train)
y_pred_proba = model_log.predict_proba(X_test)
log_df_res = pd.DataFrame({'true': y_test, 'pred_proba': y_pred_proba[:,1]})
log_df_res.head()

Unnamed: 0,true,pred_proba
990,1,0.43672
1084,1,0.788263
1523,1,0.469795
1137,-1,0.327167
1389,-1,0.595839


In [28]:
def limit(x):
    _limit = 0.40
    if x > _limit:
        return 1
    else:
        return -1


In [29]:
log_df_res['pred_limit'] = log_df_res.pred_proba.map(limit)
log_df_res.head()

Unnamed: 0,true,pred_proba,pred_limit
990,1,0.43672,1
1084,1,0.788263,1
1523,1,0.469795,1
1137,-1,0.327167,-1
1389,-1,0.595839,1


In [30]:
accuracy_score(log_df_res.true, log_df_res.pred_limit)

0.5895316804407713

In [75]:
param = {'C': [10**-3, 10**-2, 10**-1, 10**0, 10**1]}

In [77]:
%%time
grid_search = GridSearchCV(LogisticRegression(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 87.2 ms


({'C': 0.01}, 0.5923702114058719, 0.017279058112525184)

+4% и диспирсея маленькая

<h3>Дерево решений.</h3>

In [79]:
%%time
model_tree = DecisionTreeClassifier()
result_tree = cross_val_score(model_tree, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: Tree, score: {result_tree}, mean: {result_tree.mean()}')

name: Tree, score: [0.52066116 0.5399449  0.54545455 0.53314917 0.53867403], mean: 0.5355767620960992
Wall time: 72.5 ms


In [80]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50)}

In [82]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1.68 s


({'criterion': 'gini', 'max_depth': 2},
 0.5912895910384609,
 0.030395853243852557)

не плохо, но дисперсия слишком большая

In [83]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,gini,2,"{'criterion': 'gini', 'max_depth': 2}",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
48,entropy,2,"{'criterion': 'entropy', 'max_depth': 2}",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
1,gini,3,"{'criterion': 'gini', 'max_depth': 3}",0.584022,0.630854,0.545455,0.596685,0.588398,0.589083,0.027321,3
49,entropy,3,"{'criterion': 'entropy', 'max_depth': 3}",0.584022,0.633609,0.5427,0.596685,0.585635,0.58853,0.029078,4
50,entropy,4,"{'criterion': 'entropy', 'max_depth': 4}",0.575758,0.619835,0.564738,0.549724,0.585635,0.579138,0.02359,5


везде очень большая дисперсия

In [7]:
param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(2,50,3),
         'min_samples_split': range(2,20,2),
         'min_samples_leaf': range(1,12,2)}

In [8]:
%%time
grid_search = GridSearchCV(DecisionTreeClassifier(), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 25.2 s


({'criterion': 'gini',
  'max_depth': 2,
  'min_samples_leaf': 1,
  'min_samples_split': 2},
 0.5912895910384609)

In [93]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,gini,2,1,2,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
878,entropy,2,3,12,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
889,entropy,2,5,16,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
888,entropy,2,5,14,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1
887,entropy,2,5,12,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.556474,0.639118,0.570248,0.61326,0.577348,0.59129,0.030396,1


Все такое себе

<h3>Случайный лес</h3>

In [94]:
%%time
model_rf = RandomForestClassifier()
result_fr = cross_val_score(model_rf, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: RandomForestClassifier, score: {result_fr}, mean: {result_fr.mean()}')

name: RandomForestClassifier, score: [0.58953168 0.58953168 0.56198347 0.55801105 0.5       ], mean: 0.5598115763359359
Wall time: 1.46 s


In [95]:
param = {'n_estimators': [100,200,500,1000,2000],
         'criterion': ['gini', 'entropy'],
         'max_depth': range(1,51,10)}

In [96]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 1min 22s


({'criterion': 'entropy', 'max_depth': 11, 'n_estimators': 100},
 0.5956836065324261,
 0.022476699168774976)

In [97]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,entropy,11,100,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.589532,0.639118,0.584022,0.59116,0.574586,0.595684,0.022477,1
6,gini,11,200,"{'criterion': 'gini', 'max_depth': 11, 'n_esti...",0.578512,0.614325,0.592287,0.58011,0.558011,0.584649,0.018476,2
31,entropy,11,200,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.597796,0.61708,0.573003,0.58011,0.555249,0.584648,0.021191,3
34,entropy,11,2000,"{'criterion': 'entropy', 'max_depth': 11, 'n_e...",0.592287,0.61708,0.584022,0.585635,0.541436,0.584092,0.024402,4
40,entropy,31,100,"{'criterion': 'entropy', 'max_depth': 31, 'n_e...",0.592287,0.570248,0.567493,0.616022,0.560773,0.581365,0.0203,5


+4.5 Процента к случайному угадыванию

In [100]:
param = {'n_estimators': [100,200,500],
         'criterion': ['gini','entropy'],
         'max_depth': range(10,41,5),
         'min_samples_split': range(2,20,2),
         'min_samples_leaf': range(1,10,2)}

In [101]:
%%time
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_)

Wall time: 18min 3s


({'criterion': 'gini',
  'max_depth': 10,
  'min_samples_leaf': 5,
  'min_samples_split': 18,
  'n_estimators': 100},
 0.5978844192807025)

In [102]:
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False).iloc[:,4:].head()

Unnamed: 0,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
78,gini,10,5,18,100,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.628099,0.608815,0.592287,0.585635,0.574586,0.597884,0.018744,1
1215,entropy,20,1,2,100,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.614325,0.62259,0.575758,0.60221,0.571823,0.597341,0.020331,2
42,gini,10,3,12,100,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.606061,0.61708,0.61157,0.585635,0.563536,0.596776,0.01973,3
645,gini,30,7,18,100,"{'criterion': 'gini', 'max_depth': 30, 'min_sa...",0.608815,0.61708,0.573003,0.607735,0.566298,0.594586,0.020724,4
1025,entropy,10,5,18,500,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.628099,0.608815,0.581267,0.588398,0.566298,0.594576,0.02164,5


Почти +5%

<h3>Градиентный бустинг</h3>

In [103]:
%%time
model_xgb = XGBClassifier()
result_xgb = cross_val_score(model_xgb, X_scaled, y, cv=cv, n_jobs=-1)
print(f'name: XGBClassifier, score: {result_xgb}, mean: {result_xgb.mean()}')

name: XGBClassifier, score: [0.58402204 0.58402204 0.57024793 0.56353591 0.52486188], mean: 0.5653379602149065
Wall time: 2.04 s


In [104]:
param = {'n_estimators': [100,200],
         'max_depth': range(10,51,2)}

In [106]:
%%time
grid_search = GridSearchCV(XGBClassifier(n_jobs=-1), param,cv=cv, n_jobs=-1)
grid_search.fit(X_scaled,y)
(grid_search.best_params_, grid_search.best_score_,
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)['std_test_score'].values[0])

Wall time: 38.4 s


({'max_depth': 16, 'n_estimators': 100},
 0.5819034138471608,
 0.010311853521001522)

Печально это придется походу всюду нейронки сувать, но это мое личное мнение.