In [1]:
# IPython magic
%config IPCompleter.greedy=True

# Import other notebooks
import import_ipynb
from DataLoader import *
from DataPreprocessing import *
from Utils import *

# Imports
import pandas as pd
import json
import numpy as np

from copy import copy, deepcopy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectPercentile, RFE, RFECV
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import *

# Pandas settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

importing Jupyter notebook from DataLoader.ipynb
importing Jupyter notebook from DataPreprocessing.ipynb
importing Jupyter notebook from Utils.ipynb


In [2]:
# Loading and cleaning bad data lines
dfInitial = loadDataset("dataset_initial", 17)
dfSecond = loadDataset("dataset_second", 6)
dfBoth = dfInitial.append(dfSecond, ignore_index=True)

dataset_initial/dataset_00.json
dataset_initial/dataset_01.json
dataset_initial/dataset_02.json
dataset_initial/dataset_03.json
dataset_initial/dataset_04.json
dataset_initial/dataset_05.json
dataset_initial/dataset_06.json
dataset_initial/dataset_07.json
dataset_initial/dataset_08.json
dataset_initial/dataset_09.json
dataset_initial/dataset_10.json
dataset_initial/dataset_11.json
dataset_initial/dataset_12.json
dataset_initial/dataset_13.json
dataset_initial/dataset_14.json
dataset_initial/dataset_15.json
dataset_initial/dataset_16.json
dataset_initial/dataset_17.json
Rejected data:


Unnamed: 0,map,CT_score,T_score,is_overtime,round_status,round_status_time_left,CT_total_health,T_total_health,CT_num_alive_players,T_num_alive_players,CT_equip_value,T_equip_value,CT_money,T_money,CT_num_grenades,T_num_grenades,CT_num_he,T_num_he,CT_num_flash,T_num_flash,CT_num_smokes,T_num_smokes,CT_num_molly,T_num_molly,CT_has_Defuser,bomb_site,bomb_dropped,num_active_smokes,num_active_molotovs,round_winner,fileNumber,indexInFile
0,de_inferno,0,3,False,FreezeTime,20.0,500,500,5,5,2000.0,18600,27300,33100,0,15,0,1,0,6,0,8,0,0,False,,False,0,0,Terrorist,5,2543
1,de_train,5,0,False,FreezeTime,20.0,500,600,5,6,5100.0,1200,40050,21550,0,0,0,0,0,0,0,0,0,0,True,,False,0,0,CT,8,2251
2,de_vertigo,0,0,False,FreezeTime,20.0,500,600,5,6,1000.0,1200,4000,4800,0,0,0,0,0,0,0,0,0,0,False,,False,0,0,Terrorist,11,2162
3,de_train,0,0,False,FreezeTime,1.929688,500,500,5,5,1000.0,1000,80000,80000,0,0,0,0,0,0,0,0,0,0,False,,True,0,0,Terrorist,12,3958


dataset_second/dataset_00.json
dataset_second/dataset_01.json
dataset_second/dataset_02.json
dataset_second/dataset_03.json
dataset_second/dataset_04.json
dataset_second/dataset_05.json
dataset_second/dataset_06.json
Rejected data:


Unnamed: 0,map,CT_score,T_score,is_overtime,round_status,round_status_time_left,CT_total_health,T_total_health,CT_num_alive_players,T_num_alive_players,CT_equip_value,T_equip_value,CT_money,T_money,CT_num_grenades,T_num_grenades,CT_num_he,T_num_he,CT_num_flash,T_num_flash,CT_num_smokes,T_num_smokes,CT_num_molly,T_num_molly,CT_has_Defuser,bomb_site,bomb_dropped,num_active_smokes,num_active_molotovs,round_winner,fileNumber,indexInFile
0,de_nuke,0,0,False,FreezeTime,1.9375,500,500,5,5,1000,1000,80000,80000,0,0,0,0,0,0,0,0,0,0,False,,True,0,0,Terrorist,0,2398
1,de_vertigo,0,0,False,FreezeTime,1.921875,500,500,5,5,1000,1000,80000,80000,0,0,0,0,0,0,0,0,0,0,False,,True,0,0,Terrorist,1,1512
2,de_nuke,0,0,False,FreezeTime,1.929688,500,500,5,5,1000,1000,80000,80000,0,0,0,0,0,0,0,0,0,0,False,,True,0,0,Terrorist,1,4934
3,de_nuke,0,0,False,Normal,980.9375,500,500,5,5,1000,1000,80000,80000,0,0,0,0,0,0,0,0,0,0,False,,True,0,0,Terrorist,1,4935


In [3]:
df = dfBoth.copy()
df = assignDataTypes(df)
df = extractCategories(df)
df = remove_de_cache(df)

In [4]:
display(df)

Unnamed: 0,map,CT_score,T_score,is_overtime,round_status,round_status_time_left,CT_total_health,T_total_health,CT_num_alive_players,T_num_alive_players,CT_equip_value,T_equip_value,CT_money,T_money,CT_num_grenades,T_num_grenades,CT_num_he,T_num_he,CT_num_flash,T_num_flash,CT_num_smokes,T_num_smokes,CT_num_molly,T_num_molly,CT_has_Defuser,bomb_site,bomb_dropped,num_active_smokes,num_active_molotovs,round_winner,fileNumber,indexInFile,round_winner_codes,map_de_dust2,map_de_inferno,map_de_mirage,map_de_nuke,map_de_overpass,map_de_train,status_FreezeTime,bomb_A,bomb_B
0,de_dust2,0,0,0,FreezeTime,20.000000,500,500,5,5,1000.0,1000,4000,4000,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,CT,0,0,0,1,0,0,0,0,0,1,0,0
1,de_dust2,0,0,0,FreezeTime,1.031250,500,500,5,5,4400.0,4350,600,650,2,6,0,0,2,4,0,2,0,0,1,,0,0,0,CT,0,1,0,1,0,0,0,0,0,1,0,0
2,de_dust2,0,0,0,Normal,96.031250,391,400,4,4,3511.0,3300,750,500,2,5,0,0,2,3,0,2,0,0,1,,0,0,0,CT,0,2,0,1,0,0,0,0,0,0,0,0
3,de_dust2,0,0,0,Normal,76.031250,391,400,4,4,3111.0,2100,750,500,0,0,0,0,0,0,0,0,0,0,1,,0,2,0,CT,0,3,0,1,0,0,0,0,0,0,0,0
4,de_dust2,1,0,0,FreezeTime,19.968750,500,500,5,5,2648.0,1000,18350,10750,0,0,0,0,0,0,0,0,0,0,1,,0,0,0,CT,0,4,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122397,de_train,11,14,0,BombPlanted,15.414062,200,242,2,4,7600.0,19350,100,5950,2,4,0,0,1,3,1,1,0,0,1,B,0,2,0,Terrorist,6,4730,1,0,0,0,0,0,1,0,0,1
122398,de_train,11,15,0,FreezeTime,19.929688,500,500,5,5,5950.0,11150,11500,23900,2,2,0,0,1,2,1,0,0,0,1,,0,0,0,Terrorist,6,4731,1,0,0,0,0,0,1,1,0,0
122399,de_train,11,15,0,Normal,114.929690,500,500,5,5,15450.0,25150,1200,6700,15,20,2,0,5,10,5,5,3,5,1,,0,0,0,Terrorist,6,4732,1,0,0,0,0,0,1,0,0,0
122400,de_train,11,15,0,Normal,94.929690,500,500,5,5,12750.0,26350,1200,6700,7,13,2,0,2,6,2,3,1,4,1,,0,5,3,Terrorist,6,4733,1,0,0,0,0,0,1,0,0,0


In [5]:
# X2 = df[['mapCodes', 'CT_score', 'T_score', 'is_overtime', 'round_status_codes', 'round_status_time_left', 'CT_total_health', 'T_total_health', 'CT_num_alive_players', 'T_num_alive_players', 'CT_equip_value', 'T_equip_value', 'CT_money', 'T_money', 'CT_num_grenades', 'T_num_grenades', 'CT_num_he', 'T_num_he', 'CT_num_flash', 'T_num_flash', 'CT_num_smokes', 'T_num_smokes', 'CT_num_molly', 'T_num_molly', 'CT_has_Defuser', 'bomb_site_codes', 'bomb_dropped', 'num_active_smokes', 'num_active_molotovs']]
# X = df.drop(['Debug', 'round_winner_codes', 'round_winner', 'fileNumber', 'indexInFile', 'map', 'round_status', 'bomb_site'], axis=1)
X = df.drop(['round_winner_codes', 'round_winner', 'fileNumber', 'indexInFile', 'map', 'round_status', 'bomb_site', 'CT_num_grenades', 'T_num_grenades'], axis=1)
# print(list(set(X.columns) - set(X2.columns)) + list(set(X2.columns) - set(X.columns)))
y = df['round_winner_codes']

#Splitting the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training / 30% testing data

In [6]:
## Pipelines Creation
## 1. Scaling / Normalization of data
## 2. Reduce Dimension using PCA
## 3. Feature Selection
## 4. Apply  Classifier

In [7]:
pipeline_gaussNB= Pipeline([('scalarGauss',StandardScaler()),
#                       ('pca1',PCA(n_components=10)),
                        ('feat_selec', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10)),
                        ('gnb_classifier', GaussianNB())])

In [8]:
# pipeline_multiNomNB= Pipeline([('scalarMultiNom',MinMaxScaler()), #Not adequate for the dataset with continuous features
#                       #('pca1',PCA(n_components=10)),
#                       ('mnb_classifier', MultinomialNB())])

In [9]:
# pipeline_compNB= Pipeline([('scalarComp',MinMaxScaler()),   # Only adequate for categorical features
#                       #('pca1',PCA(n_components=10)),
#                       ('cnb_classifier', ComplementNB())])

In [10]:
pipeline_logir = Pipeline([('scalar1',StandardScaler()),
                      #('pca1',PCA(n_components=10)),
                      ('lr_classifier',LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1))])

In [11]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),   # Decision trees and Random Forest do not need scalers, but I'm adding it because why not
                     #('pca2',PCA(n_components=10)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [12]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     #('pca3',PCA(n_components=10)),
                     ('rf_classifier',RandomForestClassifier(n_estimators=100, n_jobs=-1))])

In [14]:
pipeline_gradBoost= Pipeline([('scalar3',StandardScaler()),
                    #('pca3',PCA(n_components=10)),
                    ('rf_classifier',GradientBoostingClassifier(n_estimators=100, random_state=42))])

In [15]:
pipeline_svc=Pipeline([('scalar4',StandardScaler()),
                     #('pca3',PCA(n_components=10)),
                     ('svc_classifier',SVC(random_state=42))])

In [16]:
pipeline_lin_svc=Pipeline([('scalar5',StandardScaler()),
                     #('pca3',PCA(n_components=10)),
                     ('lin_svc_classifier',LinearSVC(random_state=42))])

In [17]:
pipeline_sgd_class=Pipeline([('scalar6',StandardScaler()),
                     #('pca3',PCA(n_components=10)),
                     ('sgd_classifier',SGDClassifier(random_state=42))])

In [18]:
# List of the pipelines with names
fastPipelines = {
    'Gaussian Naive Bayes': pipeline_gaussNB,
#     'MultinomialNB': pipeline_multiNomNB,
#     'ComplementNB': pipeline_compNB,
    'Logistic Regression': pipeline_logir,
    'Decision Tree': pipeline_dt, 
    'Random Forest': pipeline_randomforest,
    'Gradient Boosting Classifier': pipeline_gradBoost
}

slowPipelines = {
    'SVC': pipeline_svc, 
    'Linear SVC': pipeline_lin_svc, 
    'SGD Classifier': pipeline_sgd_class
}

In [19]:
def classification_report_with_accuracy_score(y_true, y_pred):

    print(classification_report(y_true, y_pred)) # print classification report
    
    return accuracy_score(y_true, y_pred) # return accuracy score

def runPipelines(pipelines):
    folds = 5
    for plName, pl in pipelines.items():
        printmd('### ' + plName + ':')
        cv = StratifiedKFold(n_splits=folds, shuffle=True)
    #     scores = cross_val_score(pl, X_train, y_train, cv = cv)
#         scores = cross_validate(pl, X, y, cv=cv, scoring=('accuracy', 'precision', 'roc_auc', 'f1'))
        scores = cross_validate(pl, X, y, cv=cv, scoring=make_scorer(classification_report_with_accuracy_score))
        display(scores)

In [20]:
runPipelines(fastPipelines)

### Gaussian Naive Bayes:

              precision    recall  f1-score   support

           0       0.69      0.68      0.68     11980
           1       0.69      0.70      0.70     12472

    accuracy                           0.69     24452
   macro avg       0.69      0.69      0.69     24452
weighted avg       0.69      0.69      0.69     24452

              precision    recall  f1-score   support

           0       0.69      0.68      0.69     11980
           1       0.70      0.71      0.71     12472

    accuracy                           0.70     24452
   macro avg       0.70      0.70      0.70     24452
weighted avg       0.70      0.70      0.70     24452

              precision    recall  f1-score   support

           0       0.69      0.67      0.68     11980
           1       0.69      0.71      0.70     12471

    accuracy                           0.69     24451
   macro avg       0.69      0.69      0.69     24451
weighted avg       0.69      0.69      0.69     24451

              preci

{'fit_time': array([20.7899189 , 20.89392018, 21.90607929, 20.91218448, 20.9701879 ]),
 'score_time': array([0.03309202, 0.03273702, 0.03797078, 0.0329957 , 0.03295994]),
 'test_score': array([0.69049566, 0.69548503, 0.6906875 , 0.6884381 , 0.68995133])}

### Logistic Regression:

              precision    recall  f1-score   support

           0       0.69      0.72      0.71     11980
           1       0.72      0.69      0.71     12472

    accuracy                           0.71     24452
   macro avg       0.71      0.71      0.71     24452
weighted avg       0.71      0.71      0.71     24452

              precision    recall  f1-score   support

           0       0.69      0.72      0.71     11980
           1       0.72      0.69      0.70     12472

    accuracy                           0.70     24452
   macro avg       0.71      0.71      0.70     24452
weighted avg       0.71      0.70      0.70     24452

              precision    recall  f1-score   support

           0       0.70      0.72      0.71     11980
           1       0.72      0.70      0.71     12471

    accuracy                           0.71     24451
   macro avg       0.71      0.71      0.71     24451
weighted avg       0.71      0.71      0.71     24451

              preci

{'fit_time': array([4.07193756, 3.72392964, 3.98410225, 3.84275651, 3.68827319]),
 'score_time': array([0.03296232, 0.03056335, 0.03397012, 0.03806615, 0.03112817]),
 'test_score': array([0.70575004, 0.70468673, 0.70909165, 0.70766022, 0.70119831])}

### Decision Tree:

              precision    recall  f1-score   support

           0       0.72      0.73      0.72     11980
           1       0.74      0.73      0.73     12472

    accuracy                           0.73     24452
   macro avg       0.73      0.73      0.73     24452
weighted avg       0.73      0.73      0.73     24452

              precision    recall  f1-score   support

           0       0.73      0.72      0.72     11980
           1       0.73      0.74      0.74     12472

    accuracy                           0.73     24452
   macro avg       0.73      0.73      0.73     24452
weighted avg       0.73      0.73      0.73     24452

              precision    recall  f1-score   support

           0       0.72      0.72      0.72     11980
           1       0.73      0.73      0.73     12471

    accuracy                           0.72     24451
   macro avg       0.72      0.72      0.72     24451
weighted avg       0.72      0.72      0.72     24451

              preci

{'fit_time': array([1.11022425, 1.12026429, 1.13782215, 1.12908006, 1.12491679]),
 'score_time': array([0.03857994, 0.03900051, 0.03645062, 0.03696156, 0.0377295 ]),
 'test_score': array([0.72840667, 0.73032881, 0.72226085, 0.72663695, 0.7210748 ])}

### Random Forest:

              precision    recall  f1-score   support

           0       0.76      0.80      0.78     11980
           1       0.80      0.76      0.78     12472

    accuracy                           0.78     24452
   macro avg       0.78      0.78      0.78     24452
weighted avg       0.78      0.78      0.78     24452

              precision    recall  f1-score   support

           0       0.76      0.80      0.78     11980
           1       0.80      0.76      0.78     12472

    accuracy                           0.78     24452
   macro avg       0.78      0.78      0.78     24452
weighted avg       0.78      0.78      0.78     24452

              precision    recall  f1-score   support

           0       0.76      0.80      0.78     11980
           1       0.80      0.76      0.78     12471

    accuracy                           0.78     24451
   macro avg       0.78      0.78      0.78     24451
weighted avg       0.78      0.78      0.78     24451

              preci

{'fit_time': array([3.55133128, 2.70257092, 2.69433856, 2.70027089, 2.78093481]),
 'score_time': array([0.15702534, 0.15137935, 0.14949322, 0.15111232, 0.14883351]),
 'test_score': array([0.77973172, 0.7787502 , 0.77825038, 0.780009  , 0.77820948])}

### Gradient Boosting Classifier:

              precision    recall  f1-score   support

           0       0.69      0.76      0.72     11980
           1       0.74      0.67      0.70     12472

    accuracy                           0.71     24452
   macro avg       0.71      0.71      0.71     24452
weighted avg       0.71      0.71      0.71     24452

              precision    recall  f1-score   support

           0       0.69      0.75      0.72     11980
           1       0.74      0.67      0.70     12472

    accuracy                           0.71     24452
   macro avg       0.71      0.71      0.71     24452
weighted avg       0.71      0.71      0.71     24452

              precision    recall  f1-score   support

           0       0.69      0.76      0.72     11980
           1       0.75      0.67      0.70     12471

    accuracy                           0.71     24451
   macro avg       0.72      0.72      0.71     24451
weighted avg       0.72      0.71      0.71     24451

              preci

{'fit_time': array([17.41183996, 17.85050797, 17.37308812, 17.32717466, 17.49199319]),
 'score_time': array([0.0580008 , 0.06200027, 0.06092095, 0.05904531, 0.05851555]),
 'test_score': array([0.71110748, 0.71020775, 0.71481739, 0.71408122, 0.71494008])}

In [21]:
runPipelines(slowPipelines)

### SVC:

              precision    recall  f1-score   support

           0       0.69      0.77      0.73     11980
           1       0.75      0.67      0.71     12472

    accuracy                           0.72     24452
   macro avg       0.72      0.72      0.72     24452
weighted avg       0.72      0.72      0.72     24452

              precision    recall  f1-score   support

           0       0.70      0.76      0.73     11980
           1       0.75      0.68      0.72     12472

    accuracy                           0.72     24452
   macro avg       0.72      0.72      0.72     24452
weighted avg       0.73      0.72      0.72     24452

              precision    recall  f1-score   support

           0       0.69      0.76      0.73     11980
           1       0.75      0.68      0.71     12471

    accuracy                           0.72     24451
   macro avg       0.72      0.72      0.72     24451
weighted avg       0.72      0.72      0.72     24451

              preci

{'fit_time': array([530.65126586, 534.15418291, 530.34049916, 529.5051918 ,
        561.31728983]),
 'score_time': array([62.34380221, 62.49989772, 61.86725235, 63.3437078 , 61.64330173]),
 'test_score': array([0.71838704, 0.72284476, 0.71919349, 0.72177007, 0.72250624])}

### Linear SVC:



              precision    recall  f1-score   support

           0       0.69      0.72      0.71     11980
           1       0.72      0.69      0.71     12472

    accuracy                           0.71     24452
   macro avg       0.71      0.71      0.71     24452
weighted avg       0.71      0.71      0.71     24452





              precision    recall  f1-score   support

           0       0.69      0.72      0.70     11980
           1       0.72      0.68      0.70     12472

    accuracy                           0.70     24452
   macro avg       0.70      0.70      0.70     24452
weighted avg       0.70      0.70      0.70     24452





              precision    recall  f1-score   support

           0       0.69      0.72      0.71     11980
           1       0.72      0.69      0.71     12471

    accuracy                           0.71     24451
   macro avg       0.71      0.71      0.71     24451
weighted avg       0.71      0.71      0.71     24451





              precision    recall  f1-score   support

           0       0.69      0.72      0.71     11979
           1       0.72      0.70      0.71     12472

    accuracy                           0.71     24451
   macro avg       0.71      0.71      0.71     24451
weighted avg       0.71      0.71      0.71     24451

              precision    recall  f1-score   support

           0       0.70      0.73      0.71     11979
           1       0.72      0.69      0.71     12472

    accuracy                           0.71     24451
   macro avg       0.71      0.71      0.71     24451
weighted avg       0.71      0.71      0.71     24451





{'fit_time': array([22.37592173, 22.37971115, 21.44912124, 21.341676  , 21.16136408]),
 'score_time': array([0.03400016, 0.03199983, 0.03203559, 0.02957201, 0.03395605]),
 'test_score': array([0.70889907, 0.70219205, 0.7059834 , 0.70631058, 0.70950063])}

### SGD Classifier:

              precision    recall  f1-score   support

           0       0.69      0.69      0.69     11980
           1       0.71      0.71      0.71     12472

    accuracy                           0.70     24452
   macro avg       0.70      0.70      0.70     24452
weighted avg       0.70      0.70      0.70     24452

              precision    recall  f1-score   support

           0       0.67      0.74      0.70     11980
           1       0.72      0.65      0.68     12472

    accuracy                           0.69     24452
   macro avg       0.70      0.69      0.69     24452
weighted avg       0.70      0.69      0.69     24452

              precision    recall  f1-score   support

           0       0.71      0.64      0.67     11980
           1       0.68      0.75      0.71     12471

    accuracy                           0.70     24451
   macro avg       0.70      0.69      0.69     24451
weighted avg       0.70      0.70      0.69     24451

              preci

{'fit_time': array([0.65899491, 0.60500169, 0.570086  , 0.67596221, 0.60296655]),
 'score_time': array([0.03200531, 0.03092933, 0.03100061, 0.0300355 , 0.03000021]),
 'test_score': array([0.70018812, 0.69348111, 0.69539078, 0.70332502, 0.69682222])}

In [22]:
rfTest = deepcopy(fastPipelines["Random Forest"])
rfTest.fit(X_train, y_train)

Pipeline(steps=[('scalar3', StandardScaler()),
                ('rf_classifier', RandomForestClassifier(n_jobs=-1))])

In [23]:
rfModel = rfTest['rf_classifier']
res = {X_train.columns[i]: rfModel.feature_importances_[i] for i in range(len(X_train.columns))}
catTest = dict(sorted(res.items(), key=lambda item: item[1], reverse=True))
display(catTest)

{'T_equip_value': 0.10728388265393118,
 'CT_equip_value': 0.10720608126511465,
 'T_money': 0.08381280717843176,
 'CT_money': 0.08093528778953364,
 'round_status_time_left': 0.0620741685387155,
 'CT_score': 0.05114484659557133,
 'T_score': 0.05074206146849082,
 'T_total_health': 0.04896795809422872,
 'CT_total_health': 0.046101566706581866,
 'CT_num_flash': 0.03423028601425586,
 'T_num_flash': 0.03240098153693643,
 'T_num_alive_players': 0.031228726571132848,
 'CT_num_alive_players': 0.02626073703469736,
 'CT_num_smokes': 0.023066566215270583,
 'T_num_molly': 0.02207946004907905,
 'T_num_smokes': 0.02161807788632579,
 'CT_num_molly': 0.020870684486232978,
 'CT_has_Defuser': 0.019752884502988916,
 'CT_num_he': 0.019507723234756115,
 'T_num_he': 0.014815236970373458,
 'num_active_smokes': 0.013630683625824662,
 'map_de_inferno': 0.00913696148975221,
 'map_de_dust2': 0.008853503982418007,
 'map_de_mirage': 0.008741671159011435,
 'bomb_dropped': 0.008631113873885187,
 'map_de_nuke': 0.00853

In [10]:
scaler = PipelineHelper([
    ('std', StandardScaler()),
    ('maxAbs', MaxAbsScaler()),
])

# feat_reduc = PCA(n_components=10)
feat_reduc = PipelineHelper([
    ('pca', PCA(n_components=10)),
#     ('pass', 'passthrough'),
])

feat_selec = PipelineHelper([
    ('rfe', RFE(estimator=GaussianNB(), n_features_to_select=10)),
    ('kBest', SelectKBest(mutual_info_classif, k=10)),
#     ('pass', 'passthrough'),
])

clf = GaussianNB()

pipe1 = Pipeline([('scaler', scaler),
                        ('feat_reduc', feat_reduc),
                        ('feat_selec', feat_selec),
                        ('gnb_classifier', clf)])
# pipe1 = Pipeline([('scaler', 'passthrough'),
#                         ('feat_reduc', 'passthrough'),
#                         ('feat_selec', 'passthrough'),
#                         ('gnb_classifier', clf)])

# param_grid = dict(feat_reduc=['passthrough', PCA(5), PCA(10)],
# #                 clf=[SVC(), LogisticRegression()],
# #                 clf__C=[0.1, 10, 100]
#                 feat_selec=[RFE(), SelectKBest()]                  
#                 gnb_classifier__var_smoothing=[1e-10, 1e-9, 1e-8])

# param_grid = [
#     {
#         'scaler': [StandardScaler(), MaxAbsScaler()],
#         'feat_reduc': ['passthrough', PCA(n_components=6), PCA(n_components=10), PCA()],
#         'feat_selec': [RFE(estimator=GaussianNB())],
#         'feat_selec__n_features_to_select': [5, 10, 15],
#         'gnb_classifier__var_smoothing': [1e-10, 1e-9, 1e-8],
#     },
#     {
#         'scaler': [StandardScaler(), MaxAbsScaler()],
#         'feat_reduc': ['passthrough', PCA(n_components=6), PCA(n_components=10), PCA()],
#         'feat_selec': [SelectKBest()],
#         'feat_selec__estimator': [GaussianNB(), RandomForestClassifier(n_estimators=100, n_jobs=-1)],
#         'feat_selec__k': [5, 10, 15],
#         'gnb_classifier__var_smoothing': [1e-10, 1e-9, 1e-8],
#     }
# ]

params = {
#     'scaler__selected_model': pipe1.named_steps['scaler'].generate({
#         'std__with_mean': [True, False],
#         'std__with_std': [True, False],
#     }),
    'feat_reduc__selected_model': pipe1.named_steps['feat_reduc'].generate({
        'pca__n_components': [15,20,25,30],
    }),
    'feat_selec__selected_model': pipe1.named_steps['feat_selec'].generate({
        'rfe__n_features_to_select': [5, 10, 15],
        'rfe__estimator': [DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100, n_jobs=1)],
        'kBest__k': [5, 10, 15, 20, 25],
    }),
#     'gnb_classifier__var_smoothing': [1e-10, 1e-9, 1e-8],
}

# dict(feat_reduc=['passthrough', PCA(5), PCA(10)],
# #                 clf=[SVC(), LogisticRegression()],
# #                 clf__C=[0.1, 10, 100]
#                 feat_selec=[RFE(), SelectKBest()]                  
#                 gnb_classifier__var_smoothing=[1e-10, 1e-9, 1e-8])

cv = StratifiedKFold(n_splits=5, shuffle=True)

if __name__ == '__main__':
    model = GridSearchCV(pipe1, param_grid=params, scoring='accuracy', n_jobs=12, cv=cv, verbose=2)
    model.fit(X, y)
    print(model)
    print("Best Score: ")
    print(model.best_score_)

Fitting 5 folds for each of 660 candidates, totalling 3300 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.0min
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  4.8min
[Parallel(n_jobs=12)]: Done 341 tasks      | elapsed: 71.8min


KeyboardInterrupt: 