# MODEL SELECTION - OVERLEAF TABLES

In this notebook we just prepare the tables to be written in Overleaf document.

Probably, the LaTeX tables need some manual adjustment afterwards.

## Modules and configuration

### Modules

In [1]:
import pandas as pd

import sys

from collections import OrderedDict

from sklearn.linear_model import Perceptron, LogisticRegression, PassiveAggressiveClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, HistGradientBoostingClassifier

from sklearn.gaussian_process.kernels import RBF, RationalQuadratic, DotProduct

import pickle


### Configuration

In [40]:
RANDOM_STATE = 11

RESULTS_SUMMARY_FILE = "ModelPreselection_PrecisionResults_OversampledSMOTE_n3.csv"
DEFAULT_MODELS_IN = "Default_models.csv"
MODELS_FOLDER = "../data/ML_MODELS/ML_model_preselection/"


OTS_CLF_GRIDS_OUT = "ots_clf_param_grids_n3.pickle"

LATEX_INI_RESULTS_OUT = "Default_results_n3_ML_preselect.tex"
LATEX_OPT_RESULTS_OUT = "Opt_results_n3_ML_preselect.tex"
LATEX_PARAMGRID_OUT = "Initial_param_grid_n3_ML_preselect.tex"
LATEX_DEFAULT_PARAMS_OUT = "Default_params_n3_ML_preselect.tex"
LATEX_DEFAULT_MODELS_OUT = "Default_models_n3_ML_preselect.tex"

OFF_THE_SHELF_CLASSIFIERS = OrderedDict({
    'Perceptron': {
        'clf': Perceptron(),
        'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                       'alpha': [0.001, 0.0001, 0.00001],
                       'l1_ratio': [None, 0.075, 0.15, 0.30],
                       'max_iter': [500, 1000, 2000],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'LogisticRegression': {
        'clf': LogisticRegression(),
        'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.5, 1.0, 2.0],
                       'l1_ratio': [None, 0.075, 0.15, 0.30],
                       'solver': ['saga'],
                       'max_iter': [50, 100, 200],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'PassiveAggressiveClassifier': {
        'clf': PassiveAggressiveClassifier(),
        'param_grid': {'C': [0.5, 1.0, 2.0],
                       'max_iter': [500, 1000, 2000],
                       'loss': ['hinge', 'squared_hinge'],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'SVC': {
        'clf': SVC(),
        'param_grid': {'C': [0.5, 1.0, 2.0],
                       'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'recomputed'],
                       'degree': [2, 3, 6],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'KNeighborsClassifier': {
        'clf': KNeighborsClassifier(),
        'param_grid': {'n_neighbors': [1, 3, 5, 10],
                       'weights': ['uniform', 'distance'],
                       'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                       'p': [1, 2]
                      }
    },
    'GaussianProcessClassifier': {
        'clf': GaussianProcessClassifier(),
        'param_grid': {'kernel': [RBF(), RationalQuadratic(), DotProduct()],
                       'max_iter_predict': [50, 100, 200],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'DecisionTreeClassifier': {
        'clf': DecisionTreeClassifier(),
        'param_grid': {'criterion': ['gini', 'entropy', 'log_loss'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'max_features': [None, 'sqrt', 'log2'],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'RandomForestClassifier': {
        'clf': RandomForestClassifier(),
        'param_grid': {'n_estimators': [50, 100, 200],
                       'criterion': ['gini', 'entropy', 'log_loss'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'max_features': [None, 'sqrt', 'log2'],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'AdaBoostClassifier': {
        'clf': AdaBoostClassifier(),
        'param_grid': {'n_estimators': [25, 50, 100],
                       'learning_rate': [0.5, 1.0, 2.0],
                       'algorithm': ['SAMME', 'SAMME.R'],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'GradientBoostingClassifier': {
        'clf': GradientBoostingClassifier(),
        'param_grid': {'loss': ['log_loss', 'deviance'],
                       'learning_rate': [0.05, 0.1, 0.2],
                       'n_estimators': [25, 50, 100],
                       'criterion': ['friedman_mse', 'squared_error'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    }
})


#S4_METADATA_FILE = "../data/DATASETS_CESIUM/cesium_ML_FINAL_S4.csv"
#S4_RV_CURVES_INFO_FILE = "../data/SYNTH_RV_SAMPLES/RV_FINAL_ML_SyntheticDatasets.csv"

#REL_FEATURES_IN = "../data/ML_MODELS/ML_pipeline_steps/Reliable_features.pickle"
#UNREL_FEATURES_IN = "../data/ML_MODELS/ML_pipeline_steps/Unreliable_features.pickle"

#MODELS_FOLDER = "../data/ML_MODELS/ML_model_preselection/"

#ADD_META_COLUMNS = [
#    'S3_sampling_idx', 'S3_Tobs', 'S3_Ps_mean', 'S3_Ps_median', 'S3_Ps_stdev', 'S3_NumPoints',
#    'S2_errorRV_dist_idx', 'S2_errorRV_dist_name', 'S2_errorRV_dist_loc', 'S2_errorRV_dist_scale',
#    'S4_errorRV_mean', 'S4_errorRV_median', 'S4_errorRV_stdev'
#]

#IMG_FOLDER = "./img/"

### Functions

## Load data

### Load default models

In [41]:
models = pd.read_csv(MODELS_FOLDER + DEFAULT_MODELS_IN, sep=',', decimal='.')
models

Unnamed: 0,Family,Classifier
0,Linear models,Perceptron
1,Linear models,LogisticRegression
2,Linear models,PassiveAggressiveClassifier
3,Support Vector Machines,SVC
4,Nearest-Neighbors,KNeighborsClassifier
5,Gaussian Processes,GaussianProcessClassifier
6,Tree models,DecisionTreeClassifier
7,Ensemble models,RandomForestClassifier
8,Ensemble models,AdaBoostClassifier
9,Ensemble models,GradientBoostingClassifier


### Load results

In [42]:
RESULTS_SUMMARY_FILE

'ModelPreselection_PrecisionResults_OversampledSMOTE_n3.csv'

In [43]:
results = pd.read_csv(MODELS_FOLDER + RESULTS_SUMMARY_FILE, sep=',', decimal='.')
results

Unnamed: 0,Classifier,BM_tr_precision,BM_val_precision,BMOPT_tr_precision,BMOPT_val_precision
0,Perceptron,0.63,0.1,0.62,0.09
1,LogisticRegression,0.66,0.09,0.67,0.08
2,PassiveAggressiveClassifier,0.65,0.1,0.62,0.1
3,SVC,0.93,0.14,0.97,0.19
4,KNeighborsClassifier,0.77,0.11,1.0,0.12
5,GaussianProcessClassifier,1.0,0.11,0.98,0.13
6,DecisionTreeClassifier,1.0,0.08,0.95,0.12
7,RandomForestClassifier,1.0,0.2,0.99,0.06
8,AdaBoostClassifier,0.87,0.13,0.92,0.14
9,GradientBoostingClassifier,0.97,0.1,0.84,0.09


#### Separate initial results

In [44]:
initial_results = results[['Classifier', 'BM_tr_precision', 'BM_val_precision']].copy()
initial_results

Unnamed: 0,Classifier,BM_tr_precision,BM_val_precision
0,Perceptron,0.63,0.1
1,LogisticRegression,0.66,0.09
2,PassiveAggressiveClassifier,0.65,0.1
3,SVC,0.93,0.14
4,KNeighborsClassifier,0.77,0.11
5,GaussianProcessClassifier,1.0,0.11
6,DecisionTreeClassifier,1.0,0.08
7,RandomForestClassifier,1.0,0.2
8,AdaBoostClassifier,0.87,0.13
9,GradientBoostingClassifier,0.97,0.1


In [45]:
initial_results.rename(columns={'BM_tr_precision': 'Training sample', 'BM_val_precision': 'Validation sample'},
                       inplace=True)
initial_results

Unnamed: 0,Classifier,Training sample,Validation sample
0,Perceptron,0.63,0.1
1,LogisticRegression,0.66,0.09
2,PassiveAggressiveClassifier,0.65,0.1
3,SVC,0.93,0.14
4,KNeighborsClassifier,0.77,0.11
5,GaussianProcessClassifier,1.0,0.11
6,DecisionTreeClassifier,1.0,0.08
7,RandomForestClassifier,1.0,0.2
8,AdaBoostClassifier,0.87,0.13
9,GradientBoostingClassifier,0.97,0.1


In [46]:
initial_results

Unnamed: 0,Classifier,Training sample,Validation sample
0,Perceptron,0.63,0.1
1,LogisticRegression,0.66,0.09
2,PassiveAggressiveClassifier,0.65,0.1
3,SVC,0.93,0.14
4,KNeighborsClassifier,0.77,0.11
5,GaussianProcessClassifier,1.0,0.11
6,DecisionTreeClassifier,1.0,0.08
7,RandomForestClassifier,1.0,0.2
8,AdaBoostClassifier,0.87,0.13
9,GradientBoostingClassifier,0.97,0.1


In [47]:
initial_results.columns = pd.MultiIndex.from_product([['Default models'],
                                                     initial_results.columns])
initial_results

Unnamed: 0_level_0,Default models,Default models,Default models
Unnamed: 0_level_1,Classifier,Training sample,Validation sample
0,Perceptron,0.63,0.1
1,LogisticRegression,0.66,0.09
2,PassiveAggressiveClassifier,0.65,0.1
3,SVC,0.93,0.14
4,KNeighborsClassifier,0.77,0.11
5,GaussianProcessClassifier,1.0,0.11
6,DecisionTreeClassifier,1.0,0.08
7,RandomForestClassifier,1.0,0.2
8,AdaBoostClassifier,0.87,0.13
9,GradientBoostingClassifier,0.97,0.1


#### Separate optimized results

In [48]:
opt_results = results[['Classifier', 'BMOPT_tr_precision', 'BMOPT_val_precision']].copy()
opt_results

Unnamed: 0,Classifier,BMOPT_tr_precision,BMOPT_val_precision
0,Perceptron,0.62,0.09
1,LogisticRegression,0.67,0.08
2,PassiveAggressiveClassifier,0.62,0.1
3,SVC,0.97,0.19
4,KNeighborsClassifier,1.0,0.12
5,GaussianProcessClassifier,0.98,0.13
6,DecisionTreeClassifier,0.95,0.12
7,RandomForestClassifier,0.99,0.06
8,AdaBoostClassifier,0.92,0.14
9,GradientBoostingClassifier,0.84,0.09


In [49]:
opt_results.rename(columns={'BMOPT_tr_precision': 'Training sample', 'BMOPT_val_precision': 'Validation sample'},
                       inplace=True)
opt_results

Unnamed: 0,Classifier,Training sample,Validation sample
0,Perceptron,0.62,0.09
1,LogisticRegression,0.67,0.08
2,PassiveAggressiveClassifier,0.62,0.1
3,SVC,0.97,0.19
4,KNeighborsClassifier,1.0,0.12
5,GaussianProcessClassifier,0.98,0.13
6,DecisionTreeClassifier,0.95,0.12
7,RandomForestClassifier,0.99,0.06
8,AdaBoostClassifier,0.92,0.14
9,GradientBoostingClassifier,0.84,0.09


In [50]:
opt_results

Unnamed: 0,Classifier,Training sample,Validation sample
0,Perceptron,0.62,0.09
1,LogisticRegression,0.67,0.08
2,PassiveAggressiveClassifier,0.62,0.1
3,SVC,0.97,0.19
4,KNeighborsClassifier,1.0,0.12
5,GaussianProcessClassifier,0.98,0.13
6,DecisionTreeClassifier,0.95,0.12
7,RandomForestClassifier,0.99,0.06
8,AdaBoostClassifier,0.92,0.14
9,GradientBoostingClassifier,0.84,0.09


In [51]:
opt_results.columns = pd.MultiIndex.from_product([['Optimized models'], opt_results.columns])
opt_results

Unnamed: 0_level_0,Optimized models,Optimized models,Optimized models
Unnamed: 0_level_1,Classifier,Training sample,Validation sample
0,Perceptron,0.62,0.09
1,LogisticRegression,0.67,0.08
2,PassiveAggressiveClassifier,0.62,0.1
3,SVC,0.97,0.19
4,KNeighborsClassifier,1.0,0.12
5,GaussianProcessClassifier,0.98,0.13
6,DecisionTreeClassifier,0.95,0.12
7,RandomForestClassifier,0.99,0.06
8,AdaBoostClassifier,0.92,0.14
9,GradientBoostingClassifier,0.84,0.09


### Prepare initial classifier results

In [52]:
OFF_THE_SHELF_CLASSIFIERS

OrderedDict([('Perceptron',
              {'clf': Perceptron(),
               'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                'alpha': [0.001, 0.0001, 1e-05],
                'l1_ratio': [None, 0.075, 0.15, 0.3],
                'max_iter': [500, 1000, 2000],
                'random_state': [11]}}),
             ('LogisticRegression',
              {'clf': LogisticRegression(),
               'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                'C': [0.5, 1.0, 2.0],
                'l1_ratio': [None, 0.075, 0.15, 0.3],
                'solver': ['saga'],
                'max_iter': [50, 100, 200],
                'random_state': [11]}}),
             ('PassiveAggressiveClassifier',
              {'clf': PassiveAggressiveClassifier(),
               'param_grid': {'C': [0.5, 1.0, 2.0],
                'max_iter': [500, 1000, 2000],
                'loss': ['hinge', 'squared_hinge'],
                'random_state': [11]}}),
             ('SVC',
   

#### Save the classifier dictionary object to file

In [53]:
pickle.dump(OFF_THE_SHELF_CLASSIFIERS, open(MODELS_FOLDER + OTS_CLF_GRIDS_OUT, 'wb'))

#### Prepare the default parameter info

In [54]:
SVC().get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [91]:
clf_list = []
def_param_list = []
for clf in OFF_THE_SHELF_CLASSIFIERS.keys():
    clf_list.append(clf)
    clf_params = OFF_THE_SHELF_CLASSIFIERS[clf]['clf'].get_params()
    param_values = ""
    for param in OFF_THE_SHELF_CLASSIFIERS[clf]['param_grid'].keys():
        param_values += param + ": " + str(clf_params[param]) + " | "
    param_values = param_values[:-3]
    def_param_list.append(param_values)

clf_def_params = pd.DataFrame(data={'Model': clf_list, 'Main parameters default values': def_param_list})
clf_def_params

Unnamed: 0,Model,Main parameters default values
0,Perceptron,penalty: None | alpha: 0.0001 | l1_ratio: 0.15 | max_iter: 1000 | random_state: 0
1,LogisticRegression,penalty: l2 | C: 1.0 | l1_ratio: None | solver: lbfgs | max_iter: 100 | random_state: None
2,PassiveAggressiveClassifier,C: 1.0 | max_iter: 1000 | loss: hinge | random_state: None
3,SVC,C: 1.0 | kernel: rbf | degree: 3 | random_state: None
4,KNeighborsClassifier,n_neighbors: 5 | weights: uniform | algorithm: auto | p: 2
5,GaussianProcessClassifier,kernel: None | max_iter_predict: 100 | random_state: None
6,DecisionTreeClassifier,criterion: gini | max_depth: None | min_samples_leaf: 1 | max_features: None | ccp_alpha: 0.0 | random_state: None
7,RandomForestClassifier,n_estimators: 100 | criterion: gini | max_depth: None | min_samples_leaf: 1 | max_features: sqrt | ccp_alpha: 0.0 | random_state: None
8,AdaBoostClassifier,n_estimators: 50 | learning_rate: 1.0 | algorithm: SAMME.R | random_state: None
9,GradientBoostingClassifier,loss: log_loss | learning_rate: 0.1 | n_estimators: 100 | criterion: friedman_mse | max_depth: 3 | min_samples_leaf: 1 | ccp_alpha: 0.0 | random_state: None


#### Prepare the parameter grid info

In [95]:
clf_list = []
param_list = []
for clf in OFF_THE_SHELF_CLASSIFIERS.keys():
    clf_list.append(clf)
    param_grid = ""
    for param in OFF_THE_SHELF_CLASSIFIERS[clf]['param_grid'].keys():
        param_grid += param + ": " + str(OFF_THE_SHELF_CLASSIFIERS[clf]['param_grid'][param]) + " | "
    param_grid = param_grid[:-3]
    param_list.append(param_grid)

clf_param_grid = pd.DataFrame(data={'Model': clf_list, 'Initial parameter grid': param_list})
clf_param_grid

Unnamed: 0,Model,Initial parameter grid
0,Perceptron,"penalty: ['l1', 'l2', 'elasticnet'] | alpha: [0.001, 0.0001, 1e-05] | l1_ratio: [None, 0.075, 0.15, 0.3] | max_iter: [500, 1000, 2000] | random_state: [11]"
1,LogisticRegression,"penalty: ['l1', 'l2', 'elasticnet'] | C: [0.5, 1.0, 2.0] | l1_ratio: [None, 0.075, 0.15, 0.3] | solver: ['saga'] | max_iter: [50, 100, 200] | random_state: [11]"
2,PassiveAggressiveClassifier,"C: [0.5, 1.0, 2.0] | max_iter: [500, 1000, 2000] | loss: ['hinge', 'squared_hinge'] | random_state: [11]"
3,SVC,"C: [0.5, 1.0, 2.0] | kernel: ['linear', 'poly', 'rbf', 'sigmoid', 'recomputed'] | degree: [2, 3, 6] | random_state: [11]"
4,KNeighborsClassifier,"n_neighbors: [1, 3, 5, 10] | weights: ['uniform', 'distance'] | algorithm: ['ball_tree', 'kd_tree', 'brute'] | p: [1, 2]"
5,GaussianProcessClassifier,"kernel: [RBF(length_scale=1), RationalQuadratic(alpha=1, length_scale=1), DotProduct(sigma_0=1)] | max_iter_predict: [50, 100, 200] | random_state: [11]"
6,DecisionTreeClassifier,"criterion: ['gini', 'entropy', 'log_loss'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | max_features: [None, 'sqrt', 'log2'] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"
7,RandomForestClassifier,"n_estimators: [50, 100, 200] | criterion: ['gini', 'entropy', 'log_loss'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | max_features: [None, 'sqrt', 'log2'] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"
8,AdaBoostClassifier,"n_estimators: [25, 50, 100] | learning_rate: [0.5, 1.0, 2.0] | algorithm: ['SAMME', 'SAMME.R'] | random_state: [11]"
9,GradientBoostingClassifier,"loss: ['log_loss', 'deviance'] | learning_rate: [0.05, 0.1, 0.2] | n_estimators: [25, 50, 100] | criterion: ['friedman_mse', 'squared_error'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"


## Convert to LaTeX

In [57]:
pd.options.display.float_format = '{:,.2f}'.format

In [73]:
pd.options.display.max_colwidth = None

### Default models and families

In [74]:
models

Unnamed: 0,Family,Classifier
0,Linear models,Perceptron
1,Linear models,LogisticRegression
2,Linear models,PassiveAggressiveClassifier
3,Support Vector Machines,SVC
4,Nearest-Neighbors,KNeighborsClassifier
5,Gaussian Processes,GaussianProcessClassifier
6,Tree models,DecisionTreeClassifier
7,Ensemble models,RandomForestClassifier
8,Ensemble models,AdaBoostClassifier
9,Ensemble models,GradientBoostingClassifier


In [75]:
# Print to screen:
print(models.to_latex(index=False, longtable=False,
                      caption=("Default models chosen, by family.",
                               "Default models chosen, by family.")))



\begin{table}
\centering
\caption[Default models chosen, by family.]{Default models chosen, by family.}
\begin{tabular}{ll}
\toprule
                 Family &                  Classifier \\
\midrule
          Linear models &                  Perceptron \\
          Linear models &          LogisticRegression \\
          Linear models & PassiveAggressiveClassifier \\
Support Vector Machines &                         SVC \\
      Nearest-Neighbors &        KNeighborsClassifier \\
     Gaussian Processes &   GaussianProcessClassifier \\
            Tree models &      DecisionTreeClassifier \\
        Ensemble models &      RandomForestClassifier \\
        Ensemble models &          AdaBoostClassifier \\
        Ensemble models &  GradientBoostingClassifier \\
\bottomrule
\end{tabular}
\end{table}



  print(models.to_latex(index=False, longtable=False,


In [76]:
# Save to file:
# Write to file:
# Backup the standard output:
original_stdout = sys.stdout
with open(LATEX_DEFAULT_MODELS_OUT, 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(models.to_latex(index=False, longtable=False,
                          caption=("Default models chosen, by family.",
                                   "Default models chosen, by family.")))
# Restore the standard output to its original value
sys.stdout = original_stdout

  print(models.to_latex(index=False, longtable=False,


### Default params for models

In [92]:
clf_def_params

Unnamed: 0,Model,Main parameters default values
0,Perceptron,penalty: None | alpha: 0.0001 | l1_ratio: 0.15 | max_iter: 1000 | random_state: 0
1,LogisticRegression,penalty: l2 | C: 1.0 | l1_ratio: None | solver: lbfgs | max_iter: 100 | random_state: None
2,PassiveAggressiveClassifier,C: 1.0 | max_iter: 1000 | loss: hinge | random_state: None
3,SVC,C: 1.0 | kernel: rbf | degree: 3 | random_state: None
4,KNeighborsClassifier,n_neighbors: 5 | weights: uniform | algorithm: auto | p: 2
5,GaussianProcessClassifier,kernel: None | max_iter_predict: 100 | random_state: None
6,DecisionTreeClassifier,criterion: gini | max_depth: None | min_samples_leaf: 1 | max_features: None | ccp_alpha: 0.0 | random_state: None
7,RandomForestClassifier,n_estimators: 100 | criterion: gini | max_depth: None | min_samples_leaf: 1 | max_features: sqrt | ccp_alpha: 0.0 | random_state: None
8,AdaBoostClassifier,n_estimators: 50 | learning_rate: 1.0 | algorithm: SAMME.R | random_state: None
9,GradientBoostingClassifier,loss: log_loss | learning_rate: 0.1 | n_estimators: 100 | criterion: friedman_mse | max_depth: 3 | min_samples_leaf: 1 | ccp_alpha: 0.0 | random_state: None


In [93]:
# Print to screen:
print(clf_def_params.to_latex(index=False, longtable=True,
                              caption=("Default params for models chosen.",
                                       "Default params for models chosen.")))



\begin{longtable}{ll}
\caption[Default params for models chosen.]{Default params for models chosen.}\\
\toprule
                      Model &                                                                                                                               Main parameters default values \\
\midrule
\endfirsthead
\caption[]{Default params for models chosen.} \\
\toprule
                      Model &                                                                                                                               Main parameters default values \\
\midrule
\endhead
\midrule
\multicolumn{2}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
                 Perceptron &                                                                            penalty: None | alpha: 0.0001 | l1\_ratio: 0.15 | max\_iter: 1000 | random\_state: 0 \\
         LogisticRegression &                                                                   penalty: l2 | C: 1.

  print(clf_def_params.to_latex(index=False, longtable=True,


In [94]:
# Save to file:
# Write to file:
# Backup the standard output:
original_stdout = sys.stdout
with open(LATEX_DEFAULT_PARAMS_OUT, 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(clf_def_params.to_latex(index=False, longtable=True,
                                  caption=("Default params for models chosen.",
                                           "Default params for models chosen.")))
# Restore the standard output to its original value
sys.stdout = original_stdout

  print(clf_def_params.to_latex(index=False, longtable=True,


### Default models results

In [80]:
initial_results

Unnamed: 0_level_0,Default models,Default models,Default models
Unnamed: 0_level_1,Classifier,Training sample,Validation sample
0,Perceptron,0.63,0.1
1,LogisticRegression,0.66,0.09
2,PassiveAggressiveClassifier,0.65,0.1
3,SVC,0.93,0.14
4,KNeighborsClassifier,0.77,0.11
5,GaussianProcessClassifier,1.0,0.11
6,DecisionTreeClassifier,1.0,0.08
7,RandomForestClassifier,1.0,0.2
8,AdaBoostClassifier,0.87,0.13
9,GradientBoostingClassifier,0.97,0.1


In [81]:
# Print to screen:
print(initial_results.to_latex(index=False, longtable=False,
                               caption=("Precision results for default ML models.",
                                        "Precision results for default ML models.")))



\begin{table}
\centering
\caption[Precision results for default ML models.]{Precision results for default ML models.}
\begin{tabular}{lrr}
\toprule
             Default models \\
                 Classifier & Training sample & Validation sample \\
\midrule
                 Perceptron &            0.63 &              0.10 \\
         LogisticRegression &            0.66 &              0.09 \\
PassiveAggressiveClassifier &            0.65 &              0.10 \\
                        SVC &            0.93 &              0.14 \\
       KNeighborsClassifier &            0.77 &              0.11 \\
  GaussianProcessClassifier &            1.00 &              0.11 \\
     DecisionTreeClassifier &            1.00 &              0.08 \\
     RandomForestClassifier &            1.00 &              0.20 \\
         AdaBoostClassifier &            0.87 &              0.13 \\
 GradientBoostingClassifier &            0.97 &              0.10 \\
\bottomrule
\end{tabular}
\end{table}



  print(initial_results.to_latex(index=False, longtable=False,


In [82]:
# Save to file:
# Write to file:
# Backup the standard output:
original_stdout = sys.stdout
with open(LATEX_INI_RESULTS_OUT, 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(initial_results.to_latex(index=False, longtable=False,
                                   caption=("Precision results for default ML models.",
                                            "Precision results for default ML models.")))
# Restore the standard output to its original value
sys.stdout = original_stdout

  print(initial_results.to_latex(index=False, longtable=False,


### Optimized models results

In [83]:
opt_results

Unnamed: 0_level_0,Optimized models,Optimized models,Optimized models
Unnamed: 0_level_1,Classifier,Training sample,Validation sample
0,Perceptron,0.62,0.09
1,LogisticRegression,0.67,0.08
2,PassiveAggressiveClassifier,0.62,0.1
3,SVC,0.97,0.19
4,KNeighborsClassifier,1.0,0.12
5,GaussianProcessClassifier,0.98,0.13
6,DecisionTreeClassifier,0.95,0.12
7,RandomForestClassifier,0.99,0.06
8,AdaBoostClassifier,0.92,0.14
9,GradientBoostingClassifier,0.84,0.09


In [84]:
# Print to screen:
print(opt_results.to_latex(index=False, longtable=False,
                           caption=("Precision results for optimized ML models.",
                                    "Precision results for optimized ML models.")))



\begin{table}
\centering
\caption[Precision results for optimized ML models.]{Precision results for optimized ML models.}
\begin{tabular}{lrr}
\toprule
           Optimized models \\
                 Classifier & Training sample & Validation sample \\
\midrule
                 Perceptron &            0.62 &              0.09 \\
         LogisticRegression &            0.67 &              0.08 \\
PassiveAggressiveClassifier &            0.62 &              0.10 \\
                        SVC &            0.97 &              0.19 \\
       KNeighborsClassifier &            1.00 &              0.12 \\
  GaussianProcessClassifier &            0.98 &              0.13 \\
     DecisionTreeClassifier &            0.95 &              0.12 \\
     RandomForestClassifier &            0.99 &              0.06 \\
         AdaBoostClassifier &            0.92 &              0.14 \\
 GradientBoostingClassifier &            0.84 &              0.09 \\
\bottomrule
\end{tabular}
\end{table}



  print(opt_results.to_latex(index=False, longtable=False,


In [85]:
# Save to file:
# Write to file:
# Backup the standard output:
original_stdout = sys.stdout
with open(LATEX_OPT_RESULTS_OUT, 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(opt_results.to_latex(index=False, longtable=False,
                               caption=("Precision results for optimized ML models.",
                                        "Precision results for optimized ML models.")))
# Restore the standard output to its original value
sys.stdout = original_stdout

  print(opt_results.to_latex(index=False, longtable=False,


### Classifier parameter grids

In [96]:
clf_param_grid

Unnamed: 0,Model,Initial parameter grid
0,Perceptron,"penalty: ['l1', 'l2', 'elasticnet'] | alpha: [0.001, 0.0001, 1e-05] | l1_ratio: [None, 0.075, 0.15, 0.3] | max_iter: [500, 1000, 2000] | random_state: [11]"
1,LogisticRegression,"penalty: ['l1', 'l2', 'elasticnet'] | C: [0.5, 1.0, 2.0] | l1_ratio: [None, 0.075, 0.15, 0.3] | solver: ['saga'] | max_iter: [50, 100, 200] | random_state: [11]"
2,PassiveAggressiveClassifier,"C: [0.5, 1.0, 2.0] | max_iter: [500, 1000, 2000] | loss: ['hinge', 'squared_hinge'] | random_state: [11]"
3,SVC,"C: [0.5, 1.0, 2.0] | kernel: ['linear', 'poly', 'rbf', 'sigmoid', 'recomputed'] | degree: [2, 3, 6] | random_state: [11]"
4,KNeighborsClassifier,"n_neighbors: [1, 3, 5, 10] | weights: ['uniform', 'distance'] | algorithm: ['ball_tree', 'kd_tree', 'brute'] | p: [1, 2]"
5,GaussianProcessClassifier,"kernel: [RBF(length_scale=1), RationalQuadratic(alpha=1, length_scale=1), DotProduct(sigma_0=1)] | max_iter_predict: [50, 100, 200] | random_state: [11]"
6,DecisionTreeClassifier,"criterion: ['gini', 'entropy', 'log_loss'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | max_features: [None, 'sqrt', 'log2'] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"
7,RandomForestClassifier,"n_estimators: [50, 100, 200] | criterion: ['gini', 'entropy', 'log_loss'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | max_features: [None, 'sqrt', 'log2'] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"
8,AdaBoostClassifier,"n_estimators: [25, 50, 100] | learning_rate: [0.5, 1.0, 2.0] | algorithm: ['SAMME', 'SAMME.R'] | random_state: [11]"
9,GradientBoostingClassifier,"loss: ['log_loss', 'deviance'] | learning_rate: [0.05, 0.1, 0.2] | n_estimators: [25, 50, 100] | criterion: ['friedman_mse', 'squared_error'] | max_depth: [25, 50, 100] | min_samples_leaf: [5, 10, 20] | ccp_alpha: [0.005, 0.015, 0.03] | random_state: [11]"


**NOTE:** this table will probably need major manual rework afterwards.

In [97]:
# Print to screen:
print(clf_param_grid.to_latex(index=False, longtable=True,
                              caption=("Model preselection. Initial parameter grids.",
                                       "Model preselection. Initial parameter grids.")))



\begin{longtable}{ll}
\caption[Model preselection. Initial parameter grids.]{Model preselection. Initial parameter grids.}\\
\toprule
                      Model &                                                                                                                                                                                                                                          Initial parameter grid \\
\midrule
\endfirsthead
\caption[]{Model preselection. Initial parameter grids.} \\
\toprule
                      Model &                                                                                                                                                                                                                                          Initial parameter grid \\
\midrule
\endhead
\midrule
\multicolumn{2}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
                 Perceptron &                                                

  print(clf_param_grid.to_latex(index=False, longtable=True,


In [98]:
# Save to file:
# Write to file:
# Backup the standard output:
original_stdout = sys.stdout
with open(LATEX_PARAMGRID_OUT, 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(clf_param_grid.to_latex(index=False, longtable=True,
                                  caption=("Model preselection. Initial parameter grids.",
                                           "Model preselection. Initial parameter grids.")))
# Restore the standard output to its original value
sys.stdout = original_stdout

  print(clf_param_grid.to_latex(index=False, longtable=True,


## Summary

**RESULTS:**

- Generated and saved the LaTeX tables for the Masters' thesis documentation.

