#  PoC - TRAINIGN THE MODELS WITH LIMITED MINIMUM AMPLITUDE

In this Notebook we try to train the classifier models employing only non-pulsating stars and pulsating stars with a minimum value of amplitude.

**Note:** we train with the new S4 sample that we created, with the amplitude grids, and validate also with the new S4 validation set (i.e. same CARMENES idx for sampling pattern and noise distribution).

## Modules and configuration

### Modules

In [38]:
import pandas as pd
import numpy as np

import copy

from time import time

from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, \
    f1_score, log_loss, matthews_corrcoef, classification_report, \
    get_scorer_names, confusion_matrix

from collections import OrderedDict

import warnings

#from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

from sklearn.gaussian_process.kernels import RBF, RationalQuadratic, DotProduct

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit # Se usa para evitar overfitting
#from sklearn.model_selection import cross_validate #### NOTA, IGUAL ES MEJOR ESTE, PARA TENER EL CONTROL Y SACAR
# TODOS LOS RUNS QUE QUERAMOS
###### NO HACE FALTA EL GRID SEARCH NI NADA DE ESTO??? SIMPLEMENTE LE HACEMOS EL "FIT" Y LUEGO MEDIMOS CON
# EL "PREDICT" SOBRE EL VALIDATION SET

from sklearn.linear_model import Perceptron, LogisticRegression, PassiveAggressiveClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, HistGradientBoostingClassifier

import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white", {'figure.figsize':(15,10)})

from IPython.display import display

# from imblearn import 

### Configuration

In [32]:
RANDOM_STATE = 11 # For reproducibility

AMP_LIMIT = 1.2
CASE_ID = "A" + str(AMP_LIMIT)

S4_TRAIN_SET_IN = "../data/DATASETS_ML/1NN/1NN_TRAIN_S4B_02_DS_AfterImputing.csv"
S4_VALIDATION_SET_IN = "../data/DATASETS_ML/1NN/1NN_VAL_S4B_02_DS_AfterImputing.csv"

REL_FEATURES_IN = "../data/ML_MODELS/ML_pipeline_steps/Reliable_features.pickle"
#UNREL_FEATURES_IN = "../data/ML_MODELS/ML_pipeline_steps/Unreliable_features.pickle"

ML_ADD_COLUMNS = ['Karmn'] # Only cesium features and this column will be kept.
S4_ADD_COLUMNS = ['ID', 'Pulsating', 'frequency', 'amplitudeRV',
                  'offsetRV', 'refepochRV', 'phase',
                  'CARMENES_source_idx', 'CARMENES_Ref_star'] # Only cesium features and these columns will be kept.

MODELS_FOLDER = "../data/ML_MODELS/ML_model_preselection/"

PRECISION_RESULTS_OUT = "ModelPreselection_PrecisionResults_1NN_" + CASE_ID + ".csv"
VAL_PREDICTIONS_OUT = "ModelPreselection_ValidationPredictions_1NN_S4_" + CASE_ID + ".csv"

# Note: it would be better to use a jason file for this configuration.
OFF_THE_SHELF_CLASSIFIERS = OrderedDict({
    'Perceptron': {
        'clf': Perceptron(),
        'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                       'alpha': [0.001, 0.0001, 0.00001],
                       'l1_ratio': [None, 0.075, 0.15, 0.30],
                       'max_iter': [500, 1000, 2000],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'LogisticRegression': {
        'clf': LogisticRegression(),
        'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.5, 1.0, 2.0],
                       'l1_ratio': [None, 0.075, 0.15, 0.30],
                       'solver': ['saga'],
                       'max_iter': [50, 100, 200],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'PassiveAggressiveClassifier': {
        'clf': PassiveAggressiveClassifier(),
        'param_grid': {'C': [0.5, 1.0, 2.0],
                       'max_iter': [500, 1000, 2000],
                       'loss': ['hinge', 'squared_hinge'],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'SVC': {
        'clf': SVC(),
        'param_grid': {'C': [0.5, 1.0, 2.0],
                       'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'recomputed'],
                       'degree': [2, 3, 6],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'KNeighborsClassifier': {
        'clf': KNeighborsClassifier(),
        'param_grid': {'n_neighbors': [1, 3, 5, 10],
                       'weights': ['uniform', 'distance'],
                       'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                       'p': [1, 2]
                      }
    },
    'GaussianProcessClassifier': {
        'clf': GaussianProcessClassifier(),
        'param_grid': {'kernel': [RBF(), RationalQuadratic(), DotProduct()],
                       'max_iter_predict': [50, 100, 200],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'DecisionTreeClassifier': {
        'clf': DecisionTreeClassifier(),
        'param_grid': {'criterion': ['gini', 'entropy', 'log_loss'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'max_features': [None, 'sqrt', 'log2'],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'RandomForestClassifier': {
        'clf': RandomForestClassifier(),
        'param_grid': {'n_estimators': [50, 100, 200],
                       'criterion': ['gini', 'entropy', 'log_loss'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'max_features': [None, 'sqrt', 'log2'],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'AdaBoostClassifier': {
        'clf': AdaBoostClassifier(),
        'param_grid': {'n_estimators': [25, 50, 100],
                       'learning_rate': [0.5, 1.0, 2.0],
                       'algorithm': ['SAMME', 'SAMME.R'],
                       'random_state': [RANDOM_STATE]
                      }
    },
    'GradientBoostingClassifier': {
        'clf': GradientBoostingClassifier(),
        'param_grid': {'loss': ['log_loss', 'deviance'],
                       'learning_rate': [0.05, 0.1, 0.2],
                       'n_estimators': [25, 50, 100],
                       'criterion': ['friedman_mse', 'squared_error'],
                       'max_depth': [25, 50, 100],
                       'min_samples_leaf': [5, 10, 20],
                       'ccp_alpha': [0.005, 0.015, 0.030],
                       'random_state': [RANDOM_STATE]
                      }
    }
})

IMAGE_FOLDER = './img/'

### Functions

## Load data

We load the data, which are the time series as previously featurized by _cesium_, scaled, and with `NaN` values imputed by a `KNNImputer`.

### Load reliable features list

In [10]:
rel_features = pickle.load(open(REL_FEATURES_IN, 'rb'))
print(rel_features)

['all_times_nhist_numpeaks', 'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin', 'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4', 'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4', 'all_times_nhist_peak_3_to_4', 'all_times_nhist_peak_val', 'avg_double_to_single_step', 'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50', 'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000', 'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000', 'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000', 'cads_avg', 'cads_med', 'cads_std', 'med_double_to_single_step', 'n_epochs', 'std_double_to_single_step', 'total_time', 'percent_beyond_1_std', 'freq1_rel_phase2', 'freq1_rel_phase3', 'freq1_rel_phase4', 'freq2_rel_phase2', 'freq2_rel_phase3', 'freq2_rel_phase4', 'freq3_rel_phase2', 'freq3_rel_phase3', '

###  Load the NEW S4 sample data train set

In [13]:
s4_train = pd.read_csv(S4_TRAIN_SET_IN, sep=',', decimal='.')
s4_train

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,ALT-B_Star-00000,True,8.0,0.1,0.0,0.0,0.00,0,J23505-095,0.205178,...,-0.908970,0.541171,-0.669931,-0.015526,-0.019047,-0.007352,-0.442166,-0.134743,-0.219987,0.356795
1,ALT-B_Star-00001,False,0.0,0.0,0.0,0.0,0.00,0,J23505-095,0.205178,...,-0.725712,0.832929,0.631644,-0.009818,-0.021695,0.552345,-0.429826,0.007825,-0.005592,0.720584
2,ALT-B_Star-00002,True,8.0,0.1,0.0,0.0,0.25,0,J23505-095,0.205178,...,-0.411204,0.857062,0.541050,-0.005791,-0.035445,0.013431,-0.327874,-0.179274,-0.090747,0.234164
3,ALT-B_Star-00003,False,0.0,0.0,0.0,0.0,0.00,0,J23505-095,0.205178,...,-0.022857,1.028396,-0.061844,-0.002265,-0.031985,0.275371,-0.217636,0.168502,0.170924,0.373533
4,ALT-B_Star-00004,True,8.0,0.1,0.0,0.0,0.50,0,J23505-095,0.205178,...,-0.272705,0.962090,-0.223832,-0.008516,-0.043389,-0.593592,0.261842,-0.171281,0.590502,1.231367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37275,ALT-B_Star-37275,False,0.0,0.0,0.0,0.0,0.00,232,J00051+457,0.830097,...,-0.394117,0.405540,1.174777,-0.032514,-0.030659,-0.363602,-0.042368,-0.039153,0.551196,0.088020
37276,ALT-B_Star-37276,True,64.0,1.6,0.0,0.0,0.50,232,J00051+457,0.830097,...,0.288502,0.461546,0.193656,-0.023099,-0.028984,0.259205,-0.075329,-0.205573,-0.414141,0.392452
37277,ALT-B_Star-37277,False,0.0,0.0,0.0,0.0,0.00,232,J00051+457,0.830097,...,-0.876158,0.530909,0.215261,-0.027492,-0.033688,0.460585,-0.247941,-0.138908,-0.166431,0.054261
37278,ALT-B_Star-37278,True,64.0,1.6,0.0,0.0,0.75,232,J00051+457,0.830097,...,-0.487701,0.343003,-1.656181,-0.023002,-0.000361,0.384265,-0.588162,-0.164535,-0.621599,0.013055


In [14]:
print(list(s4_train.columns))

['ID', 'Pulsating', 'frequency', 'amplitudeRV', 'offsetRV', 'refepochRV', 'phase', 'CARMENES_source_idx', 'CARMENES_Ref_star', 'all_times_nhist_numpeaks', 'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin', 'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4', 'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4', 'all_times_nhist_peak_3_to_4', 'all_times_nhist_peak_val', 'avg_double_to_single_step', 'avg_err', 'avgt', 'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50', 'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000', 'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000', 'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000', 'cads_avg', 'cads_med', 'cads_std', 'mean', 'med_double_to_single_step', 'med_err', 'n_epochs', 'std_double_to_single_step', 'std_err', 'total_time', 'amplitude

We need to keep only the reliable features and some metadata, and change the labels to `0` / `1`.

#### Encode target variable (`Pulsating`)

In [19]:
encode = {False: 0, True: 1}
s4_train['Pulsating'] = s4_train['Pulsating'].map(lambda x: encode[x])
s4_train.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,ALT-B_Star-00000,1,8.0,0.1,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.90897,0.541171,-0.669931,-0.015526,-0.019047,-0.007352,-0.442166,-0.134743,-0.219987,0.356795
1,ALT-B_Star-00001,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.725712,0.832929,0.631644,-0.009818,-0.021695,0.552345,-0.429826,0.007825,-0.005592,0.720584
2,ALT-B_Star-00002,1,8.0,0.1,0.0,0.0,0.25,0,J23505-095,0.205178,...,-0.411204,0.857062,0.54105,-0.005791,-0.035445,0.013431,-0.327874,-0.179274,-0.090747,0.234164
3,ALT-B_Star-00003,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.022857,1.028396,-0.061844,-0.002265,-0.031985,0.275371,-0.217636,0.168502,0.170924,0.373533
4,ALT-B_Star-00004,1,8.0,0.1,0.0,0.0,0.5,0,J23505-095,0.205178,...,-0.272705,0.96209,-0.223832,-0.008516,-0.043389,-0.593592,0.261842,-0.171281,0.590502,1.231367


#### Filter the relevant columns only

In [20]:
s4_train_rel = s4_train[S4_ADD_COLUMNS + rel_features].copy()
s4_train_rel.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,ALT-B_Star-00000,1,8.0,0.1,0.0,0.0,0.0,0,J23505-095,0.205178,...,1.614069,0.998428,1.466916,0.338484,-1.510543,0.994977,1.131783,-0.861327,0.11011,1.177233
1,ALT-B_Star-00001,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.119644,-0.478883,-1.275757,0.148617,1.492525,-0.143042,-1.449993,-1.171944,-0.620326,-1.178087
2,ALT-B_Star-00002,1,8.0,0.1,0.0,0.0,0.25,0,J23505-095,0.205178,...,1.516926,1.179306,-0.545249,0.435829,1.086582,-1.598621,1.620377,-0.295209,0.90286,1.305973
3,ALT-B_Star-00003,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.634983,1.524229,0.153258,0.416966,1.008979,-0.472625,-1.520984,-0.482138,-1.09066,0.010829
4,ALT-B_Star-00004,1,8.0,0.1,0.0,0.0,0.5,0,J23505-095,0.205178,...,0.802641,0.492376,1.101914,-1.011576,-0.144773,-0.739138,1.442992,-1.507838,-0.345497,0.014787


In [25]:
s4_train_rel[['Pulsating', 'ID']].groupby('Pulsating').count()

Unnamed: 0_level_0,ID
Pulsating,Unnamed: 1_level_1
0,18640
1,18640


###  Load the NEW S4 sample data validation set

In [26]:
s4_val = pd.read_csv(S4_VALIDATION_SET_IN, sep=',', decimal='.')
s4_val

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,B_Star-00000,False,0.00,0.00,0.0,2.457432e+06,0.00,116,J11511+352,0.517637,...,-0.134573,1.418486,0.042013,-0.009545,-0.025562,0.142376,-0.263196,0.251708,-0.038958,1.237807
1,B_Star-00001,False,0.00,0.00,0.0,2.457487e+06,0.00,29,J20336+617,-0.419742,...,0.125231,0.579019,1.116863,0.000807,-0.002270,-0.173628,0.037728,-0.114574,0.549077,-0.031212
2,B_Star-00002,False,0.00,0.00,0.0,2.457417e+06,0.00,156,J08402+314,0.517637,...,1.350668,-0.929902,-0.047202,0.016980,0.060763,0.030306,1.365530,0.577156,0.547318,-0.942817
3,B_Star-00003,False,0.00,0.00,0.0,2.457431e+06,0.00,180,J05421+124,0.830097,...,-0.058875,0.312563,0.586087,-0.005898,-0.013044,-0.553518,0.269385,-0.201648,0.506757,0.267605
4,B_Star-00004,False,0.00,0.00,0.0,2.461026e+06,0.00,67,J17052-050,0.830097,...,-0.309842,0.665012,-0.000102,-0.008592,-0.002190,-0.588534,0.493010,-0.205573,0.753534,0.369862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3988,B_Star-03995,False,0.00,0.00,0.0,2.459911e+06,0.00,53,J18165+048,-0.732202,...,-0.754672,0.189341,0.511304,0.017560,-0.040668,1.209315,0.189618,-0.023871,-1.992779,0.480138
3989,B_Star-03996,False,0.00,0.00,0.0,2.457428e+06,0.00,8,J23216+172,-1.669582,...,0.936833,1.083443,-0.583714,-0.005475,-0.009577,0.188601,-0.225531,-0.068249,-0.300657,0.228148
3990,B_Star-03997,False,0.00,0.00,0.0,2.458409e+06,0.00,3,J23419+441,0.205178,...,0.416565,1.183910,-0.122027,-0.004456,-0.023344,-0.211972,-0.221623,-0.018265,0.051901,1.076491
3991,B_Star-03998,False,0.00,0.00,0.0,2.457468e+06,0.00,181,J05415+534,0.205178,...,0.272003,1.404428,-0.099327,-0.007536,-0.028463,-0.020467,-0.110814,-0.065574,0.194104,0.845764


We do need to do the transformations.

#### Encode target variable (`Pulsating`)

We encode the target variable as `True` / `False` = `0` / `1`.

In [27]:
s4_val['Pulsating'] = s4_val['Pulsating'].map(lambda x: encode[x])
s4_val.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,B_Star-00000,0,0.0,0.0,0.0,2457432.0,0.0,116,J11511+352,0.517637,...,-0.134573,1.418486,0.042013,-0.009545,-0.025562,0.142376,-0.263196,0.251708,-0.038958,1.237807
1,B_Star-00001,0,0.0,0.0,0.0,2457487.0,0.0,29,J20336+617,-0.419742,...,0.125231,0.579019,1.116863,0.000807,-0.00227,-0.173628,0.037728,-0.114574,0.549077,-0.031212
2,B_Star-00002,0,0.0,0.0,0.0,2457417.0,0.0,156,J08402+314,0.517637,...,1.350668,-0.929902,-0.047202,0.01698,0.060763,0.030306,1.36553,0.577156,0.547318,-0.942817
3,B_Star-00003,0,0.0,0.0,0.0,2457431.0,0.0,180,J05421+124,0.830097,...,-0.058875,0.312563,0.586087,-0.005898,-0.013044,-0.553518,0.269385,-0.201648,0.506757,0.267605
4,B_Star-00004,0,0.0,0.0,0.0,2461026.0,0.0,67,J17052-050,0.830097,...,-0.309842,0.665012,-0.000102,-0.008592,-0.00219,-0.588534,0.49301,-0.205573,0.753534,0.369862


#### Filter the relevant columns only

We now filter only by the reliable relevant columns plus the `Pulsating` column.

In [28]:
s4_val_rel = s4_val[S4_ADD_COLUMNS + rel_features].copy()
s4_val_rel

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,B_Star-00000,0,0.00,0.00,0.0,2.457432e+06,0.00,116,J11511+352,0.517637,...,0.052795,1.583234,0.932739,0.450136,0.507614,-1.306587,-1.667521,-0.920210,-0.783439,-1.401898
1,B_Star-00001,0,0.00,0.00,0.0,2.457487e+06,0.00,29,J20336+617,-0.419742,...,1.163064,-1.210955,-0.115978,-0.472571,-1.020306,-1.701440,-0.559011,-1.284851,1.122835,0.245348
2,B_Star-00002,0,0.00,0.00,0.0,2.457417e+06,0.00,156,J08402+314,0.517637,...,-0.012660,-0.053798,1.376012,0.424505,-0.725802,-0.057991,1.221787,0.318082,0.308407,-1.671493
3,B_Star-00003,0,0.00,0.00,0.0,2.457431e+06,0.00,180,J05421+124,0.830097,...,-0.805112,0.997168,-0.526090,1.599326,-0.195380,-0.021941,1.534360,0.990544,1.292806,-0.179514
4,B_Star-00004,0,0.00,0.00,0.0,2.461026e+06,0.00,67,J17052-050,0.830097,...,-0.132562,-1.224669,-0.258461,-0.535823,0.080260,0.925326,1.641694,0.658672,0.750566,-1.178309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3988,B_Star-03995,0,0.00,0.00,0.0,2.459911e+06,0.00,53,J18165+048,-0.732202,...,-1.017774,-1.561081,-0.962640,0.998539,0.265269,-1.448659,1.254501,-1.205162,1.687979,0.461238
3989,B_Star-03996,0,0.00,0.00,0.0,2.457428e+06,0.00,8,J23216+172,-1.669582,...,-1.447144,1.234007,0.240059,1.353353,-0.292464,-1.594149,-0.514765,-0.265768,-0.908883,1.236731
3990,B_Star-03997,0,0.00,0.00,0.0,2.458409e+06,0.00,3,J23419+441,0.205178,...,-0.952528,-1.318117,0.205736,-0.124168,-1.438182,0.429789,1.284367,-1.263381,0.082818,0.330041
3991,B_Star-03998,0,0.00,0.00,0.0,2.457468e+06,0.00,181,J05415+534,0.205178,...,-0.691898,-0.129730,-0.076035,1.534325,1.077539,0.339340,-0.390660,0.649633,-1.455163,-0.301700


In [29]:
print(list(s4_val_rel.columns))

['ID', 'Pulsating', 'frequency', 'amplitudeRV', 'offsetRV', 'refepochRV', 'phase', 'CARMENES_source_idx', 'CARMENES_Ref_star', 'all_times_nhist_numpeaks', 'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin', 'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4', 'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4', 'all_times_nhist_peak_3_to_4', 'all_times_nhist_peak_val', 'avg_double_to_single_step', 'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50', 'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000', 'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000', 'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000', 'cads_avg', 'cads_med', 'cads_std', 'med_double_to_single_step', 'n_epochs', 'std_double_to_single_step', 'total_time', 'percent_beyond_1_std', 'freq1_rel_phase2', 'freq1_rel_phas

In [31]:
s4_val_rel[['Pulsating', 'ID']].groupby('Pulsating').count()

Unnamed: 0_level_0,ID
Pulsating,Unnamed: 1_level_1
0,3602
1,391


## Pre-select the training data

### Select only pulsating with large amplitudes

In [35]:
s4_p_sel = s4_train_rel[s4_train_rel['amplitudeRV'] >= AMP_LIMIT]
s4_p_sel

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
32,ALT-B_Star-00032,1,8.0,1.6,0.0,0.0,0.00,0,J23505-095,0.205178,...,-1.074482,-1.431737,0.259974,-0.952483,-1.224305,-1.723113,0.252498,-1.325087,1.601744,0.361527
34,ALT-B_Star-00034,1,8.0,1.6,0.0,0.0,0.25,0,J23505-095,0.205178,...,-1.681624,1.344105,0.958364,-0.112659,-1.366790,-1.534866,1.092114,0.140819,-1.018173,1.090087
36,ALT-B_Star-00036,1,8.0,1.6,0.0,0.0,0.50,0,J23505-095,0.205178,...,0.114506,-1.238678,-1.367694,-1.332358,-1.550098,-0.319512,1.205561,1.188305,0.945860,-1.164275
38,ALT-B_Star-00038,1,8.0,1.6,0.0,0.0,0.75,0,J23505-095,0.205178,...,1.699257,0.744009,0.686096,1.407319,1.153091,0.356956,1.481383,0.479158,1.572947,2.637808
72,ALT-B_Star-00072,1,16.0,1.6,0.0,0.0,0.00,0,J23505-095,0.205178,...,-0.368981,1.543165,-0.004324,-1.422922,-0.567557,-1.384126,-0.987949,-0.715888,1.076111,-0.413494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37238,ALT-B_Star-37238,1,32.0,1.6,0.0,0.0,0.75,232,J00051+457,0.830097,...,0.775511,0.784189,-0.066339,0.458638,0.624362,-0.065369,0.390614,-1.325653,-0.269108,0.275244
37272,ALT-B_Star-37272,1,64.0,1.6,0.0,0.0,0.00,232,J00051+457,0.830097,...,-1.248050,0.182940,1.646465,1.436225,1.057840,1.479032,1.402431,1.040431,-0.831621,0.100595
37274,ALT-B_Star-37274,1,64.0,1.6,0.0,0.0,0.25,232,J00051+457,0.830097,...,1.435098,0.357766,-0.781517,-0.477600,0.410372,0.027069,0.833529,-0.790351,0.458927,0.385304
37276,ALT-B_Star-37276,1,64.0,1.6,0.0,0.0,0.50,232,J00051+457,0.830097,...,-0.858004,-1.078940,-1.023399,1.505857,1.606995,-0.998876,-0.347842,0.901605,-1.716635,0.223086


In [39]:
num_pulsating = len(s4_p_sel)
num_pulsating

3728

### Select the same number of non pulsating, stratifying by `CARMENES_Ref_star`

In [40]:
s4_np_sel = s4_train_rel[s4_train_rel['Pulsating'] == False]
s4_np_sel.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
1,ALT-B_Star-00001,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.119644,-0.478883,-1.275757,0.148617,1.492525,-0.143042,-1.449993,-1.171944,-0.620326,-1.178087
3,ALT-B_Star-00003,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.634983,1.524229,0.153258,0.416966,1.008979,-0.472625,-1.520984,-0.482138,-1.09066,0.010829
5,ALT-B_Star-00005,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,1.142936,-0.406398,-0.057561,1.473037,-0.96413,0.648689,-1.014587,0.245582,-1.515027,0.995216
7,ALT-B_Star-00007,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-0.776647,-0.039096,-0.178198,-0.057564,-1.063286,-1.443676,0.751892,0.048522,-0.23529,0.000404
9,ALT-B_Star-00009,0,0.0,0.0,0.0,0.0,0.0,0,J23505-095,0.205178,...,-1.242777,-0.408936,-0.146931,-0.457715,1.526687,-0.473615,-0.622539,-1.14002,-0.974352,1.875795


In [41]:
len(s4_np_sel)

18640

In [42]:
_, X_test, _, _ = train_test_split(s4_np_sel, s4_np_sel['Pulsating'], test_size=num_pulsating,
                                   random_state=RANDOM_STATE, stratify=s4_np_sel['CARMENES_source_idx'])
X_test

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
22991,ALT-B_Star-22991,0,0.0,0.0,0.0,0.0,0.0,143,J09511-123,-0.419742,...,1.719605,-1.571779,-0.155136,-0.296326,0.701626,-1.480015,-0.840085,1.344919,-1.180000,1.047530
17793,ALT-B_Star-17793,0,0.0,0.0,0.0,0.0,0.0,111,J12248-182,0.205178,...,0.328811,0.669824,-0.371930,1.137747,1.053213,-0.500143,0.700219,1.282748,-1.342304,-0.747275
19865,ALT-B_Star-19865,0,0.0,0.0,0.0,0.0,0.0,124,J11126+189,0.517637,...,-0.460720,-0.492794,1.195018,0.685614,-1.074879,-0.453189,1.232566,-0.574009,-0.157956,-0.825766
19609,ALT-B_Star-19609,0,0.0,0.0,0.0,0.0,0.0,122,J11302+076,1.142557,...,0.747716,0.932376,-1.139456,-1.146280,0.924987,1.146088,-0.130169,1.048578,-0.763408,-0.163366
3003,ALT-B_Star-03003,0,0.0,0.0,0.0,0.0,0.0,18,J21466-001,-0.107282,...,-1.717073,-0.137808,1.397578,0.406749,1.282558,-0.655863,0.268309,-0.612159,1.606985,0.551598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18685,ALT-B_Star-18685,0,0.0,0.0,0.0,0.0,0.0,116,J11511+352,0.517637,...,-1.402285,0.104469,-1.655831,1.474236,-0.956572,0.980998,1.271520,1.410496,-1.195108,1.151948
23203,ALT-B_Star-23203,0,0.0,0.0,0.0,0.0,0.0,145,J09447-182,-1.044662,...,-1.649089,-0.336574,-0.763547,0.115010,1.057371,1.352379,-1.077540,0.929470,-0.458194,0.313249
25545,ALT-B_Star-25545,0,0.0,0.0,0.0,0.0,0.0,159,J08293+039,-0.107282,...,1.476535,-1.283028,0.292253,-1.208234,1.573168,-0.252986,-1.326559,1.384645,0.640677,0.716681
4999,ALT-B_Star-04999,0,0.0,0.0,0.0,0.0,0.0,31,J20260+585,-0.107282,...,0.967112,1.329113,0.558478,-0.818445,-0.991235,0.586118,-0.403554,1.503508,1.690160,0.480188


### Join the two parts

In [44]:
s4_tr_sel = pd.concat([s4_p_sel, X_test], axis=0, ignore_index=True)
s4_tr_sel

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,all_times_nhist_numpeaks,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,ALT-B_Star-00032,1,8.0,1.6,0.0,0.0,0.00,0,J23505-095,0.205178,...,-1.074482,-1.431737,0.259974,-0.952483,-1.224305,-1.723113,0.252498,-1.325087,1.601744,0.361527
1,ALT-B_Star-00034,1,8.0,1.6,0.0,0.0,0.25,0,J23505-095,0.205178,...,-1.681624,1.344105,0.958364,-0.112659,-1.366790,-1.534866,1.092114,0.140819,-1.018173,1.090087
2,ALT-B_Star-00036,1,8.0,1.6,0.0,0.0,0.50,0,J23505-095,0.205178,...,0.114506,-1.238678,-1.367694,-1.332358,-1.550098,-0.319512,1.205561,1.188305,0.945860,-1.164275
3,ALT-B_Star-00038,1,8.0,1.6,0.0,0.0,0.75,0,J23505-095,0.205178,...,1.699257,0.744009,0.686096,1.407319,1.153091,0.356956,1.481383,0.479158,1.572947,2.637808
4,ALT-B_Star-00072,1,16.0,1.6,0.0,0.0,0.00,0,J23505-095,0.205178,...,-0.368981,1.543165,-0.004324,-1.422922,-0.567557,-1.384126,-0.987949,-0.715888,1.076111,-0.413494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7451,ALT-B_Star-18685,0,0.0,0.0,0.0,0.0,0.00,116,J11511+352,0.517637,...,-1.402285,0.104469,-1.655831,1.474236,-0.956572,0.980998,1.271520,1.410496,-1.195108,1.151948
7452,ALT-B_Star-23203,0,0.0,0.0,0.0,0.0,0.00,145,J09447-182,-1.044662,...,-1.649089,-0.336574,-0.763547,0.115010,1.057371,1.352379,-1.077540,0.929470,-0.458194,0.313249
7453,ALT-B_Star-25545,0,0.0,0.0,0.0,0.0,0.00,159,J08293+039,-0.107282,...,1.476535,-1.283028,0.292253,-1.208234,1.573168,-0.252986,-1.326559,1.384645,0.640677,0.716681
7454,ALT-B_Star-04999,0,0.0,0.0,0.0,0.0,0.00,31,J20260+585,-0.107282,...,0.967112,1.329113,0.558478,-0.818445,-0.991235,0.586118,-0.403554,1.503508,1.690160,0.480188


In [45]:
s4_tr_sel[['ID', 'Pulsating']].groupby(by='Pulsating').count()

Unnamed: 0_level_0,ID
Pulsating,Unnamed: 1_level_1
0,3728
1,3728


## Train all the _off-the-shelf_ classifiers

### Train benchmark classifiers

We train the classifiers just off-the shelf, with all the parameters set to default.

In [46]:
warnings.simplefilter('ignore')
fitted_classifiers_benchmark = OrderedDict()
for classifier in list(OFF_THE_SHELF_CLASSIFIERS.keys()):
    print("Fitting off-the-shelf classifier %s..." %classifier)
    clf = copy.deepcopy(OFF_THE_SHELF_CLASSIFIERS[classifier]['clf'])
    #param_grid = OFF_THE_SHELF_CLASSIFIERS[classifier]['param_grid']
    # Optimize with training data:
    cv = GridSearchCV(clf, param_grid={}, scoring='precision', cv=3, refit=True)
    start_time = time()
    cv.fit(s4_tr_sel[rel_features], s4_tr_sel['Pulsating'])
    end_time = time()
    elapsed_time = end_time - start_time
    print("... completed. Elapsed time: %.3f seconds" %elapsed_time)
    # Add the best fitted classifier to the dictionary:
    fitted_classifiers_benchmark[classifier] = OrderedDict({
        'Fitted_clf': copy.deepcopy(cv.best_estimator_),
        'OptTrain_time': elapsed_time
    })

Fitting off-the-shelf classifier Perceptron...
... completed. Elapsed time: 0.069 seconds
Fitting off-the-shelf classifier LogisticRegression...
... completed. Elapsed time: 0.157 seconds
Fitting off-the-shelf classifier PassiveAggressiveClassifier...
... completed. Elapsed time: 0.062 seconds
Fitting off-the-shelf classifier SVC...
... completed. Elapsed time: 9.122 seconds
Fitting off-the-shelf classifier KNeighborsClassifier...
... completed. Elapsed time: 0.390 seconds
Fitting off-the-shelf classifier GaussianProcessClassifier...
... completed. Elapsed time: 61.632 seconds
Fitting off-the-shelf classifier DecisionTreeClassifier...
... completed. Elapsed time: 0.599 seconds
Fitting off-the-shelf classifier RandomForestClassifier...
... completed. Elapsed time: 8.067 seconds
Fitting off-the-shelf classifier AdaBoostClassifier...
... completed. Elapsed time: 3.050 seconds
Fitting off-the-shelf classifier GradientBoostingClassifier...
... completed. Elapsed time: 14.881 seconds


In [47]:
fitted_classifiers_benchmark

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf', Perceptron()),
                           ('OptTrain_time', 0.06880950927734375)])),
             ('LogisticRegression',
              OrderedDict([('Fitted_clf', LogisticRegression()),
                           ('OptTrain_time', 0.15658187866210938)])),
             ('PassiveAggressiveClassifier',
              OrderedDict([('Fitted_clf', PassiveAggressiveClassifier()),
                           ('OptTrain_time', 0.061835289001464844)])),
             ('SVC',
              OrderedDict([('Fitted_clf', SVC()),
                           ('OptTrain_time', 9.122247457504272)])),
             ('KNeighborsClassifier',
              OrderedDict([('Fitted_clf', KNeighborsClassifier()),
                           ('OptTrain_time', 0.3899233341217041)])),
             ('GaussianProcessClassifier',
              OrderedDict([('Fitted_clf', GaussianProcessClassifier()),
                           ('OptTrain_time', 61.63198542

We now validate the performance of each classifier, both on the training set and on the validation set.

We will calculate and record the following metrics for each of the optimized and fitted classifier:
- Accuracy
- Precision
- Recall
- F1_score
- LogLoss
- Matthews correlation coefficient

### Performance measurements

In [48]:
fitted_classifiers_benchmark

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf', Perceptron()),
                           ('OptTrain_time', 0.06880950927734375)])),
             ('LogisticRegression',
              OrderedDict([('Fitted_clf', LogisticRegression()),
                           ('OptTrain_time', 0.15658187866210938)])),
             ('PassiveAggressiveClassifier',
              OrderedDict([('Fitted_clf', PassiveAggressiveClassifier()),
                           ('OptTrain_time', 0.061835289001464844)])),
             ('SVC',
              OrderedDict([('Fitted_clf', SVC()),
                           ('OptTrain_time', 9.122247457504272)])),
             ('KNeighborsClassifier',
              OrderedDict([('Fitted_clf', KNeighborsClassifier()),
                           ('OptTrain_time', 0.3899233341217041)])),
             ('GaussianProcessClassifier',
              OrderedDict([('Fitted_clf', GaussianProcessClassifier()),
                           ('OptTrain_time', 61.63198542

Let's see the classification reports for all the classifiers:

#### Confusion matrices

In [49]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_benchmark.keys()):
    print("\n\nPrinting confusion matrices for classifier %s..." %classifier)
    print("Training set:")
    y_train_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    print(confusion_matrix(y_true=y_train_true, y_pred=y_train_pred))
    # Training set:
    print("Validation set:")
    y_val_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    print(confusion_matrix(y_true=y_val_true, y_pred=y_val_pred))



Printing confusion matrices for classifier Perceptron...
Training set:
[[1600 2128]
 [1560 2168]]
Validation set:
[[1557 2045]
 [ 166  225]]


Printing confusion matrices for classifier LogisticRegression...
Training set:
[[2017 1711]
 [1793 1935]]
Validation set:
[[1863 1739]
 [ 198  193]]


Printing confusion matrices for classifier PassiveAggressiveClassifier...
Training set:
[[1643 2085]
 [1613 2115]]
Validation set:
[[1548 2054]
 [ 180  211]]


Printing confusion matrices for classifier SVC...
Training set:
[[2314 1414]
 [1391 2337]]
Validation set:
[[1848 1754]
 [ 195  196]]


Printing confusion matrices for classifier KNeighborsClassifier...
Training set:
[[2518 1210]
 [1221 2507]]
Validation set:
[[1808 1794]
 [ 205  186]]


Printing confusion matrices for classifier GaussianProcessClassifier...
Training set:
[[3727    1]
 [   0 3728]]
Validation set:
[[1847 1755]
 [ 199  192]]


Printing confusion matrices for classifier DecisionTreeClassifier...
Training set:
[[3728    0]
 

**OBSERVATION:** The main problem here is that most classifiers overfit to the training set. However, it seems that the behavipour over the training set is now better (i.e. the classifiers do not predict a single label).

#### Classification reports

In [50]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_benchmark.keys()):
    print("Printing classification reports for classifier %s..." %classifier)
    print("\tTraining set:")
    y_train_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    print(classification_report(y_true=y_train_true, y_pred=y_train_pred))
    # Training set:
    print("\tValidation set:")
    y_val_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    print(classification_report(y_true=y_val_true, y_pred=y_val_pred))


Printing classification reports for classifier Perceptron...
	Training set:
              precision    recall  f1-score   support

           0       0.51      0.43      0.46      3728
           1       0.50      0.58      0.54      3728

    accuracy                           0.51      7456
   macro avg       0.51      0.51      0.50      7456
weighted avg       0.51      0.51      0.50      7456

	Validation set:
              precision    recall  f1-score   support

           0       0.90      0.43      0.58      3602
           1       0.10      0.58      0.17       391

    accuracy                           0.45      3993
   macro avg       0.50      0.50      0.38      3993
weighted avg       0.82      0.45      0.54      3993

Printing classification reports for classifier LogisticRegression...
	Training set:
              precision    recall  f1-score   support

           0       0.53      0.54      0.54      3728
           1       0.53      0.52      0.52      3728

    a

We now validate the performance of each classifier, both on the training set and on the validation set.

We will calculate and record the following metrics for each of the optimized and fitted classifier:
- Accuracy
- Precision
- Recall
- F1_score
- LogLoss
- Matthews correlation coefficient

#### Main metrics

In [51]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_benchmark.keys()):
    print("Calculating predictions and performance for classifier %s..." %classifier)
    # Training set:
    y_train_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    fitted_classifiers_benchmark[classifier]['Training metrics'] = OrderedDict({})
    fitted_classifiers_benchmark[classifier]['Training metrics']['accuracy'] = \
        accuracy_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_benchmark[classifier]['Training metrics']['precision'] = \
        precision_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_benchmark[classifier]['Training metrics']['recall'] = \
        recall_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_benchmark[classifier]['Training metrics']['F1'] = \
        f1_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_benchmark[classifier]['Training metrics']['log_loss'] = \
        log_loss(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_benchmark[classifier]['Training metrics']['MCC'] = \
        matthews_corrcoef(y_true=y_train_true, y_pred=y_train_pred)
    # Validation set:
    y_val_pred = fitted_classifiers_benchmark[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    fitted_classifiers_benchmark[classifier]['Validation metrics'] = OrderedDict({})
    fitted_classifiers_benchmark[classifier]['Validation metrics']['accuracy'] = \
        accuracy_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_benchmark[classifier]['Validation metrics']['precision'] = \
        precision_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_benchmark[classifier]['Validation metrics']['recall'] = \
        recall_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_benchmark[classifier]['Validation metrics']['F1'] = \
        f1_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_benchmark[classifier]['Validation metrics']['log_loss'] = \
        log_loss(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_benchmark[classifier]['Validation metrics']['MCC'] = \
        matthews_corrcoef(y_true=y_val_true, y_pred=y_val_pred)


Calculating predictions and performance for classifier Perceptron...
Calculating predictions and performance for classifier LogisticRegression...
Calculating predictions and performance for classifier PassiveAggressiveClassifier...
Calculating predictions and performance for classifier SVC...
Calculating predictions and performance for classifier KNeighborsClassifier...
Calculating predictions and performance for classifier GaussianProcessClassifier...
Calculating predictions and performance for classifier DecisionTreeClassifier...
Calculating predictions and performance for classifier RandomForestClassifier...
Calculating predictions and performance for classifier AdaBoostClassifier...
Calculating predictions and performance for classifier GradientBoostingClassifier...


In [52]:
fitted_classifiers_benchmark

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf', Perceptron()),
                           ('OptTrain_time', 0.06880950927734375),
                           ('Training metrics',
                            OrderedDict([('accuracy', 0.5053648068669528),
                                         ('precision', 0.5046554934823091),
                                         ('recall', 0.5815450643776824),
                                         ('F1', 0.5403788634097707),
                                         ('log_loss', 17.82845945534657),
                                         ('MCC', 0.010856361936157001)])),
                           ('Validation metrics',
                            OrderedDict([('accuracy', 0.4462809917355372),
                                         ('precision', 0.09911894273127753),
                                         ('recall', 0.5754475703324808),
                                         ('F1', 0.1691093573844419),
              

**OBSERVATION:** <font color='red'>**VERY BAD RESULTS**</font><font color='blue'>**, BUT THE IMBALANCE PROBLEM HAS CLEARLY DISSAPEARED NOW**</font>

We have still the problem of overfitting.

#### Focus on `precision`

We now set the focus on `precision`, as it is the metric we are more interested in.

In [53]:
precision_results = pd.DataFrame(index=fitted_classifiers_benchmark.keys())
precision_results

Perceptron
LogisticRegression
PassiveAggressiveClassifier
SVC
KNeighborsClassifier
GaussianProcessClassifier
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier


In [54]:
precision_results['BM_tr_precision'] = np.nan
precision_results['BM_val_precision'] = np.nan
precision_results

Unnamed: 0,BM_tr_precision,BM_val_precision
Perceptron,,
LogisticRegression,,
PassiveAggressiveClassifier,,
SVC,,
KNeighborsClassifier,,
GaussianProcessClassifier,,
DecisionTreeClassifier,,
RandomForestClassifier,,
AdaBoostClassifier,,
GradientBoostingClassifier,,


In [55]:
for clf in fitted_classifiers_benchmark.keys():
    precision_results.loc[clf, 'BM_tr_precision'] = \
        fitted_classifiers_benchmark[clf]['Training metrics']['precision']
    precision_results.loc[clf, 'BM_val_precision'] = \
        fitted_classifiers_benchmark[clf]['Validation metrics']['precision']
precision_results

Unnamed: 0,BM_tr_precision,BM_val_precision
Perceptron,0.504655,0.099119
LogisticRegression,0.530719,0.099896
PassiveAggressiveClassifier,0.503571,0.093157
SVC,0.623034,0.100513
KNeighborsClassifier,0.674469,0.093939
GaussianProcessClassifier,0.999732,0.098613
DecisionTreeClassifier,1.0,0.098846
RandomForestClassifier,0.999732,0.094381
AdaBoostClassifier,0.572537,0.096806
GradientBoostingClassifier,0.675528,0.096979


In [56]:
print("TRAINING / VALIDATION PRECISION RESULTS, OFF-THE-SHELF CLASSIFIERS")
for idx in (precision_results.index):
    print("%s: %.2f / %.2f" %(idx,
                              precision_results.loc[idx, 'BM_tr_precision'],
                              precision_results.loc[idx, 'BM_val_precision']))

TRAINING / VALIDATION PRECISION RESULTS, OFF-THE-SHELF CLASSIFIERS
Perceptron: 0.50 / 0.10
LogisticRegression: 0.53 / 0.10
PassiveAggressiveClassifier: 0.50 / 0.09
SVC: 0.62 / 0.10
KNeighborsClassifier: 0.67 / 0.09
GaussianProcessClassifier: 1.00 / 0.10
DecisionTreeClassifier: 1.00 / 0.10
RandomForestClassifier: 1.00 / 0.09
AdaBoostClassifier: 0.57 / 0.10
GradientBoostingClassifier: 0.68 / 0.10


<font color=red>**IT WORKS WORST THAN BEFORE WITH 1.2 m/?**</font>

**OBSERVATION:** so, it is clear that all the algorithms are performing pretty well for the training set, but:

- All of them show heavy overfitting, and even some of them in an extreme way. Notice that, compared wikth the training with imbalanced data, the classifiers less affected for that are the Tree classifiers (DecisionTree and RandomForest).


### Save the models

In [57]:
pickle.dump(fitted_classifiers_benchmark, open(MODELS_FOLDER + CASE_ID + "_1NN_fitted_clf_ots.pickle", 'wb'))

## Optimize all the _off-the-shelf_ classifiers

We now try to do a very simple and quick optimization of those off-the shelf classifiers, to see if the situation improves for some of them.

We choose the grid values around the default values, when possible.

In [58]:
list(OFF_THE_SHELF_CLASSIFIERS.keys())

['Perceptron',
 'LogisticRegression',
 'PassiveAggressiveClassifier',
 'SVC',
 'KNeighborsClassifier',
 'GaussianProcessClassifier',
 'DecisionTreeClassifier',
 'RandomForestClassifier',
 'AdaBoostClassifier',
 'GradientBoostingClassifier']

In [59]:
OFF_THE_SHELF_CLASSIFIERS

OrderedDict([('Perceptron',
              {'clf': Perceptron(),
               'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                'alpha': [0.001, 0.0001, 1e-05],
                'l1_ratio': [None, 0.075, 0.15, 0.3],
                'max_iter': [500, 1000, 2000],
                'random_state': [11]}}),
             ('LogisticRegression',
              {'clf': LogisticRegression(),
               'param_grid': {'penalty': ['l1', 'l2', 'elasticnet'],
                'C': [0.5, 1.0, 2.0],
                'l1_ratio': [None, 0.075, 0.15, 0.3],
                'solver': ['saga'],
                'max_iter': [50, 100, 200],
                'random_state': [11]}}),
             ('PassiveAggressiveClassifier',
              {'clf': PassiveAggressiveClassifier(),
               'param_grid': {'C': [0.5, 1.0, 2.0],
                'max_iter': [500, 1000, 2000],
                'loss': ['hinge', 'squared_hinge'],
                'random_state': [11]}}),
             ('SVC',
   

### Optimize classifiers

We now try a simple optimization of these off-the shelf classifiers, and targetting the  `precision` metric for minimization.

In [60]:
warnings.simplefilter('ignore')
fitted_classifiers_opt = OrderedDict()
for classifier in list(OFF_THE_SHELF_CLASSIFIERS.keys()):
    print("Optimizing and fitting classifier %s..." %classifier)
    clf = copy.deepcopy(OFF_THE_SHELF_CLASSIFIERS[classifier]['clf'])
    param_grid = OFF_THE_SHELF_CLASSIFIERS[classifier]['param_grid']
    # Optimize with training data:
    cv = GridSearchCV(clf, param_grid=param_grid, scoring='precision', cv=3, refit=True)
    start_time = time()
    cv.fit(s4_tr_sel[rel_features], s4_tr_sel['Pulsating'])
    end_time = time()
    elapsed_time = end_time - start_time
    print("... completed. Elapsed time: %.3f seconds" %elapsed_time)
    # Add the best fitted classifier to the dictionary:
    fitted_classifiers_opt[classifier] = OrderedDict({
        'Fitted_clf': copy.deepcopy(cv.best_estimator_),
        'OptTrain_time': elapsed_time
    })

Optimizing and fitting classifier Perceptron...
... completed. Elapsed time: 10.569 seconds
Optimizing and fitting classifier LogisticRegression...
... completed. Elapsed time: 106.604 seconds
Optimizing and fitting classifier PassiveAggressiveClassifier...
... completed. Elapsed time: 0.890 seconds
Optimizing and fitting classifier SVC...
... completed. Elapsed time: 183.403 seconds
Optimizing and fitting classifier KNeighborsClassifier...
... completed. Elapsed time: 69.803 seconds
Optimizing and fitting classifier GaussianProcessClassifier...
... completed. Elapsed time: 4638.038 seconds
Optimizing and fitting classifier DecisionTreeClassifier...
... completed. Elapsed time: 43.230 seconds
Optimizing and fitting classifier RandomForestClassifier...


KeyboardInterrupt: 

In [32]:
fitted_classifiers_opt

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf',
                            Perceptron(alpha=0.001, l1_ratio=0.075, max_iter=500, penalty='elasticnet',
                                       random_state=11)),
                           ('OptTrain_time', 2.106365442276001)])),
             ('LogisticRegression',
              OrderedDict([('Fitted_clf',
                            LogisticRegression(C=0.5, l1_ratio=0.15, max_iter=50, penalty='elasticnet',
                                               random_state=11, solver='saga')),
                           ('OptTrain_time', 20.48395013809204)])),
             ('PassiveAggressiveClassifier',
              OrderedDict([('Fitted_clf',
                            PassiveAggressiveClassifier(C=2.0, loss='squared_hinge', max_iter=500,
                                                        random_state=11)),
                           ('OptTrain_time', 0.3291199207305908)])),
             ('SVC',
              Orde

We now validate the performance of each classifier, both on the training set and on the validation set.

We will calculate and record the following metrics for each of the optimized and fitted classifier:
- Accuracy
- Precision
- Recall
- F1_score
- LogLoss
- Matthews correlation coefficient

### Performance measurements

In [33]:
fitted_classifiers_opt

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf',
                            Perceptron(alpha=0.001, l1_ratio=0.075, max_iter=500, penalty='elasticnet',
                                       random_state=11)),
                           ('OptTrain_time', 2.106365442276001)])),
             ('LogisticRegression',
              OrderedDict([('Fitted_clf',
                            LogisticRegression(C=0.5, l1_ratio=0.15, max_iter=50, penalty='elasticnet',
                                               random_state=11, solver='saga')),
                           ('OptTrain_time', 20.48395013809204)])),
             ('PassiveAggressiveClassifier',
              OrderedDict([('Fitted_clf',
                            PassiveAggressiveClassifier(C=2.0, loss='squared_hinge', max_iter=500,
                                                        random_state=11)),
                           ('OptTrain_time', 0.3291199207305908)])),
             ('SVC',
              Orde

Let's see the classification reports for all the classifiers:

#### Confusion matrices

In [34]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_opt.keys()):
    print("\n\nPrinting confusion matrices for classifier %s..." %classifier)
    print("Training set:")
    y_train_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    print(confusion_matrix(y_true=y_train_true, y_pred=y_train_pred))
    # Training set:
    print("Validation set:")
    y_val_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    print(confusion_matrix(y_true=y_val_true, y_pred=y_val_pred))



Printing confusion matrices for classifier Perceptron...
Training set:
[[279 393]
 [141 531]]
Validation set:
[[ 76 148]
 [  7  19]]


Printing confusion matrices for classifier LogisticRegression...
Training set:
[[432 240]
 [214 458]]
Validation set:
[[122 102]
 [ 15  11]]


Printing confusion matrices for classifier PassiveAggressiveClassifier...
Training set:
[[394 278]
 [302 370]]
Validation set:
[[119 105]
 [ 10  16]]


Printing confusion matrices for classifier SVC...
Training set:
[[643  29]
 [  0 672]]
Validation set:
[[194  30]
 [ 20   6]]


Printing confusion matrices for classifier KNeighborsClassifier...
Training set:
[[672   0]
 [  0 672]]
Validation set:
[[194  30]
 [ 22   4]]


Printing confusion matrices for classifier GaussianProcessClassifier...
Training set:
[[654  18]
 [  0 672]]
Validation set:
[[203  21]
 [ 20   6]]


Printing confusion matrices for classifier DecisionTreeClassifier...
Training set:
[[631  41]
 [ 89 583]]
Validation set:
[[180  44]
 [ 20   6]]


**OBSERVATION:** Again, the main problem here is that most classifiers overfit to the training set.

#### Classification reports

In [35]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_opt.keys()):
    print("Printing classification reports for classifier %s..." %classifier)
    print("\tTraining set:")
    y_train_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    print(classification_report(y_true=y_train_true, y_pred=y_train_pred))
    # Training set:
    print("\tValidation set:")
    y_val_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    print(classification_report(y_true=y_val_true, y_pred=y_val_pred))


Printing classification reports for classifier Perceptron...
	Training set:
              precision    recall  f1-score   support

           0       0.66      0.42      0.51       672
           1       0.57      0.79      0.67       672

    accuracy                           0.60      1344
   macro avg       0.62      0.60      0.59      1344
weighted avg       0.62      0.60      0.59      1344

	Validation set:
              precision    recall  f1-score   support

           0       0.92      0.34      0.50       224
           1       0.11      0.73      0.20        26

    accuracy                           0.38       250
   macro avg       0.51      0.54      0.35       250
weighted avg       0.83      0.38      0.46       250

Printing classification reports for classifier LogisticRegression...
	Training set:
              precision    recall  f1-score   support

           0       0.67      0.64      0.66       672
           1       0.66      0.68      0.67       672

    a

We now validate the performance of each classifier, both on the training set and on the validation set.

We will calculate and record the following metrics for each of the optimized and fitted classifier:
- Accuracy
- Precision
- Recall
- F1_score
- LogLoss
- Matthews correlation coefficient

#### Main metrics

In [36]:
warnings.simplefilter('ignore')
y_train_true = s4_tr_sel['Pulsating']
y_val_true = s4_val_rel['Pulsating']
for classifier in list(fitted_classifiers_opt.keys()):
    print("Calculating predictions and performance for classifier %s..." %classifier)
    # Training set:
    y_train_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_tr_sel[rel_features])
    fitted_classifiers_opt[classifier]['Training metrics'] = OrderedDict({})
    fitted_classifiers_opt[classifier]['Training metrics']['accuracy'] = \
        accuracy_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_opt[classifier]['Training metrics']['precision'] = \
        precision_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_opt[classifier]['Training metrics']['recall'] = \
        recall_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_opt[classifier]['Training metrics']['F1'] = \
        f1_score(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_opt[classifier]['Training metrics']['log_loss'] = \
        log_loss(y_true=y_train_true, y_pred=y_train_pred)
    fitted_classifiers_opt[classifier]['Training metrics']['MCC'] = \
        matthews_corrcoef(y_true=y_train_true, y_pred=y_train_pred)
    # Validation set:
    y_val_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_val_rel[rel_features])
    fitted_classifiers_opt[classifier]['Validation metrics'] = OrderedDict({})
    fitted_classifiers_opt[classifier]['Validation metrics']['accuracy'] = \
        accuracy_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_opt[classifier]['Validation metrics']['precision'] = \
        precision_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_opt[classifier]['Validation metrics']['recall'] = \
        recall_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_opt[classifier]['Validation metrics']['F1'] = \
        f1_score(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_opt[classifier]['Validation metrics']['log_loss'] = \
        log_loss(y_true=y_val_true, y_pred=y_val_pred)
    fitted_classifiers_opt[classifier]['Validation metrics']['MCC'] = \
        matthews_corrcoef(y_true=y_val_true, y_pred=y_val_pred)


Calculating predictions and performance for classifier Perceptron...
Calculating predictions and performance for classifier LogisticRegression...
Calculating predictions and performance for classifier PassiveAggressiveClassifier...
Calculating predictions and performance for classifier SVC...
Calculating predictions and performance for classifier KNeighborsClassifier...
Calculating predictions and performance for classifier GaussianProcessClassifier...
Calculating predictions and performance for classifier DecisionTreeClassifier...
Calculating predictions and performance for classifier RandomForestClassifier...
Calculating predictions and performance for classifier AdaBoostClassifier...
Calculating predictions and performance for classifier GradientBoostingClassifier...


In [37]:
fitted_classifiers_opt

OrderedDict([('Perceptron',
              OrderedDict([('Fitted_clf',
                            Perceptron(alpha=0.001, l1_ratio=0.075, max_iter=500, penalty='elasticnet',
                                       random_state=11)),
                           ('OptTrain_time', 2.106365442276001),
                           ('Training metrics',
                            OrderedDict([('accuracy', 0.6026785714285714),
                                         ('precision', 0.5746753246753247),
                                         ('recall', 0.7901785714285714),
                                         ('F1', 0.6654135338345865),
                                         ('log_loss', 14.320915855497438),
                                         ('MCC', 0.2215228119522081)])),
                           ('Validation metrics',
                            OrderedDict([('accuracy', 0.38),
                                         ('precision', 0.11377245508982035),
                          

**OBSERVATION:** <font color='red'>**ONCE AGAIN, VERY BAD RESULTS EVEN WITH OPTIMIZATION**</font>

The problem here could be that all classifiers are suffering from overfitting and / or imbalanced dataset.

#### Focus on `precision`

We now set the focus on `precision`, as it is the metric we are more interested in.

In [38]:
precision_results['BMOPT_tr_precision'] = np.nan
precision_results['BMOPT_val_precision'] = np.nan
precision_results

Unnamed: 0,BM_tr_precision,BM_val_precision,BMOPT_tr_precision,BMOPT_val_precision
Perceptron,0.578526,0.122449,,
LogisticRegression,0.654519,0.105263,,
PassiveAggressiveClassifier,0.59882,0.090909,,
SVC,0.922438,0.155556,,
KNeighborsClassifier,0.764505,0.110092,,
GaussianProcessClassifier,1.0,0.115385,,
DecisionTreeClassifier,1.0,0.108696,,
RandomForestClassifier,1.0,0.181818,,
AdaBoostClassifier,0.859216,0.096154,,
GradientBoostingClassifier,0.968162,0.128205,,


In [39]:
for clf in fitted_classifiers_opt.keys():
    precision_results.loc[clf, 'BMOPT_tr_precision'] = \
        fitted_classifiers_opt[clf]['Training metrics']['precision']
    precision_results.loc[clf, 'BMOPT_val_precision'] = \
        fitted_classifiers_opt[clf]['Validation metrics']['precision']
precision_results

Unnamed: 0,BM_tr_precision,BM_val_precision,BMOPT_tr_precision,BMOPT_val_precision
Perceptron,0.578526,0.122449,0.574675,0.113772
LogisticRegression,0.654519,0.105263,0.65616,0.097345
PassiveAggressiveClassifier,0.59882,0.090909,0.570988,0.132231
SVC,0.922438,0.155556,0.958631,0.166667
KNeighborsClassifier,0.764505,0.110092,1.0,0.117647
GaussianProcessClassifier,1.0,0.115385,0.973913,0.222222
DecisionTreeClassifier,1.0,0.108696,0.934295,0.12
RandomForestClassifier,1.0,0.181818,0.98797,0.0625
AdaBoostClassifier,0.859216,0.096154,0.902616,0.073171
GradientBoostingClassifier,0.968162,0.128205,0.796634,0.105263


In [40]:
print("TRAINING / VALIDATION PRECISION RESULTS, SLIGHTLY OPTIMIZED OFF-THE-SHELF CLASSIFIERS")
for idx in (precision_results.index):
    print("%s: %.2f / %.2f" %(idx,
                              precision_results.loc[idx, 'BMOPT_tr_precision'],
                              precision_results.loc[idx, 'BMOPT_val_precision']))

TRAINING / VALIDATION PRECISION RESULTS, SLIGHTLY OPTIMIZED OFF-THE-SHELF CLASSIFIERS
Perceptron: 0.57 / 0.11
LogisticRegression: 0.66 / 0.10
PassiveAggressiveClassifier: 0.57 / 0.13
SVC: 0.96 / 0.17
KNeighborsClassifier: 1.00 / 0.12
GaussianProcessClassifier: 0.97 / 0.22
DecisionTreeClassifier: 0.93 / 0.12
RandomForestClassifier: 0.99 / 0.06
AdaBoostClassifier: 0.90 / 0.07
GradientBoostingClassifier: 0.80 / 0.11


**OBSERVATION:** so, it is clear that all the algorithms are still performing badly:

- All of them show some kind of overfitting.

**CONCLUSIONS: IMPROVEMENT/WORSENING WITH A SINGLE OPTIMIZATION OF OFF-THE-SHELF CLASSIFIERS:**
- Perceptron: **optimization has worsened the results a little**, and **overfitting / imbalance is still a problem** (but not so hard as in other cases, and it is still the most train/validation-balanced case from all the classifiers).
- LogisticRegression: **no improvement at all with optimization**. Again, extremely bad results, both in training and in validation. **Probably, suffering from imbalanced classes in an extreme way**. Does not even have the opportunity to overfit.
- PassiveAggressiveClassifier: **improves with optimization**. Performances in line with those of `Perceptron`, but a little bit more overfitting. **Probably, suffering from imbalanced classes**. Does not even have the opportunity to overfit.
- SVC: **great improve with optimization**, but **it now falls in extreme overfitting.**.
- KNeighborsClassifier: **with optimization, results seem more reasonable**. Perfect precision in training, low precision in validation. **Extreme overfitting problem**.
- GaussianProcessClassifier: **much worse with optimization**. Both`precision` values go to null: **extreme problem with imbalanced classes.** Does not even have the opportunity to overfit.
- DecisionTreeClassifier: **with optimization, it seems to overfit less**, but the `precision` in validation goes to zero. **Probably Suffering a lot from overfitting**.
- RandomForestClassifier: **no change with optimization**: again, **problems with both overfitting and umbalanced classes**.
- AdaBoostClassifier: **with optimization, the overfitting problem has improved, it now overfits less**, but still seems to suffer a little from imbalanced classes.
- GradientBoostingClassifier: **optimization has worsened the results**. Zero precision for both the training and validation sets. **Why?**

### Save the models

In [41]:
pickle.dump(fitted_classifiers_opt, open(MODELS_FOLDER + CASE_ID + "_1NN_fitted_clf_opt.pickle", 'wb'))

## Save the results

In [42]:
precision_results = precision_results.reset_index(drop=False).rename(columns={'index': 'Classifier'})
precision_results

Unnamed: 0,Classifier,BM_tr_precision,BM_val_precision,BMOPT_tr_precision,BMOPT_val_precision
0,Perceptron,0.578526,0.122449,0.574675,0.113772
1,LogisticRegression,0.654519,0.105263,0.65616,0.097345
2,PassiveAggressiveClassifier,0.59882,0.090909,0.570988,0.132231
3,SVC,0.922438,0.155556,0.958631,0.166667
4,KNeighborsClassifier,0.764505,0.110092,1.0,0.117647
5,GaussianProcessClassifier,1.0,0.115385,0.973913,0.222222
6,DecisionTreeClassifier,1.0,0.108696,0.934295,0.12
7,RandomForestClassifier,1.0,0.181818,0.98797,0.0625
8,AdaBoostClassifier,0.859216,0.096154,0.902616,0.073171
9,GradientBoostingClassifier,0.968162,0.128205,0.796634,0.105263


In [43]:
precision_results.to_csv(MODELS_FOLDER + PRECISION_RESULTS_OUT, sep=',', decimal='.', index=False)

## Predictions on the validation dataset

### Predictions

We now save the predictions on the validation dataset, alongside with all available metadata, so that they can be analysed later on.

In [44]:
s4_val_w_pred = s4_val.copy()
s4_val_w_pred

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,Star-00107,0,0.0,0.0,0.0,2.457430e+06,0.0,-0.991660,0.031948,0.542146,...,0.215296,-1.171010,-1.399418,0.202986,-0.550638,0.473838,-0.300629,-0.565171,-0.717831,-0.998997
1,Star-00868,0,0.0,0.0,0.0,2.457432e+06,0.0,-1.309194,-1.081711,1.825039,...,0.859339,-1.201677,-0.242453,-0.032886,-0.381893,4.479455,0.554354,0.946671,-2.945576,-0.390979
2,Star-00106,0,0.0,0.0,0.0,2.457404e+06,0.0,-0.356591,0.379966,0.844003,...,-0.619653,1.401153,0.280531,0.057394,-0.394560,0.012444,-0.506509,-0.073337,-0.019620,0.850942
3,Star-00120,0,0.0,0.0,0.0,2.457395e+06,0.0,-0.039057,0.519174,0.994931,...,-0.544944,-0.806949,-0.860069,0.186396,0.224160,-0.895172,-0.068329,-0.094626,0.283729,-1.018961
4,Star-00559,0,0.0,0.0,0.0,2.457441e+06,0.0,0.596012,-0.664089,-0.212498,...,0.600208,0.897864,0.219432,0.051864,-0.446379,-0.502312,-0.305140,-0.144503,0.889861,0.402366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Star-00232,0,0.0,0.0,0.0,2.457436e+06,0.0,0.913546,0.101552,-1.419927,...,-0.705425,0.564869,-0.512824,0.063347,-0.448318,-0.790483,0.590039,-0.056457,0.771706,0.054225
246,Star-00943,0,0.0,0.0,0.0,2.457418e+06,0.0,0.278478,-0.733692,1.447717,...,0.591706,-1.002341,0.166327,0.253063,0.827107,-0.848836,-0.189881,-0.728901,0.989328,-1.001833
247,Star-00721,0,0.0,0.0,0.0,2.457398e+06,0.0,-0.356591,1.493625,1.221324,...,0.631140,-1.384100,-0.953155,0.073608,0.003997,-0.510853,-0.228942,-0.016225,-0.886376,-1.003102
248,Star-00926,0,0.0,0.0,0.0,2.457425e+06,0.0,0.913546,-1.151314,0.240288,...,-0.040429,0.545847,-0.283424,0.096595,-0.575888,-0.223498,0.142206,-0.298919,-0.741117,0.182974


In [45]:
warnings.simplefilter('ignore')
for classifier in list(fitted_classifiers_opt.keys()):
    print("Calculating predictions for classifier %s..." %classifier)
    # Training set:
    y_val_pred = fitted_classifiers_opt[classifier]['Fitted_clf'].predict(s4_val[rel_features])
    s4_val_w_pred['Prediction_' + classifier] = y_val_pred

Calculating predictions for classifier Perceptron...
Calculating predictions for classifier LogisticRegression...
Calculating predictions for classifier PassiveAggressiveClassifier...
Calculating predictions for classifier SVC...
Calculating predictions for classifier KNeighborsClassifier...
Calculating predictions for classifier GaussianProcessClassifier...
Calculating predictions for classifier DecisionTreeClassifier...
Calculating predictions for classifier RandomForestClassifier...
Calculating predictions for classifier AdaBoostClassifier...
Calculating predictions for classifier GradientBoostingClassifier...


In [46]:
s4_val_w_pred.head(20)

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,...,Prediction_Perceptron,Prediction_LogisticRegression,Prediction_PassiveAggressiveClassifier,Prediction_SVC,Prediction_KNeighborsClassifier,Prediction_GaussianProcessClassifier,Prediction_DecisionTreeClassifier,Prediction_RandomForestClassifier,Prediction_AdaBoostClassifier,Prediction_GradientBoostingClassifier
0,Star-00107,0,0.0,0.0,0.0,2457430.0,0.0,-0.99166,0.031948,0.542146,...,0,0,0,0,0,0,0,0,0,0
1,Star-00868,0,0.0,0.0,0.0,2457432.0,0.0,-1.309194,-1.081711,1.825039,...,0,0,0,0,0,0,0,0,0,0
2,Star-00106,0,0.0,0.0,0.0,2457404.0,0.0,-0.356591,0.379966,0.844003,...,1,0,1,0,0,0,1,0,1,0
3,Star-00120,0,0.0,0.0,0.0,2457395.0,0.0,-0.039057,0.519174,0.994931,...,0,0,0,0,0,0,1,1,0,1
4,Star-00559,0,0.0,0.0,0.0,2457441.0,0.0,0.596012,-0.664089,-0.212498,...,0,0,1,0,0,0,0,0,0,0
5,Star-00205,1,71.94,0.53,0.0,2457510.0,0.05,-0.039057,1.493625,0.391217,...,1,1,1,0,0,0,1,0,0,1
6,Star-00061,0,0.0,0.0,0.0,2457432.0,0.0,-0.356591,1.493625,1.221324,...,1,1,1,0,0,0,0,0,0,0
7,Star-00258,0,0.0,0.0,0.0,2457401.0,0.0,0.278478,-0.176863,-0.665284,...,1,1,1,0,0,0,0,0,0,1
8,Star-00124,0,0.0,0.0,0.0,2457401.0,0.0,-1.944263,1.0064,2.051432,...,1,0,1,0,0,0,0,0,0,0
9,Star-00775,0,0.0,0.0,0.0,2457404.0,0.0,0.596012,-0.664089,-0.212498,...,0,0,1,0,0,0,0,0,0,0


### Prediction probabilities (if available)

In [47]:
warnings.simplefilter('ignore')
for classifier in list(fitted_classifiers_opt.keys()):
    print("Calculating predictions for classifier %s..." %classifier)
    # Validation set:
    try:
        y_val_pred_proba = fitted_classifiers_opt[classifier]['Fitted_clf'].predict_proba(s4_val[rel_features])
        s4_val_w_pred['PredictionProb_' + classifier] = pd.Series(y_val_pred_proba[:, 1])
        print("... ok, probabilities calculated")
    except:
        print("**WARNING: 'predict_proba' method failed for classifier '%s'." %classifier)
        s4_val_w_pred['PredictionProb_' + classifier] = np.nan

Calculating predictions for classifier Perceptron...
Calculating predictions for classifier LogisticRegression...
... ok, probabilities calculated
Calculating predictions for classifier PassiveAggressiveClassifier...
Calculating predictions for classifier SVC...
Calculating predictions for classifier KNeighborsClassifier...
... ok, probabilities calculated
Calculating predictions for classifier GaussianProcessClassifier...
... ok, probabilities calculated
Calculating predictions for classifier DecisionTreeClassifier...
... ok, probabilities calculated
Calculating predictions for classifier RandomForestClassifier...
... ok, probabilities calculated
Calculating predictions for classifier AdaBoostClassifier...
... ok, probabilities calculated
Calculating predictions for classifier GradientBoostingClassifier...
... ok, probabilities calculated


In [48]:
s4_val_w_pred.head(20)

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,...,PredictionProb_Perceptron,PredictionProb_LogisticRegression,PredictionProb_PassiveAggressiveClassifier,PredictionProb_SVC,PredictionProb_KNeighborsClassifier,PredictionProb_GaussianProcessClassifier,PredictionProb_DecisionTreeClassifier,PredictionProb_RandomForestClassifier,PredictionProb_AdaBoostClassifier,PredictionProb_GradientBoostingClassifier
0,Star-00107,0,0.0,0.0,0.0,2457430.0,0.0,-0.99166,0.031948,0.542146,...,,0.254676,,,0.0,0.313916,0.0,0.312359,0.3978,0.449421
1,Star-00868,0,0.0,0.0,0.0,2457432.0,0.0,-1.309194,-1.081711,1.825039,...,,0.193567,,,0.0,0.288121,0.079365,0.132156,0.476476,0.290055
2,Star-00106,0,0.0,0.0,0.0,2457404.0,0.0,-0.356591,0.379966,0.844003,...,,0.377001,,,0.0,0.286604,0.846154,0.273521,0.502487,0.494673
3,Star-00120,0,0.0,0.0,0.0,2457395.0,0.0,-0.039057,0.519174,0.994931,...,,0.438314,,,0.0,0.325637,0.962963,0.594324,0.497224,0.535603
4,Star-00559,0,0.0,0.0,0.0,2457441.0,0.0,0.596012,-0.664089,-0.212498,...,,0.458241,,,0.0,0.40798,0.022727,0.153099,0.488427,0.350336
5,Star-00205,1,71.94,0.53,0.0,2457510.0,0.05,-0.039057,1.493625,0.391217,...,,0.533833,,,0.0,0.462049,0.949275,0.44029,0.499768,0.506261
6,Star-00061,0,0.0,0.0,0.0,2457432.0,0.0,-0.356591,1.493625,1.221324,...,,0.646664,,,0.0,0.412224,0.352941,0.259671,0.411559,0.418226
7,Star-00258,0,0.0,0.0,0.0,2457401.0,0.0,0.278478,-0.176863,-0.665284,...,,0.673551,,,0.0,0.412592,0.377778,0.376673,0.499576,0.653731
8,Star-00124,0,0.0,0.0,0.0,2457401.0,0.0,-1.944263,1.0064,2.051432,...,,0.345656,,,0.0,0.389594,0.079365,0.216017,0.404364,0.358141
9,Star-00775,0,0.0,0.0,0.0,2457404.0,0.0,0.596012,-0.664089,-0.212498,...,,0.31465,,,0.0,0.422637,0.022727,0.260407,0.496226,0.369092


### Save the predictions

And we now save the file:

In [49]:
s4_val_w_pred.to_csv(MODELS_FOLDER + VAL_PREDICTIONS_OUT, sep=',', decimal='.', index=False)

## Summary

**RESULTS:**

- We tested different classifiers from different families against the S4 sample oversampled with SMOTE.
- In general, results show slight improvement with a very simple and naive model hyperparameter optimization.
- Overfitting seems to be a serious problem, more specially with the tree / ensemble methods.
- We have stored both the precision of the different ML models, as well as their predictions on the validation set (and prediction probabilities when available).

**CONCLUSIONS:**

- Additional work in tree and ensemble classifiers is needed to prevent overfitting: pruning the trees, for example (even if some values for `ccp_alpha` parameter were tried as part of the optimization).
