# MACHINE LEARNING - BULK VALIDATION

In this notebook we perform the bulk validation of the trained model against the 20 previously created random synthetic datasets (of type DS4).

## Modules and configuration

### Modules

In [3]:
import warnings

import pandas as pd

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

#from sklearn.experimental import enable_hist_gradient_boosting
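# NOTE: the 'experimental' import above is required on scikit-learn < 1.0 (HistGradientBoostingClassifier
# was still experimental in 0.24.x); from scikit-learn 1.0 onwards it is no longer needed.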
from sklearn.ensemble import HistGradientBoostingClassifier

from joblib import dump, load


### Configuration

In [4]:
# CONFIGURATION:

# Seed, for reproducibility.
RANDOM_STATE = 11

# Number of synthetic datasets.
N_DS = 20

# Input datasets: folder and file name pattern ("<number>" is the placeholder
# for the dataset index).
CS_FEATURES_FOLDER = "../data/DATASETS_CESIUM/"
OUT_DATASET_GEN_FILE = "cesium_VAL_DS<number>_4_Dataset.csv"

# Output file for the validation results.
VAL_RESULT_FILE = "./Bulk_ModelValidation_Result.csv"

# Folder with the stored models.
OUT_MODELS_FOLDER = "../data/MODELS_ML/"

# Target column(s) of the DS4 datasets.
TARGET_DS4 = ['Pulsating']

# FEATURES TO KEEP - To remove the Lomb-Scargle (Periodic) 'cesium' features.
# The order of the entries is the column order of the feature matrix.
KEEP_CS_FEATURES = [
    # all_times_nhist_*
    'all_times_nhist_numpeaks',
    'all_times_nhist_peak1_bin',
    'all_times_nhist_peak2_bin',
    'all_times_nhist_peak3_bin',
    'all_times_nhist_peak4_bin',
    'all_times_nhist_peak_1_to_2',
    'all_times_nhist_peak_1_to_3',
    'all_times_nhist_peak_1_to_4',
    'all_times_nhist_peak_2_to_3',
    'all_times_nhist_peak_2_to_4',
    'all_times_nhist_peak_3_to_4',
    'all_times_nhist_peak_val',
    # avg*
    'avg_double_to_single_step',
    'avg_err',
    'avgt',
    # cad_probs_*
    'cad_probs_1',
    'cad_probs_10',
    'cad_probs_20',
    'cad_probs_30',
    'cad_probs_40',
    'cad_probs_50',
    'cad_probs_100',
    'cad_probs_500',
    'cad_probs_1000',
    'cad_probs_5000',
    'cad_probs_10000',
    'cad_probs_50000',
    'cad_probs_100000',
    'cad_probs_500000',
    'cad_probs_1000000',
    'cad_probs_5000000',
    'cad_probs_10000000',
    # cads_*
    'cads_avg',
    'cads_med',
    'cads_std',
    # remaining features
    'mean',
    'med_double_to_single_step',
    'med_err',
    'n_epochs',
    'std_double_to_single_step',
    'std_err',
    'total_time',
    'amplitude',
    'flux_percentile_ratio_mid20',
    'flux_percentile_ratio_mid35',
    'flux_percentile_ratio_mid50',
    'flux_percentile_ratio_mid65',
    'flux_percentile_ratio_mid80',
    'max_slope',
    'maximum',
    'median',
    'median_absolute_deviation',
    'minimum',
    'percent_amplitude',
    'percent_beyond_1_std',
    'percent_close_to_median',
    'percent_difference_flux_percentile',
    'period_fast',
    'qso_log_chi2_qsonu',
    'qso_log_chi2nuNULL_chi2nu',
    'skew',
    'std',
    'stetson_j',
    'stetson_k',
    'weighted_average',
    'fold2P_slope_10percentile',
    'fold2P_slope_90percentile',
]




### Functions

## Load the model

In [5]:
# Load the stored search object from the models folder.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
model_path = OUT_MODELS_FOLDER + "Best_Model_After_RandSearchCV.joblib"
clf = load(model_path)
clf

RandomizedSearchCV(cv=5,
                   estimator=HistGradientBoostingClassifier(random_state=11),
                   n_iter=200,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000276CAE05430>,
                                        'max_bins': [255], 'max_depth': [None],
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000276CE6B74F0>,
                                        'max_leaf_nodes': [None],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000276CE6B74C0>},
                   scoring='roc_auc', verbose=1)

In [6]:
# Show the parameter setting selected by the loaded search object.
clf.best_params_

{'learning_rate': 0.08887773751235287,
 'max_bins': 255,
 'max_depth': None,
 'max_iter': 53,
 'max_leaf_nodes': None,
 'min_samples_leaf': 58}

## Validate the model against all the synthetic datasets

In [7]:
# Empty results table: an ID column for the validation sample plus one column per metric.
result_columns = ['Val_sample_ID', 'precision', 'accuracy', 'recall', 'f1_score']
res_df = pd.DataFrame(columns=result_columns)
res_df.head()

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score


In [8]:
# Validate the tuned model against every synthetic dataset.
# The predictions are computed once per dataset and shared by the four metrics,
# and the per-dataset rows are collected in a list so that the results table is
# built in a single step instead of being re-concatenated on every iteration.
best_model = clf.best_estimator_
val_records = []
for i in range(N_DS):
    # Load the validation dataset:
    ds_path = CS_FEATURES_FOLDER + OUT_DATASET_GEN_FILE.replace("<number>", str(i))
    ds = pd.read_csv(ds_path, sep=',', decimal='.')
    X_val = ds[KEEP_CS_FEATURES]
    y_val = ds[TARGET_DS4]
    # Predict once and reuse the predicted labels for every metric:
    y_pred = best_model.predict(X_val)
    # Add the data:
    val_records.append({
        'Val_sample_ID': str(i),
        'precision': precision_score(y_val, y_pred),
        'accuracy': accuracy_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1_score': f1_score(y_val, y_pred)})

# Passing the column list keeps the expected layout even if no dataset was processed.
res_df = pd.DataFrame(
    val_records,
    columns=['Val_sample_ID', 'precision', 'accuracy', 'recall', 'f1_score'])



In [9]:
# Display the validation metrics table.
res_df

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score
0,0,0.703804,0.6675,0.915194,0.795699
1,1,0.705882,0.6875,0.946237,0.808576
2,2,0.724796,0.705,0.939929,0.818462
3,3,0.6875,0.65,0.910072,0.783282
4,4,0.707775,0.69,0.946237,0.809816
5,5,0.756303,0.7025,0.89404,0.819423
6,6,0.707124,0.69,0.953737,0.812121
7,7,0.697297,0.67,0.928058,0.796296
8,8,0.70411,0.6625,0.90493,0.791988
9,9,0.66129,0.6375,0.928302,0.77237


In [10]:
# Append summary rows (min, max, standard deviation, mean) of every metric to
# the results table.
metric_cols = ['precision', 'accuracy', 'recall', 'f1_score']
# Row label -> pandas aggregation name, in the order the rows are appended.
summary_stats = {'min': 'min', 'max': 'max', 'stdev': 'std', 'mean': 'mean'}

# Work on the per-dataset rows only: this cell overwrites `res_df`, so on a
# re-run the summary rows appended by the previous run must not be folded into
# the new statistics.
runs_df = res_df[~res_df['Val_sample_ID'].isin(list(summary_stats))]
metrics = runs_df[metric_cols].astype(float)

summary_df = pd.DataFrame(
    [{'Val_sample_ID': label, **metrics.agg(stat).to_dict()}
     for label, stat in summary_stats.items()],
    columns=['Val_sample_ID'] + metric_cols)

res_df = pd.concat([runs_df, summary_df], ignore_index=True)
res_df

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score
0,0,0.703804,0.6675,0.915194,0.795699
1,1,0.705882,0.6875,0.946237,0.808576
2,2,0.724796,0.705,0.939929,0.818462
3,3,0.6875,0.65,0.910072,0.783282
4,4,0.707775,0.69,0.946237,0.809816
5,5,0.756303,0.7025,0.89404,0.819423
6,6,0.707124,0.69,0.953737,0.812121
7,7,0.697297,0.67,0.928058,0.796296
8,8,0.70411,0.6625,0.90493,0.791988
9,9,0.66129,0.6375,0.928302,0.77237


In [11]:
# Render the results table as LaTeX (four decimals, no index column) and print it.
latex_table = res_df.to_latex(float_format="%.4f", index=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
Val\_sample\_ID &  precision &  accuracy &  recall &  f1\_score \\
\midrule
            0 &     0.7038 &    0.6675 &  0.9152 &    0.7957 \\
            1 &     0.7059 &    0.6875 &  0.9462 &    0.8086 \\
            2 &     0.7248 &    0.7050 &  0.9399 &    0.8185 \\
            3 &     0.6875 &    0.6500 &  0.9101 &    0.7833 \\
            4 &     0.7078 &    0.6900 &  0.9462 &    0.8098 \\
            5 &     0.7563 &    0.7025 &  0.8940 &    0.8194 \\
            6 &     0.7071 &    0.6900 &  0.9537 &    0.8121 \\
            7 &     0.6973 &    0.6700 &  0.9281 &    0.7963 \\
            8 &     0.7041 &    0.6625 &  0.9049 &    0.7920 \\
            9 &     0.6613 &    0.6375 &  0.9283 &    0.7724 \\
           10 &     0.7253 &    0.6825 &  0.9072 &    0.8061 \\
           11 &     0.7170 &    0.6925 &  0.9366 &    0.8122 \\
           12 &     0.7139 &    0.6800 &  0.9193 &    0.8037 \\
           13 &     0.7054 &    0.6650 &  0.9126 &    0.7957

## Save results

In [12]:
# Write the results table to the configured CSV file, without the index column.
res_df.to_csv(VAL_RESULT_FILE, sep=',', decimal='.', index=False)