# MACHINE LEARNING - BULK VALIDATION - RANDOM GUESS

In this notebook we evaluate a baseline "classifier" that makes purely random guesses, weighted only by the prior knowledge of the class ratio ($\approx71\%$ pulsating stars, $\approx29\%$ non-pulsating stars).

The objective of this test is to check whether the validation previously carried out on the 20 datasets of type DS4 can be accepted: if the classification model clearly outperforms this random baseline, it can be considered a good one.

## Modules and configuration

### Modules

In [10]:
import warnings

import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

#from sklearn.experimental import enable_hist_gradient_boosting
# Not sure why this 'experimental' import is needed, as the sklearn version is 0.24.x > 0.21.x
from sklearn.ensemble import HistGradientBoostingClassifier


### Configuration

In [11]:
# CONFIGURATION:
# All tunable constants of the notebook (prior, seed, paths, column names) live here.

# Prior knowledge of ratio of pulsating stars:
# Probability with which the random guess predicts class 1 (pulsating);
# the remaining 29% of the guesses are class 0 (non-pulsating).
PRIOR = 0.71 # 71% prior knowledge of pulsating stars.

RANDOM_STATE = 11 # For reproducibility

# The validation datasets are indexed 0 .. N_DS-1.
N_DS = 20 # Number of synthetic datasets.

# Folder holding the validation datasets, and the file name pattern of each one;
# the "<number>" placeholder is replaced by the dataset index when loading.
CS_FEATURES_FOLDER = "../data/DATASETS_CESIUM/"
OUT_DATASET_GEN_FILE = "cesium_VAL_DS<number>_4_Dataset.csv"

# CSV file where the table of validation metrics is written.
VAL_RESULT_FILE = "./Bulk_RandomGuess_Result.csv"

# NOTE(review): presumably the folder of the trained ML models; it is not
# referenced by the cells visible in this notebook - confirm whether it is needed.
OUT_MODELS_FOLDER = "../data/MODELS_ML/"

# FEATURES TO KEEP - To remove the Lomb-Scargle (Periodic) 'cesium' features
# TARGET_DS4 is the list of target (class label) column(s) of the DS4 datasets;
# KEEP_CS_FEATURES is the list of feature columns selected from each dataset.
TARGET_DS4 = ['Pulsating']
KEEP_CS_FEATURES = ['all_times_nhist_numpeaks',
                   'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin',
                   'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4',
                   'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4',
                   'all_times_nhist_peak_3_to_4',
                   'all_times_nhist_peak_val',
                   'avg_double_to_single_step', 'avg_err', 'avgt',
                   'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50',
                   'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000',
                   'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000',
                   'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000',
                   'cads_avg', 'cads_med', 'cads_std', 'mean',
                   'med_double_to_single_step', 'med_err',
                   'n_epochs', 'std_double_to_single_step', 'std_err',
                   'total_time', 'amplitude',
                   'flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid35', 'flux_percentile_ratio_mid50',
                   'flux_percentile_ratio_mid65', 'flux_percentile_ratio_mid80',
                   'max_slope', 'maximum', 'median', 'median_absolute_deviation', 'minimum',
                   'percent_amplitude', 'percent_beyond_1_std', 'percent_close_to_median', 'percent_difference_flux_percentile',
                   'period_fast', 'qso_log_chi2_qsonu', 'qso_log_chi2nuNULL_chi2nu', 'skew', 'std',
                   'stetson_j', 'stetson_k', 'weighted_average', 'fold2P_slope_10percentile', 'fold2P_slope_90percentile']




### Functions

## Validate a simple, random guess model, against all the synthetic datasets

In this case, the prediction is just random, with 71% probability of pulsating and 29% probability of non-pulsating.

In [21]:
# Start from an empty results table: one identifier column plus one column
# per performance metric. Rows are added by the validation cell below.
result_columns = ['Val_sample_ID', 'precision', 'accuracy', 'recall', 'f1_score']
res_df = pd.DataFrame(columns=result_columns)
res_df.head()

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score


In [22]:
# Validate the random-guess "model" against each of the N_DS validation datasets.
#
# A dedicated, seeded generator makes the guesses (and therefore every metric
# below) reproducible on each "Restart & Run All".
rng = np.random.default_rng(RANDOM_STATE)

# One dict per dataset; the table is built once at the end instead of being
# grown row by row with pd.concat (quadratic, and leaves object-dtype columns).
records = []
for ds_idx in range(N_DS):
    # Load the validation dataset and the true classes as a 1-D vector.
    # The feature columns are not needed: a random guess ignores them.
    ds = pd.read_csv(CS_FEATURES_FOLDER + OUT_DATASET_GEN_FILE.replace("<number>", str(ds_idx)),
                     sep=',', decimal='.')
    y_val = ds[TARGET_DS4].to_numpy().ravel()
    # Random guess: 1 (pulsating) with probability PRIOR, 0 (non-pulsating) otherwise.
    y_pred = (rng.random(len(y_val)) < PRIOR).astype(int)
    # Calculate performance and store it:
    records.append({
        'Val_sample_ID': str(ds_idx),
        'precision': precision_score(y_val, y_pred),
        'accuracy': accuracy_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1_score': f1_score(y_val, y_pred)})

# Rebuilding the table from scratch keeps this cell idempotent: re-running it
# replaces the previous results instead of appending duplicated rows.
res_df = pd.DataFrame(records,
                      columns=['Val_sample_ID', 'precision', 'accuracy', 'recall', 'f1_score'])



In [23]:
res_df

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score
0,0,0.705686,0.6,0.745583,0.725086
1,1,0.694737,0.58,0.709677,0.702128
2,2,0.715278,0.6025,0.727915,0.721541
3,3,0.673759,0.55,0.683453,0.678571
4,4,0.672598,0.545,0.677419,0.675
5,5,0.751748,0.605,0.711921,0.731293
6,6,0.699647,0.58,0.704626,0.702128
7,7,0.684211,0.5675,0.701439,0.692718
8,8,0.717391,0.59,0.697183,0.707143
9,9,0.673835,0.58,0.709434,0.691176


In [24]:
# Append summary rows (min / max / stdev / mean of every metric) to the results table.
metric_cols = ['precision', 'accuracy', 'recall', 'f1_score']
# pandas aggregation name -> label shown in the 'Val_sample_ID' column.
# The dict order fixes the row order of the summary: min, max, stdev, mean.
summary_labels = {'min': 'min', 'max': 'max', 'std': 'stdev', 'mean': 'mean'}

# Drop summary rows left by a previous execution of this cell, so that the
# statistics are always computed over the per-dataset rows only (idempotent cell).
per_dataset_df = res_df[~res_df['Val_sample_ID'].isin(list(summary_labels.values()))]

summary_df = (
    per_dataset_df[metric_cols]
    .astype(float)                      # metrics may be stored as object dtype
    .agg(list(summary_labels))          # one row per statistic ('std' is the sample stdev)
    .rename(index=summary_labels)
    .rename_axis('Val_sample_ID')
    .reset_index()
)

res_df = pd.concat([per_dataset_df, summary_df], ignore_index=True)
res_df

Unnamed: 0,Val_sample_ID,precision,accuracy,recall,f1_score
0,0,0.705686,0.6,0.745583,0.725086
1,1,0.694737,0.58,0.709677,0.702128
2,2,0.715278,0.6025,0.727915,0.721541
3,3,0.673759,0.55,0.683453,0.678571
4,4,0.672598,0.545,0.677419,0.675
5,5,0.751748,0.605,0.711921,0.731293
6,6,0.699647,0.58,0.704626,0.702128
7,7,0.684211,0.5675,0.701439,0.692718
8,8,0.717391,0.59,0.697183,0.707143
9,9,0.673835,0.58,0.709434,0.691176


In [25]:
# Render the results table as LaTeX source, every float with four decimals
# and without the index column, and print it for copy-pasting:
latex_table = res_df.to_latex(index=False, float_format="%.4f")
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
Val\_sample\_ID &  precision &  accuracy &  recall &  f1\_score \\
\midrule
            0 &     0.7057 &    0.6000 &  0.7456 &    0.7251 \\
            1 &     0.6947 &    0.5800 &  0.7097 &    0.7021 \\
            2 &     0.7153 &    0.6025 &  0.7279 &    0.7215 \\
            3 &     0.6738 &    0.5500 &  0.6835 &    0.6786 \\
            4 &     0.6726 &    0.5450 &  0.6774 &    0.6750 \\
            5 &     0.7517 &    0.6050 &  0.7119 &    0.7313 \\
            6 &     0.6996 &    0.5800 &  0.7046 &    0.7021 \\
            7 &     0.6842 &    0.5675 &  0.7014 &    0.6927 \\
            8 &     0.7174 &    0.5900 &  0.6972 &    0.7071 \\
            9 &     0.6738 &    0.5800 &  0.7094 &    0.6912 \\
           10 &     0.7220 &    0.5800 &  0.6873 &    0.7042 \\
           11 &     0.7197 &    0.6075 &  0.7324 &    0.7260 \\
           12 &     0.7169 &    0.5825 &  0.6842 &    0.7002 \\
           13 &     0.7289 &    0.6100 &  0.7238 &    0.7263

In [26]:
# Persist the results table as a CSV file (no index column).
res_df.to_csv(
    VAL_RESULT_FILE,
    index=False,
    sep=',',
    decimal='.',
)