#  DATASET OVERSAMPLING

In this notebook we explore oversampling the S4 sample so that its pulsating and non-pulsating classes become balanced.

We will use `SMOTE` from the `imblearn` module, with the parameter `k_neighbors` = 3, because this is the value initially returned by `GridSearchCV` for the `KNeighborsClassifier`.

## Modules and configuration

### Modules

In [21]:
import pandas as pd
import numpy as np

#import warnings

from imblearn.over_sampling import SMOTE

#from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

import pickle

#import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_style("white", {'figure.figsize':(15,10)})

#from IPython.display import display

### Configuration

In [47]:
RANDOM_STATE = 11 # Seed passed to SMOTE, for reproducibility

# Input: train set for S4 sample, all 112 features
S4_TRAIN_SET_IN = "../data/DATASETS_ML/S4_02_DS_AfterImputing_TrainTest.csv"

# Input: pickled list with the names of the reliable features (the columns that are oversampled)
REL_FEATURES_IN = "../data/ML_MODELS/ML_pipeline_steps/Reliable_features.pickle"

# Output: oversampled train set for S4 sample ('ID', 'Pulsating' and the reliable features)
S4_TRAIN_SET_OUT_OVS = "../data/DATASETS_ML/S4_02_DS_AfterImputing_TrainTest_OVERSAMPLED_n3.csv"

#ML_ADD_COLUMNS = ['Karmn'] # Only cesium features and this column will be kept.
# Metadata columns of the S4 sample: only cesium features and these columns will be kept.
# NOTE(review): this constant does not appear to be used by the cells of this notebook - confirm it is still needed.
S4_ADD_COLUMNS = ['ID', 'Pulsating', 'frequency', 'amplitudeRV',
                  'offsetRV', 'refepochRV', 'phase']

# Folder for images.
# NOTE(review): no cell of this notebook appears to write images - confirm it is still needed.
IMAGE_FOLDER = './img/'

### Functions

## Load data

### Load training data

We load the data, which are the S4 sample dataset, scaled, and with `NaN` values imputed by a `KNNImputer`.

In [3]:
# Read the S4 train set (comma-separated values, '.' as decimal mark) and display it.
s4 = pd.read_csv(S4_TRAIN_SET_IN, delimiter=',', decimal='.')
s4

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,Star-00163,False,0.0,0.00,0.0,2.457444e+06,0.00,-0.674126,0.519174,0.466681,...,-0.712310,-1.187392,0.425026,-0.002305,0.495906,-0.537353,-0.028926,-0.262548,-0.135686,-0.705143
1,Star-00123,True,30.0,0.72,0.0,2.457401e+06,0.37,-1.626729,1.911247,-0.740748,...,0.040924,-1.110488,-0.289189,0.056551,0.555375,-0.699590,-0.292135,-0.013533,0.443673,-1.207278
2,Star-00022,False,0.0,0.00,0.0,2.457430e+06,0.00,-0.039057,-1.012107,0.013895,...,-0.943428,0.637603,-0.679383,0.020496,-0.496592,-0.001214,-0.101526,-0.011097,-0.293389,0.242263
3,Star-00708,False,0.0,0.00,0.0,2.459677e+06,0.00,-0.039057,1.632833,-0.514355,...,-1.091456,0.759880,-0.161363,-0.210930,0.135863,0.662121,-0.492481,0.015621,-0.724783,0.682494
4,Star-00484,False,0.0,0.00,0.0,2.457400e+06,0.00,0.596012,-0.176863,-1.042605,...,-0.696260,0.153752,0.936459,0.070402,-0.067689,-0.656553,-0.237337,-0.032597,-0.139141,-0.098080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,Star-00795,False,0.0,0.00,0.0,2.457410e+06,0.00,0.278478,-0.803296,1.070396,...,-0.390463,-0.320856,-0.058001,0.061510,-0.608276,-1.229502,-0.332122,-0.410947,1.478398,-0.121757
746,Star-00221,False,0.0,0.00,0.0,2.457478e+06,0.00,-0.039057,-1.290522,-0.438891,...,0.703166,-1.543128,-0.386300,0.084782,-0.063902,0.199957,0.689712,-0.526121,1.559757,-1.150793
747,Star-00463,False,0.0,0.00,0.0,2.457409e+06,0.00,0.596012,-0.733692,-1.193534,...,-0.741328,1.272454,-0.054747,0.066032,-0.327664,0.414920,0.069037,0.039541,-0.218490,1.238094
748,Star-00873,False,0.0,0.00,0.0,2.457416e+06,0.00,-0.674126,-0.524881,0.542146,...,0.798791,-1.532917,-2.547988,0.149859,1.751137,-0.416064,-0.361215,0.544014,-1.894647,-1.094748


In [4]:
# Encode the target as integers: 1 where the value equals True, 0 for anything else.
s4['Pulsating'] = s4['Pulsating'].eq(True).astype('int64')
s4

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,Star-00163,0,0.0,0.00,0.0,2.457444e+06,0.00,-0.674126,0.519174,0.466681,...,-0.712310,-1.187392,0.425026,-0.002305,0.495906,-0.537353,-0.028926,-0.262548,-0.135686,-0.705143
1,Star-00123,1,30.0,0.72,0.0,2.457401e+06,0.37,-1.626729,1.911247,-0.740748,...,0.040924,-1.110488,-0.289189,0.056551,0.555375,-0.699590,-0.292135,-0.013533,0.443673,-1.207278
2,Star-00022,0,0.0,0.00,0.0,2.457430e+06,0.00,-0.039057,-1.012107,0.013895,...,-0.943428,0.637603,-0.679383,0.020496,-0.496592,-0.001214,-0.101526,-0.011097,-0.293389,0.242263
3,Star-00708,0,0.0,0.00,0.0,2.459677e+06,0.00,-0.039057,1.632833,-0.514355,...,-1.091456,0.759880,-0.161363,-0.210930,0.135863,0.662121,-0.492481,0.015621,-0.724783,0.682494
4,Star-00484,0,0.0,0.00,0.0,2.457400e+06,0.00,0.596012,-0.176863,-1.042605,...,-0.696260,0.153752,0.936459,0.070402,-0.067689,-0.656553,-0.237337,-0.032597,-0.139141,-0.098080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,Star-00795,0,0.0,0.00,0.0,2.457410e+06,0.00,0.278478,-0.803296,1.070396,...,-0.390463,-0.320856,-0.058001,0.061510,-0.608276,-1.229502,-0.332122,-0.410947,1.478398,-0.121757
746,Star-00221,0,0.0,0.00,0.0,2.457478e+06,0.00,-0.039057,-1.290522,-0.438891,...,0.703166,-1.543128,-0.386300,0.084782,-0.063902,0.199957,0.689712,-0.526121,1.559757,-1.150793
747,Star-00463,0,0.0,0.00,0.0,2.457409e+06,0.00,0.596012,-0.733692,-1.193534,...,-0.741328,1.272454,-0.054747,0.066032,-0.327664,0.414920,0.069037,0.039541,-0.218490,1.238094
748,Star-00873,0,0.0,0.00,0.0,2.457416e+06,0.00,-0.674126,-0.524881,0.542146,...,0.798791,-1.532917,-2.547988,0.149859,1.751137,-0.416064,-0.361215,0.544014,-1.894647,-1.094748


### Load reliable features

In [5]:
# Load the list of reliable feature names.
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project.
# The context manager guarantees the file handle is closed once the list has been read.
with open(REL_FEATURES_IN, 'rb') as rel_features_file:
    rel_features = pickle.load(rel_features_file)
print(rel_features)

['all_times_nhist_numpeaks', 'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin', 'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4', 'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4', 'all_times_nhist_peak_3_to_4', 'all_times_nhist_peak_val', 'avg_double_to_single_step', 'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50', 'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000', 'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000', 'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000', 'cads_avg', 'cads_med', 'cads_std', 'med_double_to_single_step', 'n_epochs', 'std_double_to_single_step', 'total_time', 'percent_beyond_1_std', 'freq1_rel_phase2', 'freq1_rel_phase3', 'freq1_rel_phase4', 'freq2_rel_phase2', 'freq2_rel_phase3', 'freq2_rel_phase4', 'freq3_rel_phase2', 'freq3_rel_phase3', '

## Apply standard oversampling

We first extract the relevant columns (metadata are not to be oversampled).

In [6]:
# Feature matrix (reliable features only) and target vector used by the oversampler.
X = s4[rel_features]
y = s4['Pulsating']

Create the oversampler (default parameters, except for `k_neighbors=3` and a fixed `random_state`):

In [7]:
# SMOTE oversampler using 3 nearest neighbours and a fixed seed for reproducibility.
# `n_jobs` is intentionally not passed: it is deprecated in imbalanced-learn and rejected by
# newer releases, and the previously passed value (None) was already the default.
sm = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE, k_neighbors=3)
sm

And we now apply the oversampling:

In [8]:
# Fit the oversampler on the train set; returns the resampled features and targets.
X_res, y_res = sm.fit_resample(X, y)

In [9]:
# Display the resampled feature matrix.
X_res

Unnamed: 0,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,all_times_nhist_peak_2_to_3,all_times_nhist_peak_2_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,1.390901,0.462908,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,0.019744,-0.192345,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,-0.412519,-0.042766,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,-0.338658,-0.449412,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,-0.451106,-0.321570,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,0.473173,-0.668664,0.135630,-0.259518,-1.373200,-0.470559,-0.589896,-0.548709,-0.448373,-0.491607,...,-0.038051,-0.958444,0.386309,-0.805888,0.697947,0.293784,-0.667106,-0.192877,1.224979,-1.394465
1340,-0.993662,-0.524004,0.918516,1.589867,-0.372796,-0.377095,-0.318659,-0.118490,-0.120285,0.108136,...,0.036617,1.595782,0.586577,-0.903790,0.315474,-0.690108,-0.846619,0.819517,-0.407266,-1.583849
1341,0.610021,-1.068148,0.492761,0.150468,-0.980256,-0.169826,-0.341273,-0.177918,-0.341825,-0.169797,...,-0.194276,-0.105307,1.257649,-0.697622,-0.195618,-0.123771,-0.670504,0.582847,1.003724,-1.285203
1342,1.061356,-0.373715,0.099776,-0.660900,-1.102506,0.339215,-0.083127,0.190607,-0.361995,-0.066276,...,-0.124422,-0.875222,1.033421,-0.335086,-0.822268,-0.909756,-0.608352,-0.050957,1.095146,-1.091260


In [11]:
# Display the resampled target vector.
y_res

0       0
1       1
2       0
3       0
4       0
       ..
1339    1
1340    1
1341    1
1342    1
1343    1
Name: Pulsating, Length: 1344, dtype: int64

In [12]:
# Number of samples labelled 0 after resampling.
y_res[y_res.eq(0)].shape

(672,)

In [13]:
# Number of samples labelled 1 after resampling.
y_res[y_res.eq(1)].shape

(672,)

Hence, we now have a balanced dataset. The only limitation is that we cannot include the metadata for the synthetic samples, as their values are unknown.

We merge the resulting features and targets together before saving

In [14]:
# Single frame with the target as first column, followed by the resampled features.
train_res = pd.concat(objs=[y_res, X_res], axis='columns')
train_res

Unnamed: 0,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,all_times_nhist_peak_2_to_3,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,1.390901,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,0.019744,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,-0.412519,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,0,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,-0.338658,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,0,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,-0.451106,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,1,0.473173,-0.668664,0.135630,-0.259518,-1.373200,-0.470559,-0.589896,-0.548709,-0.448373,...,-0.038051,-0.958444,0.386309,-0.805888,0.697947,0.293784,-0.667106,-0.192877,1.224979,-1.394465
1340,1,-0.993662,-0.524004,0.918516,1.589867,-0.372796,-0.377095,-0.318659,-0.118490,-0.120285,...,0.036617,1.595782,0.586577,-0.903790,0.315474,-0.690108,-0.846619,0.819517,-0.407266,-1.583849
1341,1,0.610021,-1.068148,0.492761,0.150468,-0.980256,-0.169826,-0.341273,-0.177918,-0.341825,...,-0.194276,-0.105307,1.257649,-0.697622,-0.195618,-0.123771,-0.670504,0.582847,1.003724,-1.285203
1342,1,1.061356,-0.373715,0.099776,-0.660900,-1.102506,0.339215,-0.083127,0.190607,-0.361995,...,-0.124422,-0.875222,1.033421,-0.335086,-0.822268,-0.909756,-0.608352,-0.050957,1.095146,-1.091260


The cells in the section *Check with the OLD one*, at the end of this notebook, were used to check that the new execution, which adds the star identifier, did not change the oversampled train set.

### Add the `ID` field for the original stars (new data points will not have that ID)

For the new data points created by `SMOTE`, we assign an identifier built from the row index plus 10000 (`SMOTE-<xxxxx>`).

In [26]:
# Add a placeholder 'ID' column as the first column.
# The guard keeps the cell re-runnable: `DataFrame.insert` raises ValueError
# when the column already exists.
if 'ID' not in train_res.columns:
    train_res.insert(0, 'ID', "-")
train_res.head()

Unnamed: 0,ID,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,-,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,-,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.16642,0.157675,...,1.532902,-1.22435,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,-,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.5925,0.135213,0.725303,-0.319881
3,-,0,-0.039057,1.632833,-0.514355,0.166993,1.47763,-0.544204,-0.572606,-0.586661,...,1.457441,-0.92175,-1.095322,-0.03196,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,-,0,0.596012,-0.176863,-1.042605,-0.43231,0.242158,-0.277263,-0.498198,-0.37002,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.47529,-1.268004,1.297212,1.327633,-0.059789


In [31]:
# Rows with labels 0 .. len(s4)-1 (label-based slice, end included), all columns.
train_res.loc[0:len(s4) - 1]

Unnamed: 0,ID,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,-,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,-,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,-,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,-,0,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,-,0,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,-,0,0.278478,-0.803296,1.070396,0.991036,1.168762,0.022969,-0.301300,-0.379804,...,-0.288258,-0.848774,-0.854496,1.491923,1.008801,-0.519740,-1.680259,-1.300470,0.307095,-1.189971
746,-,0,-0.039057,-1.290522,-0.438891,-1.630917,-0.916097,1.116014,0.520084,-0.017803,...,1.636850,-1.044746,0.342069,-1.435674,-1.390858,-0.052260,-1.612403,1.449496,-0.028414,1.009892
747,-,0,0.596012,-0.733692,-1.193534,-0.731962,-0.993314,-0.671992,0.345105,-0.083419,...,0.554391,-1.609160,1.073499,-1.750994,-1.230980,-0.473819,-0.204503,0.349673,0.121996,-0.524541
748,-,0,-0.674126,-0.524881,0.542146,0.916123,1.245979,0.340946,0.439292,-0.069517,...,-1.035107,0.803325,-0.968768,-1.073748,-0.666836,0.702556,-0.746085,1.140938,-1.325713,0.126896


In [33]:
# The first len(s4) rows are the original stars, so their identifiers are copied
# from the input frame (the assignment is aligned on the index labels).
original_rows = slice(0, len(s4) - 1)
train_res.loc[original_rows, 'ID'] = s4['ID']
train_res

Unnamed: 0,ID,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,Star-00163,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,Star-00123,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,Star-00022,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,Star-00708,0,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,Star-00484,0,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,-,1,0.473173,-0.668664,0.135630,-0.259518,-1.373200,-0.470559,-0.589896,-0.548709,...,-0.038051,-0.958444,0.386309,-0.805888,0.697947,0.293784,-0.667106,-0.192877,1.224979,-1.394465
1340,-,1,-0.993662,-0.524004,0.918516,1.589867,-0.372796,-0.377095,-0.318659,-0.118490,...,0.036617,1.595782,0.586577,-0.903790,0.315474,-0.690108,-0.846619,0.819517,-0.407266,-1.583849
1341,-,1,0.610021,-1.068148,0.492761,0.150468,-0.980256,-0.169826,-0.341273,-0.177918,...,-0.194276,-0.105307,1.257649,-0.697622,-0.195618,-0.123771,-0.670504,0.582847,1.003724,-1.285203
1342,-,1,1.061356,-0.373715,0.099776,-0.660900,-1.102506,0.339215,-0.083127,0.190607,...,-0.124422,-0.875222,1.033421,-0.335086,-0.822268,-0.909756,-0.608352,-0.050957,1.095146,-1.091260


In [42]:
# Index labels of the rows that come after the original stars (the synthetic samples).
synthetic_rows = train_res.loc[len(s4):len(train_res) - 1, 'ID']
smote_id = [*synthetic_rows.index]
print(smote_id)

[750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949,

In [43]:
# Build the identifiers ("SMOTE-" + row label + 10000, zero-padded to 5 digits) directly
# from the index of the synthetic rows. Deriving them from `train_res` instead of from the
# previous contents of `smote_id` keeps the cell re-runnable: the old form rebound
# `smote_id` from integers to strings and failed with TypeError on a second run.
smote_id = [f"SMOTE-{row_label + 10000:05d}"
            for row_label in train_res.loc[len(s4):len(train_res) - 1].index]
print(smote_id)

['SMOTE-10750', 'SMOTE-10751', 'SMOTE-10752', 'SMOTE-10753', 'SMOTE-10754', 'SMOTE-10755', 'SMOTE-10756', 'SMOTE-10757', 'SMOTE-10758', 'SMOTE-10759', 'SMOTE-10760', 'SMOTE-10761', 'SMOTE-10762', 'SMOTE-10763', 'SMOTE-10764', 'SMOTE-10765', 'SMOTE-10766', 'SMOTE-10767', 'SMOTE-10768', 'SMOTE-10769', 'SMOTE-10770', 'SMOTE-10771', 'SMOTE-10772', 'SMOTE-10773', 'SMOTE-10774', 'SMOTE-10775', 'SMOTE-10776', 'SMOTE-10777', 'SMOTE-10778', 'SMOTE-10779', 'SMOTE-10780', 'SMOTE-10781', 'SMOTE-10782', 'SMOTE-10783', 'SMOTE-10784', 'SMOTE-10785', 'SMOTE-10786', 'SMOTE-10787', 'SMOTE-10788', 'SMOTE-10789', 'SMOTE-10790', 'SMOTE-10791', 'SMOTE-10792', 'SMOTE-10793', 'SMOTE-10794', 'SMOTE-10795', 'SMOTE-10796', 'SMOTE-10797', 'SMOTE-10798', 'SMOTE-10799', 'SMOTE-10800', 'SMOTE-10801', 'SMOTE-10802', 'SMOTE-10803', 'SMOTE-10804', 'SMOTE-10805', 'SMOTE-10806', 'SMOTE-10807', 'SMOTE-10808', 'SMOTE-10809', 'SMOTE-10810', 'SMOTE-10811', 'SMOTE-10812', 'SMOTE-10813', 'SMOTE-10814', 'SMOTE-10815', 'SMOTE-10

In [44]:
# Write the generated identifiers into every row after the original stars.
train_res.loc[len(s4):, 'ID'] = smote_id
train_res

Unnamed: 0,ID,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,Star-00163,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,Star-00123,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,Star-00022,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,Star-00708,0,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,Star-00484,0,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,SMOTE-11339,1,0.473173,-0.668664,0.135630,-0.259518,-1.373200,-0.470559,-0.589896,-0.548709,...,-0.038051,-0.958444,0.386309,-0.805888,0.697947,0.293784,-0.667106,-0.192877,1.224979,-1.394465
1340,SMOTE-11340,1,-0.993662,-0.524004,0.918516,1.589867,-0.372796,-0.377095,-0.318659,-0.118490,...,0.036617,1.595782,0.586577,-0.903790,0.315474,-0.690108,-0.846619,0.819517,-0.407266,-1.583849
1341,SMOTE-11341,1,0.610021,-1.068148,0.492761,0.150468,-0.980256,-0.169826,-0.341273,-0.177918,...,-0.194276,-0.105307,1.257649,-0.697622,-0.195618,-0.123771,-0.670504,0.582847,1.003724,-1.285203
1342,SMOTE-11342,1,1.061356,-0.373715,0.099776,-0.660900,-1.102506,0.339215,-0.083127,0.190607,...,-0.124422,-0.875222,1.033421,-0.335086,-0.822268,-0.909756,-0.608352,-0.050957,1.095146,-1.091260


In [45]:
# Rows around the boundary between original stars and synthetic samples (positional slice).
train_res.iloc[745:755]

Unnamed: 0,ID,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
745,Star-00795,0,0.278478,-0.803296,1.070396,0.991036,1.168762,0.022969,-0.3013,-0.379804,...,-0.288258,-0.848774,-0.854496,1.491923,1.008801,-0.51974,-1.680259,-1.30047,0.307095,-1.189971
746,Star-00221,0,-0.039057,-1.290522,-0.438891,-1.630917,-0.916097,1.116014,0.520084,-0.017803,...,1.63685,-1.044746,0.342069,-1.435674,-1.390858,-0.05226,-1.612403,1.449496,-0.028414,1.009892
747,Star-00463,0,0.596012,-0.733692,-1.193534,-0.731962,-0.993314,-0.671992,0.345105,-0.083419,...,0.554391,-1.60916,1.073499,-1.750994,-1.23098,-0.473819,-0.204503,0.349673,0.121996,-0.524541
748,Star-00873,0,-0.674126,-0.524881,0.542146,0.916123,1.245979,0.340946,0.439292,-0.069517,...,-1.035107,0.803325,-0.968768,-1.073748,-0.666836,0.702556,-0.746085,1.140938,-1.325713,0.126896
749,Star-00998,0,-0.356591,-0.037656,-0.891677,0.316819,0.087724,-0.677428,-0.686637,-0.77778,...,0.139384,0.388687,-0.000302,1.710044,0.34252,0.410963,-0.811926,-0.752704,-1.525543,-0.335513
750,SMOTE-10750,1,-0.29931,-1.119379,0.055917,-1.467186,-0.409797,0.62193,0.608374,0.038711,...,1.025909,1.347524,-0.026255,1.265853,-0.827561,-0.503325,1.469225,-1.126742,-1.194729,0.084397
751,SMOTE-10751,1,1.231081,-0.83423,-1.36752,0.620631,0.799833,-0.189566,-0.21825,-0.4848,...,0.486116,0.5273,0.03732,0.872223,0.224272,0.667999,1.179798,-1.263541,0.781706,-0.192169
752,SMOTE-10752,1,-0.588071,-0.748973,-0.438891,-0.141392,1.019764,0.222623,-0.248203,-0.457994,...,0.482778,1.362256,-0.877604,0.255059,0.780206,0.152106,1.055986,-1.575578,-0.556325,-0.164382
753,SMOTE-10753,1,-0.058033,-0.53514,-1.11356,-0.613474,0.273229,-0.564595,0.19582,0.769002,...,0.270333,0.938805,1.109692,-0.133342,-1.186573,0.970733,-0.157676,0.057997,1.626616,-1.347084
754,SMOTE-10754,1,-0.674126,-0.524881,0.013895,0.691384,0.396592,-0.466417,1.196717,2.700897,...,-0.026856,0.340841,-0.346971,0.652152,0.1476,-0.162988,-0.034958,1.225039,-0.530609,-1.265664


## Save results

### Save the oversampled train set

In [46]:
# Persist the oversampled train set as CSV, without writing the row index.
train_res.to_csv(path_or_buf=S4_TRAIN_SET_OUT_OVS, index=False, sep=',', decimal='.')

### Check with the OLD one:

In [51]:
# Oversampled train set stored under the OLD_ file name, used as the reference for the check below.
old_train_res = pd.read_csv(
    "../data/DATASETS_ML/OLD_S4_02_DS_AfterImputing_TrainTest_OVERSAMPLED_n3.csv",
    delimiter=',',
    decimal='.',
)
old_train_res

Unnamed: 0,Pulsating,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,all_times_nhist_peak_2_to_3,...,freq1_rel_phase2,freq1_rel_phase3,freq1_rel_phase4,freq2_rel_phase2,freq2_rel_phase3,freq2_rel_phase4,freq3_rel_phase2,freq3_rel_phase3,freq3_rel_phase4,freq_model_phi1_phi2
0,0,-0.674126,0.519174,0.466681,0.766297,1.786498,-0.304944,0.843252,0.189055,1.390901,...,0.646033,0.908818,1.305379,1.413989,0.174334,-0.188773,0.985693,-0.258841,-1.099919,-0.461571
1,1,-1.626729,1.911247,-0.740748,0.691384,0.168331,1.522002,1.166420,0.157675,0.019744,...,1.532902,-1.224350,0.710232,-1.272791,1.617586,1.392776,0.260283,0.708876,1.030413,0.400968
2,0,-0.039057,-1.012107,0.013895,-0.357397,1.168762,-0.232282,-0.443941,-0.136007,-0.412519,...,0.384058,0.882515,1.044322,-1.204443,-0.593335,-1.011092,0.592500,0.135213,0.725303,-0.319881
3,0,-0.039057,1.632833,-0.514355,0.166993,1.477630,-0.544204,-0.572606,-0.586661,-0.338658,...,1.457441,-0.921750,-1.095322,-0.031960,-0.068737,1.152465,-0.672518,0.391616,-1.301501,0.559262
4,0,0.596012,-0.176863,-1.042605,-0.432310,0.242158,-0.277263,-0.498198,-0.370020,-0.451106,...,-0.829296,-0.057582,0.541269,1.529879,-0.689578,1.475290,-1.268004,1.297212,1.327633,-0.059789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,1,0.473173,-0.668664,0.135630,-0.259518,-1.373200,-0.470559,-0.589896,-0.548709,-0.448373,...,-0.038051,-0.958444,0.386309,-0.805888,0.697947,0.293784,-0.667106,-0.192877,1.224979,-1.394465
1340,1,-0.993662,-0.524004,0.918516,1.589867,-0.372796,-0.377095,-0.318659,-0.118490,-0.120285,...,0.036617,1.595782,0.586577,-0.903790,0.315474,-0.690108,-0.846619,0.819517,-0.407266,-1.583849
1341,1,0.610021,-1.068148,0.492761,0.150468,-0.980256,-0.169826,-0.341273,-0.177918,-0.341825,...,-0.194276,-0.105307,1.257649,-0.697622,-0.195618,-0.123771,-0.670504,0.582847,1.003724,-1.285203
1342,1,1.061356,-0.373715,0.099776,-0.660900,-1.102506,0.339215,-0.083127,0.190607,-0.361995,...,-0.124422,-0.875222,1.033421,-0.335086,-0.822268,-0.909756,-0.608352,-0.050957,1.095146,-1.091260


In [54]:
# Compare the new oversampled set (without the added 'ID' column) with the old one.
# The reference frame is `old_train_res`, loaded in the previous cell; the former name
# `train_res_old` is not defined anywhere in this notebook and fails on a fresh kernel.
np.allclose(train_res.drop(columns=['ID']), old_train_res)

True

## Summary

**RESULTS:**

- We applied the oversampling to the train set, and stored the results.
- We also added the `ID` for the original stars, and created a new identifier for the synthetic samples added by SMOTE.
