# FIX: SET `Pulsating` FIELD TO `True`/`False` AND CORRECT NAME IN ALTERNATIVE S4 SAMPLE

This notebook fixes the values of the `Pulsating` field in the alternative S4 sample, which were incorrectly set to the strings `Pulsating` and `Non-Pulsating` (instead of `True`/`False`) during the creation of the S4 synthetic samples.

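It also corrects the filename (the `ALT` marker is moved from the end of the name to the start) and adds the `ALT_` prefix to every star `ID`, for consistency.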

## Modules and configuration

### Modules

In [1]:
import pandas as pd

### Configuration

In [2]:
# CSV read by this notebook: the alternative S4 sample with the string-valued `Pulsating` column.
INPUT_FILE = "../data/DATASETS_CESIUM/cesium_ML_FINAL_S4_ALT.csv"
# CSV written by this notebook: the corrected data, saved under the new name (with the `ALT_` marker as a prefix).
OUTPUT_FILE = "../data/DATASETS_CESIUM/ALT_cesium_ML_FINAL_S4.csv"

## Load file

In [3]:
# Read the alternative S4 sample (comma-separated, dot as decimal mark)
# and preview its first rows.
df = pd.read_csv(
    INPUT_FILE,
    sep=',',
    decimal='.',
)
df.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,Tobs,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,Star-00000,Pulsating,8.0,0.1,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.977504,0.644803,-0.063853,0.000275,0.92487,0.618087,1.117089,0.765823,2.172233,0.38655
1,Star-00001,Non-Pulsating,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.90737,0.673338,-0.041002,-0.00023,0.954109,0.875214,1.706731,1.360577,2.177206,0.526797
2,Star-00002,Pulsating,8.0,0.2,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.919256,0.698475,-0.041303,0.000604,0.887896,0.729239,1.413223,1.231405,2.078944,0.487139
3,Star-00003,Non-Pulsating,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,1.016947,0.668096,-0.047139,0.000606,1.013569,0.780295,1.715909,1.142045,1.974111,0.637595
4,Star-00004,Pulsating,8.0,0.4,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.840501,0.686223,-0.180135,0.000507,0.893944,1.162059,1.284404,1.307339,1.680354,0.470151


## Transform file

In [4]:
# Convert the textual class label into a real boolean flag.
# The dtype guard makes this cell safe to re-run: once the column is already
# boolean, comparing it against the string 'Pulsating' a second time would
# silently turn every row into False.
if not pd.api.types.is_bool_dtype(df['Pulsating']):
    # Vectorised comparison: True only for the exact label 'Pulsating';
    # every other value (e.g. 'Non-Pulsating') becomes False.
    df['Pulsating'] = df['Pulsating'] == 'Pulsating'
df.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,Tobs,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,Star-00000,True,8.0,0.1,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.977504,0.644803,-0.063853,0.000275,0.92487,0.618087,1.117089,0.765823,2.172233,0.38655
1,Star-00001,False,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.90737,0.673338,-0.041002,-0.00023,0.954109,0.875214,1.706731,1.360577,2.177206,0.526797
2,Star-00002,True,8.0,0.2,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.919256,0.698475,-0.041303,0.000604,0.887896,0.729239,1.413223,1.231405,2.078944,0.487139
3,Star-00003,False,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,1.016947,0.668096,-0.047139,0.000606,1.013569,0.780295,1.715909,1.142045,1.974111,0.637595
4,Star-00004,True,8.0,0.4,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.840501,0.686223,-0.180135,0.000507,0.893944,1.162059,1.284404,1.307339,1.680354,0.470151


In [5]:
# Tag every star ID with the "ALT_" prefix (e.g. "Star-00000" -> "ALT_Star-00000").
# IDs that already carry the prefix are left untouched, so re-running this cell
# does not produce "ALT_ALT_..." identifiers.
ID_PREFIX = "ALT_"
already_prefixed = df['ID'].str.startswith(ID_PREFIX)
df['ID'] = df['ID'].where(already_prefixed, ID_PREFIX + df['ID'])
df.head()

Unnamed: 0,ID,Pulsating,frequency,amplitudeRV,offsetRV,refepochRV,phase,CARMENES_source_idx,CARMENES_Ref_star,Tobs,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,ALT_Star-00000,True,8.0,0.1,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.977504,0.644803,-0.063853,0.000275,0.92487,0.618087,1.117089,0.765823,2.172233,0.38655
1,ALT_Star-00001,False,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.90737,0.673338,-0.041002,-0.00023,0.954109,0.875214,1.706731,1.360577,2.177206,0.526797
2,ALT_Star-00002,True,8.0,0.2,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.919256,0.698475,-0.041303,0.000604,0.887896,0.729239,1.413223,1.231405,2.078944,0.487139
3,ALT_Star-00003,False,0.0,0.0,0.0,0.0,0.0,0,J23505-095,1581.691279,...,1.016947,0.668096,-0.047139,0.000606,1.013569,0.780295,1.715909,1.142045,1.974111,0.637595
4,ALT_Star-00004,True,8.0,0.4,0.0,0.0,0.0,0,J23505-095,1581.691279,...,0.840501,0.686223,-0.180135,0.000507,0.893944,1.162059,1.284404,1.307339,1.680354,0.470151


In [8]:
# Print the full list of column names in the transformed frame.
print(df.columns.tolist())

['ID', 'Pulsating', 'frequency', 'amplitudeRV', 'offsetRV', 'refepochRV', 'phase', 'CARMENES_source_idx', 'CARMENES_Ref_star', 'Tobs', 'Ps_mean', 'Ps_median', 'Ps_stdev', 'NumPoints', 'errorRV_dist_loc', 'errorRV_dist_scale', 'errorRV_mean', 'errorRV_median', 'errorRV_stdev', 'S4_ALT_file', 'VALID_RECORD', 'all_times_nhist_numpeaks', 'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin', 'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4', 'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4', 'all_times_nhist_peak_3_to_4', 'all_times_nhist_peak_val', 'avg_double_to_single_step', 'avg_err', 'avgt', 'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50', 'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000', 'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000', 'cad_probs_1000000', 'cad_probs_5000000', '

## Save file

In [6]:
# Write the corrected sample to the new file name; the positional index is
# not written out.
df.to_csv(
    OUTPUT_FILE,
    sep=',',
    decimal='.',
    index=False,
)

## Summary

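We have fixed the error introduced during the creation of the alternative S4 sample: the `Pulsating` field now holds `True`/`False`, every star `ID` carries the `ALT_` prefix, and the corrected data is saved under the new filename.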