# BULK FEATURE EXTRACTION OF THE REAL RV CURVES WITH `cesium`

In this notebook we do the bulk feature extraction with `cesium` for all the 363 _Carmencita GTO_ objects.

**IMPORTANT NOTE:** this code is probably not very efficient (for example, the results DataFrame is grown one row at a time and rewritten to disk after every record, which is costly), but there is no special need at the moment to be more efficient. A possible improvement is to collect the rows in a list (or a 2D numpy array) and create the DataFrame only once, at the end.

## Modules and configuration

### Modules

In [1]:
# Module import:
import warnings
import time

import pandas as pd
import numpy as np

from cesium.data_management import TimeSeries
from cesium.featurize import featurize_single_ts

### Configuration

In [2]:
# Input data: table of GTO objects and folder with the per-object RV files.
GTO_FILE = '../data/GTO_objects.csv'
RV_FOLDER = '../data/CARMENES_GTO_RVs/'

# Output data: name of the features dataset file and the folder it is written to.
OUT_DATASET_FILE = 'cesium_GTO_Dataset.csv'
CS_FEATURES_FOLDER = '../data/DATASETS_CESIUM/'

# LIST OF STAR METADATA TO ADD (FROM CARMENCITA DATABASE):
# These are column names of the GTO table; for every object, the values of these
# columns are copied into the output dataset, placed before the 'VALID_RECORD'
# flag and the cesium features.
METADATA = ['Karmn', 'SpT', 'SpTnum', 'Teff_K', 'eTeff_K', 'logg', 'elogg', '[Fe/H]', 'e[Fe/H]', 'L_Lsol', 'eL_Lsol',
            'R_Rsol', 'eR_Rsol', 'M_Msol', 'eM_Msol', 'muRA_masa-1', 'emuRA_masa-1', 'muDE_masa-1', 'emuDE_masa-1', 'pi_mas',
            'epi_mas', 'd_pc', 'ed_pc', 'Vr_kms-1', 'eVr_kms-1', 'ruwe', 'U_kms-1', 'eU_kms-1', 'V_kms-1', 'eV_kms-1',
            'W_kms-1', 'eW_kms-1', 'sa_m/s/a', 'esa_m/s/a', 'Pop', 'vsini_flag', 'vsini_kms-1', 'P_d',
            'pEWHalpha_A', 'epEWHalpha_A', 'Activity', 'FUV_mag', 'eFUV_mag', 'NUV_mag', 'eNUV_mag', 'u_mag', 'eu_mag',
            'BT_mag', 'eBT_mag', 'B_mag', 'eB_mag', 'BP_mag', 'eBP_mag', 'g_mag', 'eg_mag', 'VT_mag', 'eVT_mag',
            'V_mag', 'eV_mag', 'Ra_mag', 'r_mag', 'er_mag', 'GG_mag', 'eGG_mag', 'i_mag', 'ei_mag', 'RP_mag', 'eRP_mag',
            'IN_mag', 'J_mag', 'eJ_mag', 'H_mag', 'eH_mag', 'Ks_mag', 'eKs_mag', 'QFlag_2M', 'W1_mag', 'eW1_mag',
            'W2_mag', 'eW2_mag', 'W3_mag', 'eW3_mag', 'W4_mag', 'eW4_mag', 'QFlag_WISE', 'Multiplicity',
            'Planet', 'PlanetNum', 'Teff_min_K', 'Teff_max_K', 'logg_min', 'logg_max', 'is_GTO',
            'InstBand_nominal', 'InstBand_ranged']

# LIST OF THE CESIUM FEATURES TO EXTRACT (112 NAMES).
# Intended to cover all the features cesium can extract (not verified here against the
# installed cesium version). This list is not only for reference: it is passed as
# `features_to_use` to `featurize_single_ts`, and it also provides the feature column
# names for the records that are marked as invalid.
ALL_CS_FEATURES = ['all_times_nhist_numpeaks',
                   'all_times_nhist_peak1_bin', 'all_times_nhist_peak2_bin', 'all_times_nhist_peak3_bin', 'all_times_nhist_peak4_bin',
                   'all_times_nhist_peak_1_to_2', 'all_times_nhist_peak_1_to_3', 'all_times_nhist_peak_1_to_4',
                   'all_times_nhist_peak_2_to_3', 'all_times_nhist_peak_2_to_4',
                   'all_times_nhist_peak_3_to_4',
                   'all_times_nhist_peak_val',
                   'avg_double_to_single_step', 'avg_err', 'avgt',
                   'cad_probs_1', 'cad_probs_10', 'cad_probs_20', 'cad_probs_30', 'cad_probs_40', 'cad_probs_50',
                   'cad_probs_100', 'cad_probs_500', 'cad_probs_1000', 'cad_probs_5000',
                   'cad_probs_10000', 'cad_probs_50000', 'cad_probs_100000', 'cad_probs_500000',
                   'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000',
                   'cads_avg', 'cads_med', 'cads_std', 'mean',
                   'med_double_to_single_step', 'med_err',
                   'n_epochs', 'std_double_to_single_step', 'std_err',
                   'total_time', 'amplitude',
                   'flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid35', 'flux_percentile_ratio_mid50',
                   'flux_percentile_ratio_mid65', 'flux_percentile_ratio_mid80',
                   'max_slope', 'maximum', 'median', 'median_absolute_deviation', 'minimum',
                   'percent_amplitude', 'percent_beyond_1_std', 'percent_close_to_median', 'percent_difference_flux_percentile',
                   'period_fast', 'qso_log_chi2_qsonu', 'qso_log_chi2nuNULL_chi2nu', 'skew', 'std',
                   'stetson_j', 'stetson_k', 'weighted_average', 'fold2P_slope_10percentile', 'fold2P_slope_90percentile',
                   'freq1_amplitude1', 'freq1_amplitude2', 'freq1_amplitude3', 'freq1_amplitude4',
                   'freq1_freq', 'freq1_lambda', 'freq1_rel_phase2', 'freq1_rel_phase3', 'freq1_rel_phase4', 'freq1_signif',
                   'freq2_amplitude1', 'freq2_amplitude2', 'freq2_amplitude3', 'freq2_amplitude4',
                   'freq2_freq', 'freq2_rel_phase2', 'freq2_rel_phase3', 'freq2_rel_phase4',
                   'freq3_amplitude1', 'freq3_amplitude2', 'freq3_amplitude3', 'freq3_amplitude4',
                   'freq3_freq', 'freq3_rel_phase2', 'freq3_rel_phase3', 'freq3_rel_phase4',
                   'freq_amplitude_ratio_21', 'freq_amplitude_ratio_31',
                   'freq_frequency_ratio_21', 'freq_frequency_ratio_31',
                   'freq_model_max_delta_mags', 'freq_model_min_delta_mags', 'freq_model_phi1_phi2',
                   'freq_n_alias', 'freq_signif_ratio_21', 'freq_signif_ratio_31',
                   'freq_varrat', 'freq_y_offset', 'linear_trend', 'medperc90_2p_p',
                   'p2p_scatter_2praw', 'p2p_scatter_over_mad', 'p2p_scatter_pfold_over_mad', 'p2p_ssqr_diff_over_var',
                   'scatter_res_raw']


## Load GTO information table

In [3]:
# Read the table of GTO objects (comma-separated, '.' as decimal mark) and preview it:
gto = pd.read_csv(
    GTO_FILE,
    decimal='.',
    sep=',',
)
gto.head()

Unnamed: 0,Karmn,Name,Comp,GJ,RA_J2016_deg,DE_J2016_deg,RA_J2000,DE_J2000,l_J2016_deg,b_J2016_deg,...,Teff_max_K,logg_min,logg_max,is_GTO,rv_file,lc_file,has_rv,has_lc,InstBand_nominal,InstBand_ranged
0,J23585+076,Wolf 1051,AB,4383.0,359.63642,7.656947,23:58:32.65,+07:39:30.1,100.839686,-52.931049,...,3516.0,4.89,5.05,True,../data/CARMENES_GTO_RVs/J23585+076.dat,,True,False,He3-burning,He3-burning
1,J23556-061,GJ 912,AB,912.0,358.913617,-6.144283,23:55:39.78,-06:08:33.4,88.129933,-65.175491,...,3669.0,4.7,4.98,True,../data/CARMENES_GTO_RVs/J23556-061.dat,,True,False,He3-burning,He3-burning
2,J23548+385,RX J2354.8+3831,-,,358.713658,38.52634,23:54:51.46,+38:31:36.2,110.941908,-23.024449,...,3279.0,5.03,5.23,True,../data/CARMENES_GTO_RVs/J23548+385.dat,../data/CARMENES_GTO_TESS_lc/lightcurves/J2354...,True,True,none,none
3,J23505-095,LP 763-012,-,4367.0,357.634705,-9.560964,23:50:31.64,-09:33:32.7,80.777067,-67.303426,...,3411.0,4.73,4.93,True,../data/CARMENES_GTO_RVs/J23505-095.dat,<ambiguous>,True,True,He3-burning,He3-burning
4,J23492+024,BR Psc,-,908.0,357.306604,2.396918,23:49:12.53,+02:24:04.4,93.567467,-56.885396,...,3596.0,4.81,5.07,True,../data/CARMENES_GTO_RVs/J23492+024.dat,,True,False,He3-burning,He3-burning


In [4]:
# Show the names of all the columns available in the GTO table:
gto_column_names = gto.columns.tolist()
print(gto_column_names)

['Karmn', 'Name', 'Comp', 'GJ', 'RA_J2016_deg', 'DE_J2016_deg', 'RA_J2000', 'DE_J2000', 'l_J2016_deg', 'b_J2016_deg', 'Ref01', 'SpT', 'SpTnum', 'Ref02', 'Teff_K', 'eTeff_K', 'logg', 'elogg', '[Fe/H]', 'e[Fe/H]', 'Ref03', 'L_Lsol', 'eL_Lsol', 'Ref04', 'R_Rsol', 'eR_Rsol', 'Ref05', 'M_Msol', 'eM_Msol', 'Ref06', 'muRA_masa-1', 'emuRA_masa-1', 'muDE_masa-1', 'emuDE_masa-1', 'Ref07', 'pi_mas', 'epi_mas', 'Ref08', 'd_pc', 'ed_pc', 'Ref09', 'Vr_kms-1', 'eVr_kms-1', 'Ref10', 'ruwe', 'Ref11', 'U_kms-1', 'eU_kms-1', 'V_kms-1', 'eV_kms-1', 'W_kms-1', 'eW_kms-1', 'Ref12', 'sa_m/s/a', 'esa_m/s/a', 'Ref13', 'SKG', 'Ref14', 'SKG_lit', 'Ref14_lit', 'Pop', 'Ref15', 'vsini_flag', 'vsini_kms-1', 'evsini_kms-1', 'Ref16', 'P_d', 'eP_d', 'Ref17', 'pEWHalpha_A', 'epEWHalpha_A', 'Ref18', 'log(LHalpha/Lbol)', 'elog(LHalpha/Lbol)', 'Ref19', '1RXS', 'CRT_s-1', 'eCRT_s-1', 'HR1', 'eHR1', 'HR2', 'eHR2', 'Flux_X_E-13_ergcm-2s-1', 'eFlux_X_E-13_ergcm-2s-1', 'LX/LJ', 'eLX/LJ', 'Ref20', 'Activity', 'Ref21', 'FUV_mag',

## Feature extraction with `cesium` for the real RV curves

In [37]:
# Bulk feature extraction: for every record of `gto`, load its RV curve, extract the
# cesium features and store one row (metadata + 'VALID_RECORD' flag + features) in `df`.
# The dataset is written to disk after every record, so the cell can be interrupted and
# re-run: it resumes at the first record that is not yet stored.

# DISABLE WARNINGS:
# (this silences all warnings for the rest of the kernel session, not only for this cell)
warnings.filterwarnings('ignore')
# Batch processing:
lapse_list = []
median_lapse = None
# Initialize features dataframe and metafeatures (from disk, or new):
try:
    df = pd.read_csv(CS_FEATURES_FOLDER + OUT_DATASET_FILE, sep=',', decimal='.')
    i0 = len(df)
    print("Previous result found, will continue at record %d..." %len(df))
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing (or empty) results file means "start from scratch". Any other
    # problem (e.g. a corrupt file) is raised instead of silently discarding and
    # overwriting the results already stored.
    print("No previous results found, initializing dataframe...")
    df = None
    i0=0
metadata_idx = METADATA
# Number of feature columns, used to fill the invalid records with 'nan':
n_features = len(ALL_CS_FEATURES)
#for i in range(0, 3): # TEST
for i in range(i0, len(gto)):
    start_time = time.time()
    print("Record: %d, started at %s..."
          %(i, time.strftime('%d/%m/%Y, %H:%M:%S', time.localtime(start_time))))
    if median_lapse is None:
        print("Previous median lapse time: %s" %median_lapse)
    else:
        print("Previous median lapse time: %.2f seconds" %median_lapse)
    # Get metafeatures values:
    metadata_values = list(gto.loc[i, metadata_idx])
    try:
        # load RV file:
        rv = pd.read_csv(gto.loc[i, 'rv_file'], sep=' ', decimal='.',
                         names=['time', 'rv', 'error_rv'])
        # Create TimeSeries object:
        ts = TimeSeries(t=rv['time'], m=rv['rv'], e=rv['error_rv'])
        # Featurize the time series:
        cs = featurize_single_ts(ts, features_to_use=ALL_CS_FEATURES)
        # Join metadata and features for the dataframe:
        indices = metadata_idx + ['VALID_RECORD'] + list(cs.index.get_level_values('feature'))
        values = metadata_values + [True] + list(cs.values)
    except Exception as e:
        # An exception was found, mark the record as invalid and set the features to 'nan'.
        # ('Exception' does not include 'KeyboardInterrupt', so a user interruption
        # still stops the loop.)
        print("***ERROR: some error happened in record %d, marking the record as invalid. Error: %s" %(i, str(e)))
        indices = metadata_idx + ['VALID_RECORD'] + ALL_CS_FEATURES
        values = metadata_values + [False] + [np.nan] * n_features
    # Create a one-row DataFrame with the new item:
    new_df = pd.DataFrame(data=[values], columns=indices)
    if df is None:
        # Initialize DataFrame (with the first item):
        df = new_df
    else:
        # Add the new row to the existing dataframe ('DataFrame.append' was removed
        # in pandas 2.0, 'pd.concat' is its replacement):
        df = pd.concat([df, new_df], ignore_index=True)
    # UPDATE THE AVERAGE RECORD PROCESSING TIME:
    lapse = time.time() - start_time
    lapse_list.append(lapse)
    median_lapse = np.nanmedian(lapse_list)
    # Save the results (after every record, so that an interrupted run can be resumed):
    df.to_csv(CS_FEATURES_FOLDER + OUT_DATASET_FILE, sep=',', decimal='.', index=False)


Previous result found, will continue at record 345...
Record: 345, started at 28/04/2022, 12:09:57...
Previous median lapse time: None
Record: 346, started at 28/04/2022, 12:09:58...
Previous median lapse time: 0.19 seconds
Record: 347, started at 28/04/2022, 12:09:58...
Previous median lapse time: 0.40 seconds
Record: 348, started at 28/04/2022, 12:09:58...
Previous median lapse time: 0.19 seconds
Record: 349, started at 28/04/2022, 12:09:59...
Previous median lapse time: 0.18 seconds
Record: 350, started at 28/04/2022, 12:09:59...
Previous median lapse time: 0.19 seconds
Record: 351, started at 28/04/2022, 12:10:00...
Previous median lapse time: 0.18 seconds
Record: 352, started at 28/04/2022, 12:10:00...
Previous median lapse time: 0.19 seconds
Record: 353, started at 28/04/2022, 12:10:01...
Previous median lapse time: 0.28 seconds
Record: 354, started at 28/04/2022, 12:10:01...
Previous median lapse time: 0.19 seconds
Record: 355, started at 28/04/2022, 12:10:01...
Previous median 

### Next steps are to be executed only if the cell execution is user-interrupted

For example, if the user decided to interrupt the cell execution because it got stuck in some record, the next cells update the info for that record with an "invalid record" mark.

Afterwards, the loop (previous cell) can be executed again and it will start from the record following the problematic one.

In [30]:
# Loop index left over from the extraction cell, i.e. the record it was working on
# when the execution stopped:
i

344

In [31]:
# Last rows stored so far (to check which record was the last one saved):
df.tail()

Unnamed: 0,Karmn,SpT,SpTnum,Teff_K,eTeff_K,logg,elogg,[Fe/H],e[Fe/H],L_Lsol,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
339,J02015+637,M3.0 V,3.0,3579.0,23.0,4.87,0.11,-0.08,0.08,0.026266,...,0.95651,0.049057,0.013105,0.001781,1.427002,0.77838,0.785754,0.905758,1.363885,0.295803
340,J02002+130,M3.5: V,3.5,3237.0,32.0,5.22,0.14,-0.25,0.14,0.002513,...,0.758974,0.428651,-3.106295,-0.00062,1.307408,1.0,0.570625,0.586252,1.031757,0.376693
341,J01550+379,M5.0 V,5.0,3100.0,50.0,4.5,0.25,0.0,,0.003569,...,1.375437,0.198762,-0.249762,0.098879,0.688378,1.385859,2.0,2.0,2.303215,0.540301
342,J01518+644,M2.5 V,2.5,3625.0,21.0,5.04,0.1,-0.01,0.05,0.03301,...,1.042462,0.077251,-0.094803,0.001594,2.590766,0.574208,1.216553,1.297595,1.738791,0.210933
343,J01433+043,M2.0 V,2.0,3547.0,20.0,5.1,0.17,-0.09,0.07,0.022719,...,1.004721,0.000752,-0.597323,0.004073,1.445594,1.132954,0.742754,0.694564,1.021603,0.071396


In [32]:
# Row of the GTO table for record `i`:
gto.loc[i]

Karmn                 J01352-072
Name                Barta 161 12
Comp                           -
GJ                           NaN
RA_J2016_deg           23.808453
                        ...     
lc_file              <ambiguous>
has_rv                      True
has_lc                      True
InstBand_nominal            none
InstBand_ranged             none
Name: 344, Length: 186, dtype: object

In [33]:
# Update the wrong record: add a row for record `i` flagged as invalid
# ('VALID_RECORD' = False) with all the cesium features set to 'nan'.
# The metadata are read again from `gto` for record `i`, instead of trusting the value
# of `metadata_values` left over from the interrupted loop (which could still belong to
# the previous record).
metadata_values = list(gto.loc[i, metadata_idx])
indices = metadata_idx + ['VALID_RECORD'] + ALL_CS_FEATURES
values = metadata_values + [False] + [np.nan] * len(ALL_CS_FEATURES)
new_df = pd.DataFrame(data=[values], columns=indices)
# Number of records already stored; the loop resumes at `len(df)`, so record `i`
# is missing only when exactly `i` rows are stored:
n_stored = 0 if df is None else len(df)
if n_stored != i:
    # Running this cell twice (or after the row was already stored) would add a
    # duplicated row and shift every later record, so do nothing in that case.
    print("Record %d not added: the dataframe holds %d records, expected %d." %(i, n_stored, i))
elif df is None:
    df = new_df
else:
    # 'DataFrame.append' was removed in pandas 2.0, 'pd.concat' is its replacement:
    df = pd.concat([df, new_df], ignore_index=True)


In [34]:
# Last rows again, to check the row that was just added:
df.tail()

Unnamed: 0,Karmn,SpT,SpTnum,Teff_K,eTeff_K,logg,elogg,[Fe/H],e[Fe/H],L_Lsol,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
340,J02002+130,M3.5: V,3.5,3237.0,32.0,5.22,0.14,-0.25,0.14,0.002513,...,0.758974,0.428651,-3.106295,-0.00062,1.307408,1.0,0.570625,0.586252,1.031757,0.376693
341,J01550+379,M5.0 V,5.0,3100.0,50.0,4.5,0.25,0.0,,0.003569,...,1.375437,0.198762,-0.249762,0.098879,0.688378,1.385859,2.0,2.0,2.303215,0.540301
342,J01518+644,M2.5 V,2.5,3625.0,21.0,5.04,0.1,-0.01,0.05,0.03301,...,1.042462,0.077251,-0.094803,0.001594,2.590766,0.574208,1.216553,1.297595,1.738791,0.210933
343,J01433+043,M2.0 V,2.0,3547.0,20.0,5.1,0.17,-0.09,0.07,0.022719,...,1.004721,0.000752,-0.597323,0.004073,1.445594,1.132954,0.742754,0.694564,1.021603,0.071396
344,J01352-072,M4.0 V,4.0,3052.0,12.0,4.73,0.15,-0.02,0.08,0.04775,...,,,,,,,,,,


In [35]:
# Write the dataset to disk, so that the extraction loop resumes after the fixed record:
out_dataset_path = CS_FEATURES_FOLDER + OUT_DATASET_FILE
df.to_csv(out_dataset_path, index=False, sep=',', decimal='.')


In [36]:
# First rows of the features dataset:
df.head()

Unnamed: 0,Karmn,SpT,SpTnum,Teff_K,eTeff_K,logg,elogg,[Fe/H],e[Fe/H],L_Lsol,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
0,J23585+076,M3.0 V,3.0,3496.0,20.0,4.97,0.08,-0.06,0.06,0.029308,...,0.720716,0.066449,-36.935032,-3.130905,3.234152,7.193835,0.098248,0.252732,0.063294,0.079189
1,J23556-061,M2.5 V,2.5,3639.0,30.0,4.84,0.14,-0.02,0.08,0.046061,...,0.6648,0.006089,-51.984633,-0.718425,5.582731,1.0,0.143169,0.143169,0.053713,0.038826
2,J23548+385,M4.0 V,4.0,3263.0,16.0,5.13,0.1,-0.55,0.09,0.010424,...,0.948789,0.34326,-2.149188,-0.054085,,1.07728,1.567387,1.073014,1.917366,0.169913
3,J23505-095,M4.0 V,4.0,3377.0,34.0,4.83,0.1,-0.08,0.1,0.010298,...,0.809719,0.498257,-2.800944,9.6e-05,0.793332,1.495594,1.052292,1.496514,0.894181,0.684958
4,J23492+024,M1.0 V,1.0,3573.0,23.0,4.94,0.13,-0.55,0.08,0.025559,...,0.801223,0.598726,-0.009706,0.001959,0.930585,1.783765,0.723985,1.192476,1.009627,0.744844


## Review the records with errors

In [38]:
# Rows whose 'VALID_RECORD' flag is False (no features could be stored for them):
invalid_mask = df['VALID_RECORD'].eq(False)
df.loc[invalid_mask]

Unnamed: 0,Karmn,SpT,SpTnum,Teff_K,eTeff_K,logg,elogg,[Fe/H],e[Fe/H],L_Lsol,...,freq_signif_ratio_31,freq_varrat,freq_y_offset,linear_trend,medperc90_2p_p,p2p_scatter_2praw,p2p_scatter_over_mad,p2p_scatter_pfold_over_mad,p2p_ssqr_diff_over_var,scatter_res_raw
92,J17572+707,M7.5 V,7.5,2600.0,50.0,5.0,0.5,0.0,,0.001092,...,,,,,,,,,,
99,J17198+417,M2.5 V,2.5,3540.0,20.0,4.96,0.12,-0.2,0.07,0.018012,...,,,,,,,,,,
115,J16102-193,M3.0 V,3.0,3575.0,55.0,4.48,0.19,-0.02,0.07,0.102624,...,,,,,,,,,,
270,J06318+414,M5.0 V,5.0,3084.0,13.0,4.88,0.07,-0.03,0.05,0.01338,...,,,,,,,,,,
273,J06103+821,M2.0 V,2.0,3554.0,20.0,4.95,0.13,-0.16,0.07,0.02409,...,,,,,,,,,,
309,J04173+088,M4.5 V,4.5,3100.0,50.0,5.5,0.25,0.0,,0.00656,...,,,,,,,,,,
344,J01352-072,M4.0 V,4.0,3052.0,12.0,4.73,0.15,-0.02,0.08,0.04775,...,,,,,,,,,,


In [42]:
# Karmn identifiers of the records flagged as invalid:
invalid_karmn = df.loc[df['VALID_RECORD'].eq(False), 'Karmn'].tolist()
print(invalid_karmn)

['J17572+707', 'J17198+417', 'J16102-193', 'J06318+414', 'J06103+821', 'J04173+088', 'J01352-072']


## Summary

**CONCLUSIONS:**
- Completed the `cesium` feature extraction of the _GTO_ objects.
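- Notice that 7 objects ended up flagged as invalid records (`VALID_RECORD == False`, all features set to `nan`), either because the feature extraction raised an error or because the record was marked manually after an interrupted run (e.g. J01352-072):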
  - J17572+707
  - J17198+417
  - J16102-193
  - J06318+414
  - J06103+821
  - J04173+088
  - J01352-072
