# GENERATION OF SAMPLING PATTERN AND NOISE DISTRIBUTIONS POOLS FROM THE PG SUBSAMPLE

In this notebook we generate the pools of sampling patterns and noise distributions from the PG subsample of CARMENES RV curves that we will use to modify the basic synthetic dataset to get a modeled dataset as similar as possible to the real PG subsample dataset.

## Modules and configuration

### Modules

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json

from IPython.display import clear_output

from scipy.stats import pearsonr, ttest_ind, normaltest

from distfit import distfit

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white", {'figure.figsize':(15,10)})

### Configuration

In [2]:
GTO_FILE = "../data/SELECTION_for_PG_CARM_VIS_objects_with_PG.csv"
DIST_FILES_FOLDER = "../data/DIST_FILES/"
NOISE_DIST_FOLDER = "../data/DIST_FILES/RV_PG_subsample_Stars_noise_dist/"

RV_PATTERN_POOL_FILE = "RV_PG_subsample_sampling_patterns_and_noise.csv"

IMAGE_FOLDER = "./img/"

RANDOM_STATE = 11 # For reproducibility

N_TRIALS = 10 # Number of random noise samples to generate to test the goodness of fit.
ALPHA = 1e-3 # Significance threshold for statistical tests

### Functions

In [3]:
def noise_fit_goodness(source_data, fitted_dist, num_trials, src_min=None, src_max=None):
    '''Returns a median of the p-value for a given number of T-test independence values,
    calculated between a source data and asample from a distribution fitted to that
    source data. Also return a median of the generated values that fall outside the range
    of the source data.'''
    
    # Code to silence output is from Alex Martelli and Nick T at:
    # https://stackoverflow.com/questions/2828953/silence-the-stdout-of-a-function-in-python-without-trashing-sys-stdout-and-resto
    # Redirect output to a file
    save_stdout = sys.stdout
    sys.stdout = open('trash', 'w')

    p_values = []
    oor_values = []
    if src_min is None:
        src_min = source_data.min()
    if src_max is None:
        src_max = source_data.max()
    for trial in range(0, num_trials):
        rnd_values = fitted_dist.generate(n=len(source_data), random_state=RANDOM_STATE)
        #_, p = ttest_ind(source_data, rnd_values)
        #oor_fraction = (sum(rnd_values < src_min) + sum(rnd_values > src_max)) / len(rnd_values)
        #p_values.append(p)
        #oor_values.append(oor_fraction)
        oor_fraction = (sum(rnd_values < src_min) + sum(rnd_values > src_max)) / len(rnd_values)
        oor_values.append(oor_fraction)
        #_, p = ttest_ind(source_data, rnd_values)
        _, ks_p = kstest(source_data, rnd_values)
        p_values.append(ks_p)
    
    # Redirect output to screen again, and delete the file.
    sys.stdout = save_stdout
    os.remove('trash')
    
    return np.nanmedian(p_values), np.nanmedian(oor_values)
    

In [4]:
def rv_extract_sampling_and_noise(rv_filename):
    '''Reads a CARMENES RV curve file and returns the sampling pattern as a list of time differences,
    referred to the time of the initial sample point, and the best fit for the noise distribution,
    as well as noise statistics and estimated goodness of fit for the noise'''
    #if True: # TEST
    try:
        rv_ts = pd.read_csv(rv_filename, sep=' ', decimal='.', names=['time', 'rv', 'rv_error'])
        # Extract sampling pattern:
        ref_time = rv_ts['time'].min()
        sampling_pattern = (rv_ts['time'] - ref_time).tolist()
        # Calculate noise statistics:
        rv_error_median = np.nanmedian(rv_ts['rv_error'])
        rv_error_mean = np.nanmean(rv_ts['rv_error'])
        rv_error_stdev = np.nanstd(rv_ts['rv_error'])
        rv_error_min = np.nanmin(rv_ts['rv_error'])
        rv_error_max = np.nanmax(rv_ts['rv_error'])
        # Fit best noise distribution:
        dist = distfit()
        dist.fit_transform(pd.Series(rv_ts['rv_error']), verbose=0)
        #if True: # TEST
        try:
            fit_p_value, fit_outliers_fraction = \
                noise_fit_goodness(source_data=rv_ts['rv_error'], fitted_dist=dist, num_trials=N_TRIALS,
                                   src_min=rv_error_min, src_max=rv_error_max)
        #else: # TEST
        except:
            fit_p_value, fit_outliers_fraction = 0.0, np.nan
    #else: # TEST
    except Exception as e:
        print("***ERROR: an error happened with file %s. Error: %s" %(rv_filename, str(e)))
        sampling_pattern = None
        fit_p_value = 0.0
    return sampling_pattern, rv_error_min, rv_error_max, rv_error_median, rv_error_mean, rv_error_stdev, \
        dist, fit_p_value, fit_outliers_fraction

In [5]:
def draw_boxplot(data, x, y, fig_title, x_label, y_label, showfliers=None, fig_filename=None):
    '''Draws a boxplot according to the arguments passed.'''
    # Plots and saves the figure
    #kwargs = dict(hist_kws={'alpha':.4}, kde_kws={'linewidth':2})
    plt.figure(figsize=(10,7))
    sns.boxplot(data=data, x=x, y=y, showfliers=showfliers)
    plt.title(fig_title, fontsize=16)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    #plt.grid(axis='both', alpha=0.75)
    if fig_filename is None:
        pass
    else:
        plt.savefig(fig_filename, format='png')


In [6]:
def scatter_graph(data, var_x, var_y, fig_title, x_label, y_label, fig_filename=None):
    '''Plots a scatter plot of two columns of a dataframe.
    It also calculates and shows Pearson's correlation coefficient.'''
    #rv_x_var = var_x
    #tess_y_var = var_y
    #kwargs = dict(hist_kws={'alpha':.4}, kde_kws={'linewidth':2})
    plt.figure(figsize=(10,7))
    plt.title(fig_title, fontsize=16)
    plt.grid(axis='both', alpha=0.75)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    sns.scatterplot(data=data, x=var_x, y=var_y)
#    plt.xlim(0.0, 1.0)
#    plt.ylim(0.0, 1.0)    
    plt.legend(loc='upper right');
    # Save the image:
    if fig_filename is None:
        pass
    else:
        plt.savefig(fig_filename, format='png')
    # Calculate the Pearson correlation coefficient between the two results:
    corr, _ = pearsonr(data[var_x], data[var_y])
    print('Pearson\'s correlation value: %.3f' % corr)

In [7]:
def draw_hist(d1, d1_label, fig_filename=None):
    '''Draws an histogram of the values passed. It also calulates the statistics
    and tests the goodness of fit by a Gaussian distribution'''
    # Plots and saves the figure
    kwargs = dict(hist_kws={'alpha':.4}, kde_kws={'linewidth':2})
    plt.figure(figsize=(10,7))
    plt.title(d1_label + " values distribution", fontsize=16)
    plt.grid(axis='both', alpha=0.75)
    plt.xlabel(d1_label, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    sns.distplot(d1, label=d1_label, **kwargs)
    #sns.histplot(d1, label=d1_label)
    #plt.xlim(0,100)
    #plt.legend(loc='upper right');
    # Save the image:
    if fig_filename is None:
        pass
    else:
        plt.savefig(fig_filename, format='png')
    # Display basic statistics:
    print(pd.Series(d1).describe())
    # Perform the normaltest (D'Agostino and Pearson) for Gaussian distribution fit:
    k2, p = normaltest(d1)
    alpha = ALPHA
    print("p = {:g}".format(p))
    if p < alpha:  # null hypothesis: x comes from a normal distribution
        print("***This distribution does NOT come from a Gaussian distribution" \
              "(with significance level = %.6f)" %(1.0 - alpha))
    else:
        print("This distribution MIGHT COME come from Gaussian distribution" \
              "(with significance level = %.6f)" %(1.0 - alpha))

## Load the PG subsample file

In [8]:
gto = pd.read_csv(GTO_FILE, sep=',', decimal='.')
gto.head()

Unnamed: 0,Karmn,Name,Comp,GJ,RA_J2016_deg,DE_J2016_deg,RA_J2000,DE_J2000,l_J2016_deg,b_J2016_deg,...,WF_offset_PG_TESS,WF_e_offset_PG_TESS,WF_FAP_PG_TESS,WF_valid_PG_TESS,WF_error_PG_TESS,WF_elapsed_time_PG_TESS,WF_plain_file_TESS,WF_fig_file_TESS,PG_file_RV,PG_file_TESS
0,J23548+385,RX J2354.8+3831,-,,358.713658,38.52634,23:54:51.46,+38:31:36.2,110.941908,-23.024449,...,999.999756,2.151008e-06,1.0,1.0,,94.758838,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23548+38...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23548+385_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23548+385_RV_PG.dat
1,J23505-095,LP 763-012,-,4367.0,357.634705,-9.560964,23:50:31.64,-09:33:32.7,80.777067,-67.303426,...,1000.000122,9.022946e-07,1.0,1.0,,132.607176,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23505-09...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23505-095_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23505-095_RV_PG.dat
2,J23431+365,GJ 1289,-,1289.0,355.781509,36.53631,23:43:06.31,+36:32:13.1,107.922839,-24.336479,...,999.999512,4.306074e-06,1.0,1.0,,97.939914,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23431+36...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23431+365_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23431+365_RV_PG.dat
3,J23381-162,G 273-093,-,4352.0,354.532687,-16.236514,23:38:08.16,-16:14:10.2,61.845437,-69.82522,...,1000.000122,9.022946e-07,1.0,1.0,,136.603404,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23381-16...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23381-162_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23381-162_RV_PG.dat
4,J23245+578,BD+57 2735,-,895.0,351.126628,57.853057,23:24:30.51,+57:51:15.5,111.552287,-3.085183,...,999.999512,3.720858e-06,1.0,1.0,,131.327304,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23245+57...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23245+578_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23245+578_RV_PG.dat


### Clean the object list

To be on the safe side, we remove the objects that gave trouble during the RV periodogram calculations:

In [9]:
gto.loc[gto['valid_PG_RV'] == 0, 'Karmn']

Series([], Name: Karmn, dtype: object)

In [10]:
gto = gto[gto['valid_PG_RV'] == 1].reset_index().copy()
gto.head()

Unnamed: 0,index,Karmn,Name,Comp,GJ,RA_J2016_deg,DE_J2016_deg,RA_J2000,DE_J2000,l_J2016_deg,...,WF_offset_PG_TESS,WF_e_offset_PG_TESS,WF_FAP_PG_TESS,WF_valid_PG_TESS,WF_error_PG_TESS,WF_elapsed_time_PG_TESS,WF_plain_file_TESS,WF_fig_file_TESS,PG_file_RV,PG_file_TESS
0,0,J23548+385,RX J2354.8+3831,-,,358.713658,38.52634,23:54:51.46,+38:31:36.2,110.941908,...,999.999756,2.151008e-06,1.0,1.0,,94.758838,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23548+38...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23548+385_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23548+385_RV_PG.dat
1,1,J23505-095,LP 763-012,-,4367.0,357.634705,-9.560964,23:50:31.64,-09:33:32.7,80.777067,...,1000.000122,9.022946e-07,1.0,1.0,,132.607176,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23505-09...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23505-095_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23505-095_RV_PG.dat
2,2,J23431+365,GJ 1289,-,1289.0,355.781509,36.53631,23:43:06.31,+36:32:13.1,107.922839,...,999.999512,4.306074e-06,1.0,1.0,,97.939914,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23431+36...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23431+365_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23431+365_RV_PG.dat
3,3,J23381-162,G 273-093,-,4352.0,354.532687,-16.236514,23:38:08.16,-16:14:10.2,61.845437,...,1000.000122,9.022946e-07,1.0,1.0,,136.603404,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23381-16...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23381-162_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23381-162_RV_PG.dat
4,4,J23245+578,BD+57 2735,-,895.0,351.126628,57.853057,23:24:30.51,+57:51:15.5,111.552287,...,999.999512,3.720858e-06,1.0,1.0,,131.327304,../data/CARM_VIS_TESS_WinFunc_PGs/WF_J23245+57...,../data/CARM_VIS_TESS_WinFunc_PGs/figures/WF_J...,../data/CARM_VIS_RVs_PGs/J23245+578_RV_PG.dat,../data/CARM_VIS_TESS_PGs/J23245+578_RV_PG.dat


In [11]:
gto.shape

(269, 301)

## CARMENES PG subsample RV sampling patterns and noise distributions pool

### Populate the pools

Notice that we also trasfer some of the original data from the star, so that calculating some possible correlations later on will be easier.

In [12]:
print(gto.columns.to_list())

['index', 'Karmn', 'Name', 'Comp', 'GJ', 'RA_J2016_deg', 'DE_J2016_deg', 'RA_J2000', 'DE_J2000', 'l_J2016_deg', 'b_J2016_deg', 'Ref01', 'SpT', 'SpTnum', 'Ref02', 'Teff_K', 'eTeff_K', 'logg', 'elogg', '[Fe/H]', 'e[Fe/H]', 'Ref03', 'L_Lsol', 'eL_Lsol', 'Ref04', 'R_Rsol', 'eR_Rsol', 'Ref05', 'M_Msol', 'eM_Msol', 'Ref06', 'muRA_masa-1', 'emuRA_masa-1', 'muDE_masa-1', 'emuDE_masa-1', 'Ref07', 'pi_mas', 'epi_mas', 'Ref08', 'd_pc', 'ed_pc', 'Ref09', 'Vr_kms-1', 'eVr_kms-1', 'Ref10', 'ruwe', 'Ref11', 'U_kms-1', 'eU_kms-1', 'V_kms-1', 'eV_kms-1', 'W_kms-1', 'eW_kms-1', 'Ref12', 'sa_m/s/a', 'esa_m/s/a', 'Ref13', 'SKG', 'Ref14', 'SKG_lit', 'Ref14_lit', 'Pop', 'Ref15', 'vsini_flag', 'vsini_kms-1', 'evsini_kms-1', 'Ref16', 'P_d', 'eP_d', 'Ref17', 'pEWHalpha_A', 'epEWHalpha_A', 'Ref18', 'log(LHalpha/Lbol)', 'elog(LHalpha/Lbol)', 'Ref19', '1RXS', 'CRT_s-1', 'eCRT_s-1', 'HR1', 'eHR1', 'HR2', 'eHR2', 'Flux_X_E-13_ergcm-2s-1', 'eFlux_X_E-13_ergcm-2s-1', 'LX/LJ', 'eLX/LJ', 'Ref20', 'Activity', 'Ref21', '

In [13]:
print(gto['Activity'].unique().tolist())

['RV-loud', nan, 'Flare?', 'Flare - RV-loud', 'Flare? - RV-loud', 'Flare']


In [14]:
sum(gto['Activity'].isna())

199

In [15]:
# Initialize the results array
rv_patterns = pd.DataFrame(columns=['Karmn', 'Vr_kms-1', 'n_RV', 'Ps_RV', 'wrms_RV',
                                    'RV_sampling_deltas',
                                    'RV_noise_min', 'RV_noise_max',
                                    'RV_noise_median', 'RV_noise_mean', 'RV_noise_stdev',
                                    'RV_noise_dist', 'RV_noise_name', 'RV_noise_loc', 'RV_noise_scale',
                                    'RV_noise_dist_file', 'RV_noise_fit_p-value', 'Fraction_Out'])
rv_patterns

Unnamed: 0,Karmn,Vr_kms-1,n_RV,Ps_RV,wrms_RV,RV_sampling_deltas,RV_noise_min,RV_noise_max,RV_noise_median,RV_noise_mean,RV_noise_stdev,RV_noise_dist,RV_noise_name,RV_noise_loc,RV_noise_scale,RV_noise_dist_file,RV_noise_fit_p-value,Fraction_Out


In [16]:
# Populate the patterns:
#for i in range(0,5): # TEST
for i in range(0,len(gto)):
    clear_output(wait=True)
    new_pattern, \
        lower_limit, higher_limit, noise_median, noise_mean, noise_stdev, \
        new_noise_dist, new_noise_fit_p, limit_outliers_frac = \
            rv_extract_sampling_and_noise(gto.loc[i, 'rv_file'])
    dist_filename = NOISE_DIST_FOLDER + gto.loc[i, 'Karmn'] + "_RV_noise.pickle"
    new_noise_dist.save(dist_filename)
    try:
        new_noise_name = new_noise_dist.model['name']
    except:
        new_noise_name = np.nan
    try:
        new_noise_loc = new_noise_dist.model['loc']
    except:
        new_noise_loc = np.nan
    try:
        new_noise_scale = new_noise_dist.model['scale']
    except:
        new_noise_scale = np.nan
    rv_patterns = rv_patterns \
        .append({'Karmn': gto.loc[i, 'Karmn'],
                 'Vr_kms-1': gto.loc[i, 'Vr_kms-1'],
                 'n_RV': gto.loc[i, 'n_RV'],
                 'Ps_RV': gto.loc[i, 'Ps_RV'],
                 'wrms_RV': gto.loc[i, 'wrms_RV'],
                 'RV_sampling_deltas': json.dumps(new_pattern),
                 'RV_noise_min': lower_limit,
                 'RV_noise_max': higher_limit,
                 'RV_noise_median': noise_median,
                 'RV_noise_mean': noise_mean,
                 'RV_noise_stdev': noise_stdev,
                 'RV_noise_dist': new_noise_dist.model,
                 'RV_noise_name': new_noise_name,                 
                 'RV_noise_loc': new_noise_loc,
                 'RV_noise_scale':  new_noise_scale,
                 'RV_noise_dist_file': dist_filename,
                 'RV_noise_fit_p-value': new_noise_fit_p,
                 'Fraction_Out': limit_outliers_frac
                },
                ignore_index=True)
rv_patterns.tail()

[pypickle] Pickle file saved: [../data/DIST_FILES/RV_PG_subsample_Stars_noise_dist/J00051+457_RV_noise.pickle]
[distfit] >Saving.. True


Unnamed: 0,Karmn,Vr_kms-1,n_RV,Ps_RV,wrms_RV,RV_sampling_deltas,RV_noise_min,RV_noise_max,RV_noise_median,RV_noise_mean,RV_noise_stdev,RV_noise_dist,RV_noise_name,RV_noise_loc,RV_noise_scale,RV_noise_dist_file,RV_noise_fit_p-value,Fraction_Out
264,J00403+612,2.876543,44.0,1.946459,7.233259,"[0.0, 7.001964730676264, 7.997985835652798, 9....",2.26292,10.339515,2.925878,3.543329,1.503834,{'distr': <scipy.stats._continuous_distns.pare...,pareto,-0.001595,2.264515,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.751918,0.022727
265,J00389+306,-0.626068,60.0,5.092794,2.356139,"[0.0, 2.9924363917671144, 14.006995059549809, ...",0.914389,2.286232,1.308279,1.411491,0.33325,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.774437,0.55561,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.684154,0.016667
266,J00286-066,-12.405,50.0,4.001709,2.397639,"[0.0, 10.991652541328222, 17.91978120803833, 2...",0.981611,2.886192,1.354834,1.404884,0.307503,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.841154,0.499865,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.494928,0.02
267,J00183+440,11.82,216.0,0.826708,2.617721,"[0.0, 1.0492656417191029, 2.022110936231911, 3...",0.766382,4.264498,1.476962,1.55648,0.527647,{'distr': <scipy.stats._continuous_distns.beta...,beta,0.744054,7.930984,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.184423,0.0
268,J00051+457,-0.229,52.0,3.024201,2.835825,"[0.0, 20.975221153348684, 188.3961285338737, 2...",1.053913,5.406914,1.706992,1.80472,0.676688,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.907302,0.727212,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.509757,0.019231


In [17]:
rv_patterns.head()

Unnamed: 0,Karmn,Vr_kms-1,n_RV,Ps_RV,wrms_RV,RV_sampling_deltas,RV_noise_min,RV_noise_max,RV_noise_median,RV_noise_mean,RV_noise_stdev,RV_noise_dist,RV_noise_name,RV_noise_loc,RV_noise_scale,RV_noise_dist_file,RV_noise_fit_p-value,Fraction_Out
0,J23548+385,5.371895,13.0,13.940021,29.081874,"[0.0, 11.003340645693243, 29.863817581906915, ...",2.051089,5.171477,3.113559,3.224349,0.972852,{'distr': <scipy.stats._continuous_distns.beta...,beta,2.051089,3.210288,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.461777,0.0
1,J23505-095,-21.870867,71.0,2.908164,3.362607,"[0.0, 11.957513123750687, 47.87879143701866, 5...",1.199479,4.170413,1.722917,1.881812,0.628702,{'distr': <scipy.stats._continuous_distns.t_ge...,t,1.691561,0.290193,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.103726,0.112676
2,J23431+365,-2.597,23.0,17.0572,2.795865,"[0.0, 8.942367421463132, 9.875712749548256, 25...",0.91662,2.010952,1.377055,1.408319,0.26237,{'distr': <scipy.stats._continuous_distns.dwei...,dweibull,1.403768,0.233191,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.687567,0.043478
3,J23381-162,20.470749,56.0,3.019729,2.405056,"[0.0, 10.978803387377411, 30.955036497674882, ...",1.01608,3.446221,1.423674,1.611703,0.542434,{'distr': <scipy.stats._continuous_distns.gene...,genextreme,1.343269,0.248643,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.482127,0.017857
4,J23245+578,-33.185,60.0,5.992043,4.930567,"[0.0, 138.36809208989143, 166.319802749902, 19...",1.141035,3.048668,1.580168,1.71084,0.429185,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,1.039512,0.554111,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.62968,0.0


In [18]:
# See the last pattern added:
print(new_pattern)

[0.0, 20.975221153348684, 188.3961285338737, 205.38048223266378, 209.38376312609762, 210.38946344563738, 211.3372044339776, 220.15363332116976, 233.3020677617751, 236.28708868660033, 245.30540515342727, 247.20802798401564, 248.3613522797823, 249.32451374921948, 253.2377212671563, 257.244304608088, 278.1298939133994, 285.19921019487083, 288.1815115036443, 295.1172859589569, 305.06492951279506, 311.06623519863933, 510.30484426533803, 536.3358177728951, 537.3314265487716, 539.2210110668093, 546.3397195651196, 551.2010640129447, 554.1786011150107, 556.2090220963582, 560.3323087557219, 561.3038359247148, 562.3306985800155, 564.3055038368329, 565.3272510636598, 568.3514519101009, 570.3849403518252, 571.3390916050412, 577.1731422855519, 587.3009799011052, 597.2429402219132, 600.2649239646271, 601.2674847017042, 602.3093053563498, 605.2399070998654, 607.2822220451199, 609.2366422573105, 630.1764049907215, 658.1371545102447, 659.1520783104934, 660.1032314212061, 693.2164790052921]


### Save the pools to a file

In [19]:
# Save the file:
rv_patterns.to_csv(DIST_FILES_FOLDER + RV_PATTERN_POOL_FILE, sep=',', decimal='.', index=False)

### Test the access to the saved file and patterns

In [20]:
# Check file re-reading:
reloaded_patterns = pd.read_csv(DIST_FILES_FOLDER + RV_PATTERN_POOL_FILE, sep=',', decimal='.')
reloaded_patterns.tail()

Unnamed: 0,Karmn,Vr_kms-1,n_RV,Ps_RV,wrms_RV,RV_sampling_deltas,RV_noise_min,RV_noise_max,RV_noise_median,RV_noise_mean,RV_noise_stdev,RV_noise_dist,RV_noise_name,RV_noise_loc,RV_noise_scale,RV_noise_dist_file,RV_noise_fit_p-value,Fraction_Out
264,J00403+612,2.876543,44.0,1.946459,7.233259,"[0.0, 7.001964730676264, 7.997985835652798, 9....",2.26292,10.339515,2.925878,3.543329,1.503834,{'distr': <scipy.stats._continuous_distns.pare...,pareto,-0.001595,2.264515,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.751918,0.022727
265,J00389+306,-0.626068,60.0,5.092794,2.356139,"[0.0, 2.9924363917671144, 14.006995059549809, ...",0.914389,2.286232,1.308279,1.411491,0.33325,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.774437,0.55561,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.684154,0.016667
266,J00286-066,-12.405,50.0,4.001709,2.397639,"[0.0, 10.991652541328222, 17.91978120803833, 2...",0.981611,2.886192,1.354834,1.404884,0.307503,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.841154,0.499865,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.494928,0.02
267,J00183+440,11.82,216.0,0.826708,2.617721,"[0.0, 1.0492656417191029, 2.022110936231911, 3...",0.766382,4.264498,1.476962,1.55648,0.527647,{'distr': <scipy.stats._continuous_distns.beta...,beta,0.744054,7.930984,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.184423,0.0
268,J00051+457,-0.229,52.0,3.024201,2.835825,"[0.0, 20.975221153348684, 188.3961285338737, 2...",1.053913,5.406914,1.706992,1.80472,0.676688,{'distr': <scipy.stats._continuous_distns.logn...,lognorm,0.907302,0.727212,../data/DIST_FILES/RV_PG_subsample_Stars_noise...,0.509757,0.019231


In [21]:
# Check pattern access:
recovered_pattern = reloaded_patterns.loc[150, 'RV_sampling_deltas']
recovered_pattern

'[0.0, 3.998832286335528, 28.976991701871157, 44.80734464991838, 44.86575340619311, 51.833188388030976, 69.80620671482757, 96.80198004841805, 311.99729119380936, 368.88961282977834, 408.8633766374551, 432.8361208848655, 999.0888964077458, 1055.0422208704986, 1079.043398166541, 1100.091650831513, 1131.0463412054814, 1160.0143557921983, 1219.856517886743, 1372.0597644774243, 1420.0762271629646, 1448.8389606196433, 1466.973072854802, 1495.8728722897358, 1612.8050209023058, 1777.1086061522365, 1778.0848897150718, 1779.9373235898092, 1822.0452504041605, 1846.8054111041129, 1849.858969540801, 1865.7452220148407, 1903.021297102794, 1941.7209391989745, 1942.8883735244162, 1943.7159586064517, 1956.7314257128164, 1957.7328157578595, 1958.7311653834768, 1961.8168160789646, 1962.7360141994432, 1965.7440983704291, 1966.7438189359382, 1970.7510599107482, 1988.7650840734132, 1990.7602105909027, 1992.75523577258, 1993.757428406272, 2080.081780966837, 2089.0856745732017]'

In [22]:
# Notice that we need to convert this string to an array:
rec_pattern_array = json.loads(recovered_pattern)
print(rec_pattern_array)

[0.0, 3.998832286335528, 28.976991701871157, 44.80734464991838, 44.86575340619311, 51.833188388030976, 69.80620671482757, 96.80198004841805, 311.99729119380936, 368.88961282977834, 408.8633766374551, 432.8361208848655, 999.0888964077458, 1055.0422208704986, 1079.043398166541, 1100.091650831513, 1131.0463412054814, 1160.0143557921983, 1219.856517886743, 1372.0597644774243, 1420.0762271629646, 1448.8389606196433, 1466.973072854802, 1495.8728722897358, 1612.8050209023058, 1777.1086061522365, 1778.0848897150718, 1779.9373235898092, 1822.0452504041605, 1846.8054111041129, 1849.858969540801, 1865.7452220148407, 1903.021297102794, 1941.7209391989745, 1942.8883735244162, 1943.7159586064517, 1956.7314257128164, 1957.7328157578595, 1958.7311653834768, 1961.8168160789646, 1962.7360141994432, 1965.7440983704291, 1966.7438189359382, 1970.7510599107482, 1988.7650840734132, 1990.7602105909027, 1992.75523577258, 1993.757428406272, 2080.081780966837, 2089.0856745732017]


In [23]:
type(rec_pattern_array)

list

In [24]:
# Check access to a given pattern position:
rec_pattern_array[4]

44.86575340619311

## Summary

**CONCLUSIONS:**
- We have generated a pool of sampling patterns for all PG subsample CARMENES RV curves that will be later used to affect the benchmark synthetic database with irregular sampling in a random way.