# CARMENES RV ADDITIONAL SYNTHETIC DATASET 4 CURVES 

In this notebook we create twenty additional _Datasets 4_, different from the one we used for training and validating the model.

The idea behind this operation is to see how our model performs globally on other synthetic datasets, created under the same initial conditions but different from the one used to optimize and train our model.

## Modules and configuration

### Modules

In [1]:
import pandas as pd
import numpy as np

import os

import json

from distfit import distfit

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white", {'figure.figsize':(15,10)})

### Configuration

In [2]:
# CONFIGURATION:
# Size of the run:
N = 400 # Number of records (i.e. RV curves) created in each dataset.
NUM_DS = 20 # Number of datasets to create.
RANDOM_STATE = 11 # Seed value intended for reproducibility.

# Share of the generated stars that show no pulsation:
NON_PULSATION_FRACTION = 0.29

# Inputs: fitted parameter distributions and observed sampling patterns.
DIST_FILES_FOLDER = "../02_04_SyntheticDataset/DIST_FILES/"
DIST_SUMMARY_FILE = DIST_FILES_FOLDER + "Parameter_distributions_All_GTO.csv"
RV_PATTERNS_FILE = DIST_FILES_FOLDER + "RV_All_GTO_sampling_patterns.csv"

# Outputs: one subfolder per validation dataset is created under DS_FOLDER.
DS_FOLDER = "../data/VAL_DATASETS/"
DATASETS_SUMMARY_FILE = DS_FOLDER + "RV_All_GTO_SyntheticDatasets.csv"

# Prefix of every dataset subfolder and summary file name:
DS_PREFIX = "VAL_DS-" # RECOMMENDED - DO NOT CHANGE


### Functions

In [3]:
def sample_value(t, f, A, c, tau, delta, noise):
    '''Evaluate a noisy sinusoidal sample at time 't'.

    The value returned is A * cos(2 * pi * (f * (t - tau) + delta)) + c + noise.

    Parameters:
        t: time at which the sample is evaluated.
        f: frequency of the sinusoid.
        A: amplitude of the sinusoid.
        c: constant offset added to the sinusoid.
        tau: reference epoch, subtracted from 't' before applying the frequency.
        delta: phase, expressed as a fraction of a cycle (it is multiplied by 2 * pi).
        noise: additive noise term.

    Returns:
        The value of the sample.
    '''
    # Number of cycles elapsed since the reference epoch, plus the phase:
    cycles = f * (t - tau) + delta
    return A * np.cos(2 * np.pi * cycles) + c + noise

In [4]:
# Display the value of the pi constant that sample_value relies on:
np.pi

3.141592653589793

In [5]:
# Wrap sample_value with np.vectorize so that array-like arguments (e.g. a list
# of timestamps, or one noise value per timestamp) can be passed, and a whole
# time series is obtained with a single call:
v_sample_value = np.vectorize(pyfunc=sample_value)

## Prepare distributions of parameters

### Read configuration file

Read the file containing the distributions and values used for dataset generation.

In [6]:
# Load the table that lists, for every variable, either a fixed value or the
# file holding its fitted distribution; then preview the first rows:
dist_params = pd.read_csv(
    DIST_SUMMARY_FILE,
    sep=',',
    decimal='.',
)
dist_params.head()

Unnamed: 0,Group,Variable,Distribution,Dist_file
0,BENCHMARK,Ps,{'fixed_value': 0.0025},
1,BENCHMARK,Tobs,{'fixed_value': 0.25},
2,BENCHMARK,frequency,{'distr': <scipy.stats._continuous_distns.unif...,BENCHMARK_frequency_All_GTO_dist.pickle
3,BENCHMARK,phase,{'distr': <scipy.stats._continuous_distns.unif...,BENCHMARK_phase_All_GTO_dist.pickle
4,BENCHMARK,amplitudeRV,{'distr': <scipy.stats._continuous_distns.unif...,BENCHMARK_amplitudeRV_All_GTO_dist.pickle


In [7]:
# Column types of the parameter table as read from the CSV file:
dist_params.dtypes

Group           object
Variable        object
Distribution    object
Dist_file       object
dtype: object

### Register the distributions to use for each variable

In [8]:
# Build the registry that maps each variable name to the object used to draw
# its values: either a {'fixed_value': ...} dict or a loaded distfit object.
param_ranges = {}
for _, row in dist_params.iterrows():
    variable = row['Variable']
    try:
        # Fixed values are stored as a dict literal written with single quotes;
        # after swapping the quotes it parses as JSON. Fitted distributions do
        # not parse (ValueError), and a missing cell is not a string
        # (AttributeError), so both fall through to the file-based branch.
        d = json.loads(row['Distribution'].replace('\'', '\"'))
        param_ranges[variable] = d
    except (ValueError, AttributeError):
        # Must be a distribution, so we load it from file:
        try:
            d = distfit()
            # NOTE(review): the "[pypickle]" log lines indicate this unpickles
            # the file; only load distribution files from a trusted source.
            d.load(os.path.join(DIST_FILES_FOLDER, row['Dist_file']))
            param_ranges[variable] = d
        except Exception as e:
            # Report the problem and carry on with the remaining variables;
            # the failed variable will be missing from param_ranges.
            print("***ERROR! Could not set parameter %s. Error: %s" \
                  %(variable, str(e)))


[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/BENCHMARK_frequency_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/BENCHMARK_phase_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/BENCHMARK_amplitudeRV_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/BENCHMARK_offsetRV_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/BENCHMARK_refepochRV_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/RV_noiseRV_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/RV_samplingperiodRV_All_GTO_dist.pickle]
[pypickle] Pickle file loaded: [../02_04_SyntheticDataset/DIST_FILES/RV_numsamplesRV_All_GTO_dist.pickle]


In [9]:
# Show the resulting registry: one entry per variable, holding either a
# {'fixed_value': ...} dict or a loaded distfit object:
param_ranges

{'Ps': {'fixed_value': 0.0025},
 'Tobs': {'fixed_value': 0.25},
 'frequency': <distfit.distfit.distfit at 0x1adde59ed90>,
 'phase': <distfit.distfit.distfit at 0x1adde540fa0>,
 'amplitudeRV': <distfit.distfit.distfit at 0x1adde5b7940>,
 'offsetRV': <distfit.distfit.distfit at 0x1adda6a5520>,
 'refepochRV': <distfit.distfit.distfit at 0x1adde5ba700>,
 'noiseRV': <distfit.distfit.distfit at 0x1adde5baca0>,
 'samplingperiodRV': <distfit.distfit.distfit at 0x1adde5d2a30>,
 'numsamplesRV': <distfit.distfit.distfit at 0x1adde5e1b50>}

## Create the random datasets and the RV curve files

In [10]:
# TEST 1 - Vectorized function: clean time series.
# 24 timestamps from 0.0 to 11.5 in steps of 0.5, evaluated without noise.
v_sample_value(
    t=np.arange(0.0, 12.0, 0.5),
    f=0.75,
    A=5.0,
    c=20.0,
    tau=0.5,
    delta=0.2,
    noise=0,
)

array([22.2699525 , 21.54508497, 15.54496738, 24.75528258, 17.7300475 ,
       18.45491503, 24.45503262, 15.24471742, 22.2699525 , 21.54508497,
       15.54496738, 24.75528258, 17.7300475 , 18.45491503, 24.45503262,
       15.24471742, 22.2699525 , 21.54508497, 15.54496738, 24.75528258,
       17.7300475 , 18.45491503, 24.45503262, 15.24471742])

In [11]:
# TEST 2 - Vectorized function: noisy time series.
# Same 24 timestamps (0.0 to 11.5 in steps of 0.5) as in TEST 1, now with one
# explicit noise value added per timestamp.
v_sample_value(
    t=np.arange(0.0, 12.0, 0.5),
    f=0.75,
    A=5.0,
    c=20.0,
    tau=0.5,
    delta=0.2,
    noise=[
        -1.500668159, -1.379844309, -0.578035218, -0.798004417,
        1.543720686, 1.188904223, -0.161336742, -0.102133465,
        -1.1281597, 0.933947801, -0.431082827, -0.865645144,
        1.968325011, 0.070059952, 1.342700752, -0.102406912,
        1.605073208, 0.089842491, -1.993361815, -0.190272377,
        0.061971617, 1.836111736, -1.201716785, -0.221183952,
    ],
)

array([20.76928434, 20.16524066, 14.96693216, 23.95727816, 19.27376819,
       19.64381925, 24.29369588, 15.14258395, 21.1417928 , 22.47903277,
       15.11388455, 23.88963744, 19.69837251, 18.52497498, 25.79773337,
       15.14231051, 23.87502571, 21.63492746, 13.55160556, 24.5650102 ,
       17.79201912, 20.29102676, 23.25331584, 15.02353347])

### Create the samples of time series in each dataset

In [13]:
# Load the pool of sampling patterns from the RV patterns file. Each pattern is
# stored as the JSON text of a list of time offsets, so it is parsed into a
# Python list before use:
rv_patterns = pd.read_csv(RV_PATTERNS_FILE)
rv_patterns['sampling_delta_RV'] = rv_patterns['sampling_delta_RV'].map(json.loads)
rv_patterns.head()

Unnamed: 0,sampling_delta_RV
0,"[0.0, 16.990309999790043, 34.978099999949336, ..."
1,"[0.0, 2.009560000151396, 10.982160000130534, 1..."
2,"[0.0, 11.003340000286698, 29.86382000008598, 5..."
3,"[0.0, 11.957520000170916, 47.87878999998793, 5..."
4,"[0.0, 17.016419999767095, 190.38824999984354, ..."


In [15]:
# Generate NUM_DS validation datasets of N synthetic RV curves each.
# For every record, four time series are written to disk:
#   DS1: benchmark curve (noiseless, regularly sampled),
#   DS2: DS1 plus noise,
#   DS3: noiseless curve sampled with a randomly chosen real sampling pattern,
#   DS4: DS3 plus noise.
# A summary table with the parameters of every record is stored per dataset.
#
# Seed NumPy's global generator once, before the first dataset, so that a
# re-run of this cell reproduces the same datasets while each dataset still
# differs from the others.
# (Assumes distfit's generate() draws from NumPy's global random state - TODO
# confirm for the installed distfit version.)
# NOTE(review): if the notebook that created the training dataset applies the
# same seed value, choose a different RANDOM_STATE here so that the first
# validation dataset does not repeat the training one.
np.random.seed(RANDOM_STATE)

for j in range(0, NUM_DS):
    print("Creating validation dataset %d..." %j)
    # Initialize:
    ds_subfolder = DS_FOLDER + DS_PREFIX + str(j) + "/"
    # Create the folder (and any missing parent folder) if it does not exist yet:
    os.makedirs(ds_subfolder, exist_ok=True)
    dataset_file = ds_subfolder + DS_PREFIX + str(j) + "_SynthDatasets.csv"
    datasets = pd.DataFrame(columns=['ID', 'Pulsating', \
                                     'frequency', 'amplitudeRV', 'offsetRV', 'refepochRV', 'phase', \
                                     'D1_Ps', 'D1_Tobs', \
                                     'D2_noiseRV_mean', 'D2_noiseRV_median', 'D2_noiseRV_stdev', \
                                     'D3_samplingRV_idx', 'D3_PsRV_mean', 'D3_PsRV_median', \
                                     'D3_PsRV_stdev', 'D3_NumRV', \
                                     'D4_noiseRV_mean', 'D4_noiseRV_median', 'D4_noiseRV_stdev',
                                     'ds1_file', 'ds2_file', 'ds3_file', 'ds4_file'])

    for i in range(len(datasets), len(datasets) + N):
        try:
            step = "Main parameters"
            # Set the record sequential ID and the pulsation characteristic:
            record_id = "RV-" + str(i)
            datasets.loc[i, 'ID'] = record_id
            is_pulsating = bool(np.random.rand() > NON_PULSATION_FRACTION)
            datasets.loc[i, 'Pulsating'] = is_pulsating
            # Choose the main parameter values:
            # (one value per record, shared by all 4 time series)
            if is_pulsating:
                # Pulsating star: all the distributions are used.
                # The drawn values are kept as 1-element arrays because the
                # cells below index them with [0].
                frequency = param_ranges['frequency'].generate(n=1, verbose=0)
                amplitudeRV = param_ranges['amplitudeRV'].generate(n=1, verbose=0)
                phase = param_ranges['phase'].generate(n=1, verbose=0)
                datasets.loc[i, 'frequency'] = frequency.item()
                datasets.loc[i, 'amplitudeRV'] = amplitudeRV.item()
                datasets.loc[i, 'phase'] = phase.item()
            else:
                # Non-pulsating star: flat curve, only offset and reference
                # epoch are drawn from their distributions.
                frequency = 0.0
                amplitudeRV = 0.0
                phase = 0.0
                datasets.loc[i, 'frequency'] = frequency
                datasets.loc[i, 'amplitudeRV'] = amplitudeRV
                datasets.loc[i, 'phase'] = phase
            # Offset and reference epoch are drawn for both kinds of star:
            offsetRV = param_ranges['offsetRV'].generate(n=1, verbose=0)
            refepochRV = param_ranges['refepochRV'].generate(n=1, verbose=0)
            datasets.loc[i, 'offsetRV'] = offsetRV.item()
            datasets.loc[i, 'refepochRV'] = refepochRV.item()

            # DS1: Generate the benchmark (noiseless, perfectly sampled) time series:
            step = "DS1 calculation"
            # Get the timestamps parameters (sampling period and observation time):
            Ps = param_ranges['Ps']['fixed_value']
            Tobs = param_ranges['Tobs']['fixed_value']
            # Generate the timestamps (absolute values). The number of intervals
            # is rounded so that floating point error in Tobs/Ps cannot drop
            # the last sample:
            ds1_time = np.linspace(refepochRV, refepochRV+Tobs, int(round(Tobs/Ps)) + 1, endpoint=True)
            # Generate the values:
            ds1_value = v_sample_value(t=ds1_time,
                                   f=frequency, A=amplitudeRV, c=offsetRV, tau=refepochRV, delta=phase,
                                   noise=0.0)
            ds1_ts = np.stack([ds1_time, ds1_value], axis=1).reshape(-1,2)
            # Create the DS1 filename:
            ds1_file = ds_subfolder + "DS1-" + record_id + ".dat"
            # Store the DS1 file:
            np.savetxt(ds1_file, ds1_ts, delimiter=' ')
            # Populate the dataset table with the relevant data for DS1:
            datasets.loc[i, 'D1_Ps'] = Ps
            datasets.loc[i, 'D1_Tobs'] = Tobs
            datasets.loc[i, 'ds1_file'] = ds1_file

            # DS2: Generate the noise and create the noisy time series:
            step = "DS2 calculation"
            # Generate a noise array equal in length to DS1:
            # (Note that the noise can be randomly positive or negative)
            ds2_noise_value = np.random.choice([-1, 1], size=len(ds1_time)) * \
                param_ranges['noiseRV'].generate(n=len(ds1_time), verbose=0)
            # Column vector, to match the shape of the DS1 values:
            ds2_noise_value = ds2_noise_value.reshape(-1,1)
            # Generate DS2 time series (just the sum of two series):
            ds2_value = ds1_value + ds2_noise_value
            ds2_ts = np.stack([ds1_time, ds2_value], axis=1).reshape(-1,2)
            # Calculate noise stats:
            ds2_noise_mean = float(np.nanmean(ds2_noise_value))
            ds2_noise_median = float(np.nanmedian(ds2_noise_value))
            ds2_noise_std = float(np.nanstd(ds2_noise_value))
            # Create the DS2 filename:
            ds2_file = ds_subfolder + "DS2-" + record_id + ".dat"
            # Store the DS2 file:
            np.savetxt(ds2_file, ds2_ts, delimiter=' ')
            # Populate the dataset table with the relevant data for DS2:
            datasets.loc[i, 'D2_noiseRV_mean'] = ds2_noise_mean
            datasets.loc[i, 'D2_noiseRV_median'] = ds2_noise_median
            datasets.loc[i, 'D2_noiseRV_stdev'] = ds2_noise_std
            datasets.loc[i, 'ds2_file'] = ds2_file

            # DS3: Generate the imperfectly sampled time series:
            step = "DS3 calculation"
            # Choose a random sampling pattern from the pool and generate the timestamps:
            pattern_idx = int(np.random.randint(low=0, high=len(rv_patterns)))
            rv_t_deltas = rv_patterns.iloc[pattern_idx, 0]
            # Generate DS3 time series:
            ds3_time = refepochRV + rv_t_deltas
            ds3_value = v_sample_value(t=ds3_time,
                                   f=frequency, A=amplitudeRV, c=offsetRV, tau=refepochRV, delta=phase,
                                   noise=0.0)
            ds3_ts = np.stack([ds3_time, ds3_value], axis=1).reshape(-1,2)

            # Calculate sampling stats (from the gaps between consecutive timestamps):
            ds3_deltas = np.diff(ds3_time)
            ds3_PsRV_mean = float(np.nanmean(ds3_deltas))
            ds3_PsRV_median = float(np.nanmedian(ds3_deltas))
            ds3_PsRV_stdev = float(np.std(ds3_deltas))
            ds3_NumRV = len(ds3_time)
            # Create the DS3 filename:
            ds3_file = ds_subfolder + "DS3-" + record_id + ".dat"
            # Store the DS3 file:
            np.savetxt(ds3_file, ds3_ts, delimiter=' ')
            # Populate the dataset table with the relevant data for DS3:
            datasets.loc[i, 'D3_samplingRV_idx'] = pattern_idx
            datasets.loc[i, 'D3_PsRV_mean'] = ds3_PsRV_mean
            datasets.loc[i, 'D3_PsRV_median'] = ds3_PsRV_median
            datasets.loc[i, 'D3_PsRV_stdev'] = ds3_PsRV_stdev
            datasets.loc[i, 'D3_NumRV'] = ds3_NumRV
            datasets.loc[i, 'ds3_file'] = ds3_file

            # DS4: Generate the noisy and imperfectly sampled time series:
            step = "DS4 calculation"
            # Generate a noise array equal in length to DS3:
            ds4_noise_value = np.random.choice([-1, 1], size=len(ds3_time)) * \
                param_ranges['noiseRV'].generate(n=len(ds3_time), verbose=0)
            ds4_noise_value = ds4_noise_value.reshape(-1,1)
            # Generate DS4 time series (just the sum of two series):
            ds4_value = ds3_value + ds4_noise_value.reshape(-1,)
            ds4_ts = np.stack([ds3_time, ds4_value], axis=1).reshape(-1,2)
            # Calculate noise stats:
            ds4_noise_mean = float(np.nanmean(ds4_noise_value))
            ds4_noise_median = float(np.nanmedian(ds4_noise_value))
            ds4_noise_std = float(np.nanstd(ds4_noise_value))
            # Create the DS4 filename:
            ds4_file = ds_subfolder + "DS4-" + record_id + ".dat"
            # Store the DS4 file:
            np.savetxt(ds4_file, ds4_ts, delimiter=' ')
            # Populate the dataset table with the relevant data for DS4:
            datasets.loc[i, 'D4_noiseRV_mean'] = ds4_noise_mean
            datasets.loc[i, 'D4_noiseRV_median'] = ds4_noise_median
            datasets.loc[i, 'D4_noiseRV_stdev'] = ds4_noise_std
            datasets.loc[i, 'ds4_file'] = ds4_file

            # Store the dataset table so far (checkpoint after every record):
            datasets.to_csv(dataset_file, sep=',', decimal='.', index=False)

        except Exception as e:
            # Report any possible trouble and continue with the next record:
            print("***ERROR: some error happened when generating record %d, at step '%s' Error: %s" \
                  %(i, step, str(e)))

 

Creating validation dataset 0...
Creating validation dataset 1...
Creating validation dataset 2...
Creating validation dataset 3...
Creating validation dataset 4...
Creating validation dataset 5...
Creating validation dataset 6...
Creating validation dataset 7...
Creating validation dataset 8...
Creating validation dataset 9...
Creating validation dataset 10...
Creating validation dataset 11...
Creating validation dataset 12...
Creating validation dataset 13...
Creating validation dataset 14...
Creating validation dataset 15...
Creating validation dataset 16...
Creating validation dataset 17...
Creating validation dataset 18...
Creating validation dataset 19...


### Show the last curves generated

In [None]:
# Summary table of the last dataset generated (the variable is re-created for
# every dataset in the generation loop, so only the final one remains here):
datasets

#### Basic parameters of the curve

In [None]:
# Basic parameters of the last curve generated.
# The values are read back from the last row of the summary table, where they
# are stored as plain scalars for both pulsating and non-pulsating stars.
last_record = datasets.iloc[-1]
print("PARAMETERS OF THE CURVE:")
if last_record['Pulsating'] == True:
    print("Star type: Pulsating star")
else:
    print("Star type: Non-pulsating star")
print("Amplitude: %f" %last_record['amplitudeRV'])
print("Frequency: %f" %last_record['frequency'])
print("Reference epoch: %f" %last_record['refepochRV'])
print("Phase: %f" %last_record['phase'])
print("Offset: %f" %last_record['offsetRV'])
print("Benchmark (perfect) sampling period: %f" %last_record['D1_Ps'])
print("Benchmark (perfect) observation time: %f" %last_record['D1_Tobs'])

#### Simplified, quick plots

In [None]:
# DS1 benchmark curve (noiseless, regularly sampled) of the last record generated;
# ds1_time and ds1_value are left over from the final iteration of the loop above:
plt.plot(ds1_time, ds1_value)

In [None]:
# DS1 benchmark curve - Only 1 period:
# The curve is drawn exactly once; the x axis is narrowed to a single period
# only when there is a pulsation to show.
plt.plot(ds1_time, ds1_value)
# 'frequency' is a 1-element array for a pulsating star and the float 0.0 for
# a non-pulsating one; np.ravel handles both cases.
last_frequency = float(np.ravel(frequency)[0])
if last_frequency != 0:
    plt.xlim(ds1_time.min(), ds1_time.min() + 1 / last_frequency)

In [None]:
# Overlay the noiseless (DS1) and noisy (DS2) curves, which share the same
# regularly sampled timestamps:
for curve_label, curve_values in (('DS1', ds1_value), ('DS2', ds2_value)):
    plt.plot(ds1_time, curve_values, label=curve_label)
plt.legend()
plt.show();

In [None]:
# Overlay the noiseless (DS3) and noisy (DS4) curves, which share the same
# irregularly sampled timestamps:
for curve_label, curve_values in (('DS3', ds3_value), ('DS4', ds4_value)):
    plt.plot(ds3_time, curve_values, label=curve_label)
plt.legend()
plt.show();

#### More elaborated plots

In [None]:
# DS1 and DS2 curves of the last record generated.
# The star type shown in the title is taken from the summary table, so that it
# always matches the curve that is actually plotted.
star_type = "Pulsating star" if datasets.iloc[-1]['Pulsating'] == True else "Non-pulsating star"
plt.figure(figsize=(10,7))
plt.title("Example - Synthetic curves generated with CARMENES parameters\n" \
          "(%s. Noiseless curve and noisy curve)" %star_type, fontsize=16)
plt.grid(axis='both', alpha=0.75)
plt.xlabel("Time [JD]", fontsize=12)
plt.ylabel('RV [$ms^{-1}$]', fontsize=12)
# The DS1/DS2 arrays are column-shaped, so they are flattened for seaborn:
sns.lineplot(x=ds1_time.flatten(), y=ds1_value.flatten(), label='DS1 - Noiseless curve')
sns.lineplot(x=ds1_time.flatten(), y=ds2_value.flatten(), label='DS2 - Noisy curve')
plt.show();

In [None]:
# DS3 and DS4 curves of the last record generated.
# The star type shown in the title is taken from the summary table, so that it
# always matches the curve that is actually plotted.
star_type = "Pulsating star" if datasets.iloc[-1]['Pulsating'] == True else "Non-pulsating star"
plt.figure(figsize=(10,7))
plt.title("Example - Synthetic curves generated with CARMENES parameters\n" \
          "(%s. Irregularly sampled curves: noiseless and noisy)" %star_type, fontsize=16)
plt.grid(axis='both', alpha=0.75)
plt.xlabel("Time [JD]", fontsize=12)
plt.ylabel('RV [$ms^{-1}$]', fontsize=12)
sns.lineplot(x=ds3_time.flatten(), y=ds3_value.flatten(), label='DS3 - Irregularly sampled, noiseless curve')
sns.lineplot(x=ds3_time.flatten(), y=ds4_value.flatten(), label='DS4 - Irregularly sampled, noisy curve')
plt.show();