## Predikcia vsetkych parametrov pomocou jednej NN pre overcontact data

In [1]:
# Blok 1 - Nacitanie kniznic
import numpy as np
import pandas as pd

from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Input, Dense, LSTM, Dropout, Flatten
from keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

np.random.seed(1234)
pd.set_option('display.max_rows', None)

In [2]:
# Blok 2 - Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora.
def generate_observation_sigma(space_obs_frac=0.5):
    """
    Draws a standard deviation of noise in light curve points from a "true" value provided in synthetic light curve.
    Noise sigma is drawn from bimodal distribution taking into account contributions from space based and earth based
    observations which have different levels of stochastic noise.

    :param space_obs_frac: ratio between earth based and space based observations
    :return: float; standard deviation of the light curve noise
    """
    earth_based_sigma = 4e-3
    space_based_sigma = 2e-4
    sigma = np.random.choice([earth_based_sigma, space_based_sigma], p=[1-space_obs_frac, space_obs_frac])
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    :param curve: numpy.array; normalized light curve
    :return: Tuple(numpy.array, float); normalized light curve with added noise, standard deviation of observations
    """
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

### Data loading

In [3]:
# Blok 3 - Nacitanie dat
data = pd.read_pickle("overcontact_all_parameters.pkl").reset_index()

In [4]:
# Blok 4 - Ukazka dat
data.head()

Unnamed: 0,index,id,curve,primary__t_eff,secondary__t_eff,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1/t2,filter,critical_surface_potential,primary__equivalent_radius,secondary__equivalent_radius,primary__filling_factor,secondary__filling_factor
0,0,5525038,"[0.9271109336686163, 0.9271335908185164, 0.927...",5500,5250,0.766994,0.1,1.948052,1.948052,1.047619,Bessell_U,1.959104,0.585781,0.21126,0.169244,0.169244
1,1,5525038,"[0.9267426667358384, 0.9267640025030627, 0.926...",5500,5250,0.766994,0.1,1.948052,1.948052,1.047619,Bessell_B,1.959104,0.585781,0.21126,0.169244,0.169244
2,2,5525038,"[0.9271736551553694, 0.927193188167849, 0.9272...",5500,5250,0.766994,0.1,1.948052,1.948052,1.047619,Bessell_V,1.959104,0.585781,0.21126,0.169244,0.169244
3,3,5525038,"[0.9286697051715368, 0.9286879105609007, 0.928...",5500,5250,0.766994,0.1,1.948052,1.948052,1.047619,Bessell_R,1.959104,0.585781,0.21126,0.169244,0.169244
4,4,5525038,"[0.9304596200748534, 0.9304764401089076, 0.930...",5500,5250,0.766994,0.1,1.948052,1.948052,1.047619,Bessell_I,1.959104,0.585781,0.21126,0.169244,0.169244


In [5]:
# Blok 5 - vyselektovanie nahodnej vzorky dat o velkosti 100 000 zaznamov
data_sample = data.sample(n=100000)

### Train-test split

In [6]:
# Blok 6 - Vytvorenie pola kriviek
X = []
for row in data_sample["curve"]:
    X.append(row)
X=np.array(X)

In [7]:
# Blok 7 - vytvorenie pola features, ktore bude model predikovat
# Nepotrebujeme "primary__t_eff", "secondary__t_eff" - staci pomer tychto hodnot = t1/t2
y = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1/t2",
    "critical_surface_potential",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "primary__filling_factor",
    "secondary__filling_factor"]])

In [8]:
# Blok 8 - rozdelenie dat na trenovaciu / testovaciu mnozinu v pomere 80/20
X_train1, X_test, y_train1, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
# Blok 9 - pridanie sumu do trenovacich dat
X_train = []
y_train = []
for i in range(len(X_train1)):
    for j in range(3):
        curve = stochastic_noise_generator(X_train1[i])
        X_train.append(curve[0])
        y_train.append(y_train1[i])
X_train = np.array(X_train)
y_train=np.array(y_train)

In [15]:
# Blok 10 - vypis poctu dat v jednotlivych datovych mnozinach
print("Number of records in dataset: ", len(data),
    "\nNumber of records in sample: ", len(X),
    "\nNumber of train data without noise: ", len(X_train1),
    "\nNumber of train data with noise: ", len(X_train),
    "\nNumber of test data without noise: ", len(X_test))

Number of records in dataset:  1212796 
Number of records in sample:  100000 
Number of train data without noise:  80000 
Number of train data with noise:  240000 
Number of test data without noise:  20000


## Model

In [None]:
# Blok 11 - zadefinovanie architektury modelu NN
inputs = Input(shape=(400, 1))
b = Conv1D(64, kernel_size = 3, padding = "valid")(inputs)
b = MaxPooling1D(2)(b)
b = Dropout(0.2)(b)
b = LSTM(64, return_sequences=True)(b)
b = Flatten()(b)
b = Dense(64, activation='relu')(b)
x = Dense(32, activation='relu')(b)
output = Dense(10, activation='linear')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='mse', optimizer='adam', metrics=["mae", "mape"])


saved_model = "models/over_allParams.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor = 'val_mae', verbose = 1, save_best_only = True, mode = 'min')
early = EarlyStopping(monitor = "val_mae", mode = "min", patience = 25)
callbacks_list = [checkpoint, early]

print(model.summary())

In [None]:
# Blok 12 - trenovanie modelu
history = model.fit(X_train, y_train, validation_split = 0.1, epochs = 10, verbose = 1, callbacks = callbacks_list, batch_size = 64)

In [10]:
# Blok 13 - necitanie modelu
model = load_model("models/over_allParams.hdf5")

In [15]:
# Blok 14 - evaluacia modelu na testovacej mnozine bez sumu
scores = model.evaluate(X_test, y_test)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores[0], scores[1]))

Loss: 0.0483, MAE: 0.1115


In [12]:
# Blok 15 - pridanie sumu do testovacich dat
X_test_n = []
y_test_n = []
for i in range(len(X_test)):
    for j in range(3):
        curve = stochastic_noise_generator(X_test[i])
        X_test_n.append(curve[0])
        y_test_n.append(y_test[i])
        j += 1
X_test_n = np.array(X_test_n)
y_test_n = np.array(y_test_n)

In [14]:
# Blok 16 - evaluacia modelu na testovacej mnozine so sumom
scores_n = model.evaluate(X_test_n, y_test_n)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores_n[0], scores_n[1]))

Loss: 0.0611, MAE: 0.1208


### Prediction on synthetic test data - without noise

In [16]:
# Blok 17 - predikcia na datach bez sumu
y_pred = model.predict(X_test)

In [17]:
# Blok 18 - vytvorenie df z predikcie
pred_df = pd.DataFrame(y_pred,
                        columns = [
                            "p_inclination",
                            "p_mass_ratio",
                            "p_primary__surface_potential",
                            "p_secondary__surface_potential",
                            "p_t1_t2",
                            "p_critical_surface_potential",
                            "p_primary_equivalent_radius",
                            "p_secondary_equivalent_radius",
                            "p_primary_filling_factor",
                            "p_secondary_filling_factor"
                            ])
pred_df.head()

Unnamed: 0,p_inclination,p_mass_ratio,p_primary__surface_potential,p_secondary__surface_potential,p_t1_t2,P_critical_surface_potential,P_primary_equivalent_radius,P_secondary_equivalent_radius,P_primary_filling_factor,P_secondary_filling_factor
0,1.094681,4.739271,8.533597,8.441549,1.022082,8.673995,0.278536,0.522325,0.448388,0.364946
1,1.017328,3.622497,7.047381,7.028968,1.030903,7.331349,0.318094,0.509637,0.482214,0.460357
2,1.463987,0.916591,3.539768,3.51159,1.062556,3.637359,0.404025,0.389204,0.258317,0.258596
3,0.920394,1.3347,3.860328,3.863567,1.036972,4.303124,0.440025,0.479571,0.73531,0.743143
4,1.198251,0.128971,2.173163,2.176864,1.064237,2.21261,0.526068,0.292184,0.294052,0.300588


In [18]:
# Blok 19 - vypocet priemernych hodnot
pred_mean = pred_df.mean(axis=0)
pred_mean

p_inclination                     1.193492
p_mass_ratio                      1.417531
p_primary__surface_potential      3.968585
p_secondary__surface_potential    3.963089
p_t1_t2                           1.046490
P_critical_surface_potential      4.314015
P_primary_equivalent_radius       0.440175
P_secondary_equivalent_radius     0.450492
P_primary_filling_factor          0.648981
P_secondary_filling_factor        0.652364
dtype: float32

In [21]:
# Blok 20 - vytvorenie df z testovacich dat bez sumu, vypocet priemernych hodnot
y_test_df = pd.DataFrame(y_test,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2",
                            "critical_surface_potential",
                            "primary_equivalent_radius",
                            "secondary_equivalent_radius",
                            "primary_filling_factor",
                            "secondary_filling_factor"
                            ])
test_mean = y_test_df.mean(axis=0)
test_mean

inclination                     1.221348
mass_ratio                      1.412813
primary__surface_potential      3.972770
secondary__surface_potential    3.972770
t1_t2                           1.048076
critical_surface_potential      4.286547
primary_equivalent_radius       0.431312
secondary_equivalent_radius     0.446961
primary_filling_factor          0.615457
secondary_filling_factor        0.615457
dtype: float64

In [22]:
# Blok 22 - vytvorenie df na porovnanie priemernych predikovanych a skutocnych hodnot
eval_pred = pd.DataFrame({'Attribute': test_mean.index,
            'AVG True values': test_mean.values,
            'AVG Pred Values': pred_mean.values,
            'MAE': abs(test_mean.values - pred_mean.values)})
eval_pred

Unnamed: 0,Attribute,AVG True values,AVG Pred Values,MAE
0,inclination,1.221348,1.193492,0.027856
1,mass_ratio,1.412813,1.417531,0.004717
2,primary__surface_potential,3.97277,3.968585,0.004184
3,secondary__surface_potential,3.97277,3.963089,0.00968
4,t1_t2,1.048076,1.04649,0.001586
5,critical_surface_potential,4.286547,4.314015,0.027469
6,primary_equivalent_radius,0.431312,0.440175,0.008863
7,secondary_equivalent_radius,0.446961,0.450492,0.003531
8,primary_filling_factor,0.615457,0.648981,0.033524
9,secondary_filling_factor,0.615457,0.652364,0.036907


### Prediction on synthetic test data - with  noise

In [23]:
# Blok 23 - preidkcia na testovaich datach so sumom
y_pred_n=model.predict(X_test_n)

In [24]:
# Blok 24 - vytvorenie df z testovacich dat so sumom, vypocet priemernych hodnot
pred_n_df = pd.DataFrame(y_pred_n,
                        columns = [
                            "p_inclination",
                            "p_mass_ratio",
                            "p_primary__surface_potential",
                            "p_secondary__surface_potential",
                            "p_t1_t2",
                            "p_critical_surface_potential",
                            "p_primary_equivalent_radius",
                            "p_secondary_equivalent_radius",
                            "p_primary_filling_factor",
                            "p_secondary_filling_factor"
                            ])
pred_mean_n = pred_n_df.mean(axis=0)
pred_mean_n

p_inclination                     1.193694
p_mass_ratio                      1.419740
p_primary__surface_potential      3.971508
p_secondary__surface_potential    3.966136
p_t1_t2                           1.046527
p_critical_surface_potential      4.317110
p_primary_equivalent_radius       0.440108
p_secondary_equivalent_radius     0.450516
p_primary_filling_factor          0.648972
p_secondary_filling_factor        0.652439
dtype: float32

In [25]:
# Blok 20 - vytvorenie df z testovacich dat bez sumu, vypocet priemernych hodnot
y_test_df_n = pd.DataFrame(y_test_n,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2",
                            "critical_surface_potential",
                            "primary_equivalent_radius",
                            "secondary_equivalent_radius",
                            "primary_filling_factor",
                            "secondary_filling_factor"
                            ])
test_mean_n = y_test_df_n.mean(axis=0)
test_mean_n

inclination                     1.221348
mass_ratio                      1.412813
primary__surface_potential      3.972770
secondary__surface_potential    3.972770
t1_t2                           1.048076
critical_surface_potential      4.286547
primary_equivalent_radius       0.431312
secondary_equivalent_radius     0.446961
primary_filling_factor          0.615457
secondary_filling_factor        0.615457
dtype: float64

In [26]:
# Blok 21 - vytvorenie df na porovnanie priemernych predikovanych a skutocnych hodnot
eval_pred = pd.DataFrame({'Attribute': test_mean_n.index,
            'AVG True values': test_mean_n.values,
            'AVG Pred Values': pred_mean_n.values,
            'MAE': abs(test_mean_n.values - pred_mean_n.values)})
eval_pred

Unnamed: 0,Attribute,AVG True values,AVG Pred Values,MAE
0,inclination,1.221348,1.193694,0.027655
1,mass_ratio,1.412813,1.41974,0.006927
2,primary__surface_potential,3.97277,3.971508,0.001262
3,secondary__surface_potential,3.97277,3.966136,0.006634
4,t1_t2,1.048076,1.046527,0.001549
5,critical_surface_potential,4.286547,4.31711,0.030563
6,primary_equivalent_radius,0.431312,0.440108,0.008796
7,secondary_equivalent_radius,0.446961,0.450516,0.003555
8,primary_filling_factor,0.615457,0.648972,0.033515
9,secondary_filling_factor,0.615457,0.652439,0.036982
