## Predikcia parametrov pre detached krivky
### Predikovane parametre: inclination, mass ratio, primary radius, secondary radius, temperature ratio
Model je trenovany na originalnych parametroch a krivkach so sumom - bez normalizacie

In [1]:
# Blok 1 - nacitanie kniznic
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras.utils import np_utils, plot_model
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D
from keras.layers import Input, Dense, concatenate, Activation, LSTM, Dropout, Flatten
from keras.models import Model
from keras.layers import Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger

np.random.seed(1234)
pd.set_option('display.max_rows', None)

In [2]:
# Blok 2 - Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora
def generate_observation_sigma(space_obs_frac=0.5):
    """
    Draws a standard deviation of noise in light curve points from a "true" value provided in synthetic light curve.
    Noise sigma is drawn from bimodal distribution taking into account contributions from space based and earth based
    observations which have different levels of stochastic noise.

    :param space_obs_frac: ratio between earth based and space based observations
    :return: float; standard deviation of the light curve noise
    """
    earth_based_sigma = 4e-3
    space_based_sigma = 2e-4
    sigma = np.random.choice([earth_based_sigma, space_based_sigma], p=[1-space_obs_frac, space_obs_frac])
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    :param curve: numpy.array; normalized light curve
    :return: Tuple(numpy.array, float); normalized light curve with added noise, standard deviation of observations
    """
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

## Data loading

In [3]:
# Blok 3 - nacitanie dat, vyselektovanie nahodnej vozrky dat o velkosti 200 000 zaznamov
data = pd.read_pickle("detached_all_parameters.pkl").reset_index()
data_sample = data.sample(n=200000)

In [4]:
# Blok 4 - ukazka dat
data_sample.head()

Unnamed: 0,index,id,curve,primary__t_eff,secondary__t_eff,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2,filter,critical_surface_potential,primary__equivalent_radius,secondary__equivalent_radius,primary__filling_factor,secondary__filling_factor
647973,647973,10032749,"[0.48835963039886077, 0.4913287663336375, 0.49...",45000,5000,1.334076,0.6,20.601251,4.071076,9.0,Bessell_B,3.063442,0.049981,0.210724,-49.955205,-2.870176
1109932,1109932,16384835,"[0.5932746668156491, 0.594846617739326, 0.5994...",12000,5000,1.310729,1.111111,7.008254,5.425004,2.4,SLOAN_u,3.928447,0.170106,0.25046,-5.570841,-2.707013
731380,731380,10579972,"[0.0559029780103817, 0.05590501277853746, 0.05...",10000,5000,1.427398,1.666667,9.3676,5.812,2.0,Bessell_U,4.772403,0.130062,0.330428,-7.853457,-1.776732
692703,692703,10341530,"[0.9719087814691745, 0.9719390320925176, 0.971...",9000,5000,1.223879,1.666667,9.3676,8.640628,1.8,GaiaDR2,4.772403,0.130062,0.209983,-7.853457,-6.611019
1145336,1145336,16615611,"[0.8654052436804602, 0.8654499986635169, 0.865...",20000,16000,1.180696,0.9,3.676833,19.051127,1.25,Kepler,3.585603,0.373969,0.049982,-0.183357,-31.082753


In [5]:
# Blok 5 - vytvorenie pola kriviek, vytvorenie pola features, ktore budeme predikovat
X = []
for row in data_sample["curve"]:
    X.append(row)
X=np.array(X)

y = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "t1_t2"]])

In [6]:
# Blok 6 - rozdelenie dat na trenovaciu a testovaciu mnozinu v pomere 80/20
X_train1, X_test, y_train1, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Blok 7 - pridanie sumu na trenovacie data
X_train_n = []
y_train_n = []
for i in range(len(X_train1)):
    for j in range(3):
        curve = stochastic_noise_generator(X_train1[i])
        X_train_n.append(curve[0])
        y_train_n.append(y_train1[i])
X_train_n = np.array(X_train_n)
y_train_n=np.array(y_train_n)

In [8]:
# Blok 8 - vypis poctu zaznamov v jednotlivych datovych mnozinach
print("Number of records in dataset: ", len(data),
    "\nNumber of records in sample: ", len(data_sample),
    "\nNumber of train data without noise: ", len(X_train1),
    "\nNumber of train data with noise: ", len(X_train_n),
    "\nNumber of test data without noise: ", len(X_test))

Number of records in dataset:  1300000 
Number of records in sample:  200000 
Number of train data without noise:  160000 
Number of train data with noise:  480000 
Number of test data without noise:  40000


In [9]:
# Blok 9 - vytvorenie jendotlivych poli pre jednotlive parametre
y_inc = np.array([arr[0] for arr in y_train_n])
y_mass = np.array([arr[1] for arr in y_train_n])
y_prim_radius = np.array([arr[2] for arr in y_train_n])
y_sec_radius = np.array([arr[3] for arr in y_train_n])
y_temp_ratio = np.array([arr[4] for arr in y_train_n])

In [10]:
# Blok 10 - zadefinovanie architektury Multi NN modelu
inputs_y = Input(shape=(400, 1))

a = Conv1D(128, kernel_size = 3, padding = "valid")(inputs_y)
a = LSTM(64, return_sequences=True)(a)
a = Flatten()(a)
a = Dense(64, activation='relu')(a)
a = Model(inputs=inputs_y, outputs=a)

b = Conv1D(128, kernel_size = 3, padding = "valid")(inputs_y)
b = MaxPooling1D(2)(b)
b = Conv1D(128, kernel_size = 3, padding = "valid")(b)
b = LSTM(64, return_sequences=True)(b)
b = LSTM(64, return_sequences=True)(b)
b = Flatten()(b)
b = Dense(64, activation='relu')(b)
b = Model(inputs=inputs_y, outputs=b)

c = Conv1D(128, kernel_size = 3, padding = "valid")(inputs_y)
c = MaxPooling1D(2)(c)
c = Conv1D(128, kernel_size = 3, padding = "valid")(c)
c = LSTM(64, return_sequences=True)(c)
c = LSTM(64, return_sequences=True)(c)
c = Flatten()(c)
c = Dense(64, activation='relu')(c)
c = Model(inputs=inputs_y, outputs=c)

d = Conv1D(128, kernel_size = 3, padding = "valid")(inputs_y)
d = MaxPooling1D(2)(d)
d = Conv1D(128, kernel_size = 3, padding = "valid")(d)
d = LSTM(64, return_sequences=True)(d)
d = LSTM(64, return_sequences=True)(d)
d = Flatten()(d)
d = Dense(64, activation='relu')(d)
d = Model(inputs=inputs_y, outputs=d)

e = Conv1D(128, kernel_size = 3, padding = "valid")(inputs_y)
e = MaxPooling1D(2)(e)
e = Conv1D(128, kernel_size = 3, padding = "valid")(e)
e = LSTM(64, return_sequences=True)(e)
e = LSTM(64, return_sequences=True)(e)
e = Flatten()(e)
e = Dense(64, activation='relu')(e)
e = Model(inputs=inputs_y, outputs=e)

x = concatenate([a.output, b.output, c.output, d.output, e.output])
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)

output = Dense(5, activation='linear')(x)

detached_multiNN_model = Model(inputs=inputs_y, outputs=output)
detached_multiNN_model.compile(loss='mse', optimizer='adam', metrics=["mae", "mape"])

saved_model = "models/det_multiNN_radius_model.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor = 'val_mae', verbose = 1, save_best_only = True, mode = 'min')
early = EarlyStopping(monitor = "val_mae", mode = "min", patience = 25)
csv_logger = CSVLogger('det_multiNN_radius_model.log', separator=',', append = True)
callbacks_list = [checkpoint, early, csv_logger]

print(detached_multiNN_model.summary())

2023-01-15 10:05:36.546357: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6674 MB memory:  -> device: 0, name: Quadro RTX 4000, pci bus id: 0000:8b:00.0, compute capability: 7.5


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 400, 1)]     0           []                               
                                                                                                  
 conv1d_1 (Conv1D)              (None, 398, 128)     512         ['input_1[0][0]']                
                                                                                                  
 conv1d_3 (Conv1D)              (None, 398, 128)     512         ['input_1[0][0]']                
                                                                                                  
 conv1d_5 (Conv1D)              (None, 398, 128)     512         ['input_1[0][0]']                
                                                                                            

In [13]:
# Blok 11 - trenovanie multi NN modelu
history_multiNN = detached_multiNN_model.fit(
    x=X_train_n,
    y=[y_inc, y_mass, y_prim_radius, y_sec_radius, y_temp_ratio],
    validation_split = 0.1,
    epochs = 15,
    verbose = 1,
    callbacks = callbacks_list,
    batch_size = 64)

Epoch 1/15


2023-01-15 10:07:09.018818: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8201


Epoch 1: val_mae improved from inf to 0.16272, saving model to models/det_multiNN_radius_model.hdf5
Epoch 2/15
Epoch 2: val_mae improved from 0.16272 to 0.09779, saving model to models/det_multiNN_radius_model.hdf5
Epoch 3/15
Epoch 3: val_mae improved from 0.09779 to 0.08108, saving model to models/det_multiNN_radius_model.hdf5
Epoch 4/15
Epoch 4: val_mae did not improve from 0.08108
Epoch 5/15
Epoch 5: val_mae improved from 0.08108 to 0.06119, saving model to models/det_multiNN_radius_model.hdf5
Epoch 6/15
Epoch 6: val_mae did not improve from 0.06119
Epoch 7/15
Epoch 7: val_mae did not improve from 0.06119
Epoch 8/15
Epoch 8: val_mae improved from 0.06119 to 0.06111, saving model to models/det_multiNN_radius_model.hdf5
Epoch 9/15
Epoch 9: val_mae improved from 0.06111 to 0.04957, saving model to models/det_multiNN_radius_model.hdf5
Epoch 10/15
Epoch 10: val_mae did not improve from 0.04957
Epoch 11/15
Epoch 11: val_mae did not improve from 0.04957
Epoch 12/15
Epoch 12: val_mae did no

## Vyhodnotenie modelu - loss, MAE

In [14]:
# Blok 12 - nacitanie modelu
multi_model_radius = load_model("models/det_multiNN_radius_model.hdf5")

In [15]:
# Blok 13 - evaluacia modelu na testovacich datach bez sumu
scores = multi_model_radius.evaluate(X_test, y_test)
print('Multi NN model- radius -- Loss: {:.4f}, MAE: {:.4f}'.format(scores[0], scores[1]))

Multi NN model- radius -- Loss: 2.5133, MAE: 0.9991


In [16]:
# Blok 14 - pridanie sumu na testovacie data
X_test_n = []
y_test_n = []
for i in range(len(X_test)):
    for j in range(3):
        curve = stochastic_noise_generator(X_test[i])
        X_test_n.append(curve[0])
        y_test_n.append(y_test[i])
        j += 1
X_test_n = np.array(X_test_n)
y_test_n = np.array(y_test_n)

In [17]:
# Blok 15 - evaluacia modelu na testovacich datach so sumom
scores_n = multi_model_radius.evaluate(X_test_n, y_test_n)
print('Multi NN model - radius - data with noise -- Loss: {:.4f}, MAE: {:.4f}'.format(scores_n[0], scores_n[1]))

Multi NN model - radius - data with noise -- Loss: 2.5135, MAE: 0.9997


## Predikcie - data bez sumu

In [18]:
# Blok 16 - predikcia na datach bez sumu
pred_multi = multi_model_radius.predict(X_test)
pred_multi[0]



array([1.0002089, 1.0022469, 0.9973445, 1.0024695, 1.0025884],
      dtype=float32)

In [19]:
# Blok 17 - vytvorenie df z predikcii, vypocet priemernych hodnot
predictions_df = pd.DataFrame(pred_multi,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary_radius",
                            "secondary_radius",
                            "t1_t2"
                            ])
pred_mean = predictions_df.mean(axis=0)
pred_mean

inclination         1.348574
mass_ratio          1.349623
primary_radius      1.348139
secondary_radius    1.349457
t1_t2               1.349436
dtype: float32

In [20]:
# Blok 18 - vytvorenie df z testovacich dat, vypocet priemernych hodnot
test_df = pd.DataFrame(y_test,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__equivalent_radius",
                            "secondary__equivalent_radius",
                            "t1_t2"
                            ])
test_df = test_df.mean(axis=0)
test_df

inclination                     1.375976
mass_ratio                      1.755052
primary__equivalent_radius      0.176406
secondary__equivalent_radius    0.199077
t1_t2                           2.656609
dtype: float64

In [21]:
# Blok 19 - vytvorenie df pre porovnanie priemernych skutocnych a predikovanych hodnot
eval_pred = pd.DataFrame({'attribute': test_df.index,
            'avg_true': test_df.values,
            'avg_pred': pred_mean.values,
            'MAE': abs(test_df.values - pred_mean.values)})
eval_pred

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.375976,1.348574,0.027402
1,mass_ratio,1.755052,1.349623,0.405429
2,primary__equivalent_radius,0.176406,1.348139,1.171734
3,secondary__equivalent_radius,0.199077,1.349457,1.15038
4,t1_t2,2.656609,1.349436,1.307174


## Predikcie - data so sumom

In [23]:
# Blok 20 - predikcia na datach so sumom
pred_multi_n= multi_model_radius.predict(X_test_n)
pred_multi_n[0]



array([1.0206811, 1.0229884, 1.0206231, 1.0221703, 1.0221556],
      dtype=float32)

In [24]:
# Blok 21 - vytvorenie df z predikcii, vypocet priemernych hodnot
predictions_n_df = pd.DataFrame(pred_multi_n,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary_radius",
                            "secondary_radius",
                            "t1_t2"
                            ])
pred_mean_n = predictions_n_df.mean(axis=0)
pred_mean_n

inclination         1.349129
mass_ratio          1.350171
primary_radius      1.348709
secondary_radius    1.350015
t1_t2               1.349980
dtype: float32

In [25]:
# Blok 22 - vytvorenie df z testovacich dat, vypocet priemernych hodnot
test_df_n = pd.DataFrame(y_test_n,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"
                            ])
test_df_n = test_df_n.mean(axis=0)
test_df_n

inclination                     1.375976
mass_ratio                      1.755052
primary__surface_potential      0.176406
secondary__surface_potential    0.199077
t1_t2                           2.656609
dtype: float64

In [26]:
# Blok 23 - vytvorenie df pre porovnanie priemernych skutocnych a predikovanych hodnot
eval_pred_n = pd.DataFrame({'attribute': test_df_n.index,
            'avg_true': test_df_n.values,
            'avg_pred': pred_mean_n.values,
            'MAE': abs(test_df_n.values - pred_mean_n.values)})
eval_pred_n

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.375976,1.349129,0.026847
1,mass_ratio,1.755052,1.350171,0.404881
2,primary__surface_potential,0.176406,1.348709,1.172303
3,secondary__surface_potential,0.199077,1.350015,1.150938
4,t1_t2,2.656609,1.34998,1.306629
