## Predikcia vybranych parametrov pre overcontact data
### Predikcia parametrov inclination, mass ratio, temperature ratio, primary potential, secondary potential

In [1]:
# Blok 1 - nacitanie kniznic
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from keras.utils import np_utils
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D
from keras.layers import Input, Dense, concatenate, Activation, LSTM, Dropout, Flatten
from keras.models import Model
from keras.layers.merge import Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

np.random.seed(1234)
pd.set_option('display.max_rows', None)

In [4]:
# Blok 2 - Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora.
def generate_observation_sigma(space_obs_frac=0.5):
    """
    Draws a standard deviation of noise in light curve points from a "true" value provided in synthetic light curve.
    Noise sigma is drawn from bimodal distribution taking into account contributions from space based and earth based
    observations which have different levels of stochastic noise.

    :param space_obs_frac: ratio between earth based and space based observations
    :return: float; standard deviation of the light curve noise
    """
    earth_based_sigma = 4e-3
    space_based_sigma = 2e-4
    sigma = np.random.choice([earth_based_sigma, space_based_sigma], p=[1-space_obs_frac, space_obs_frac])
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    :param curve: numpy.array; normalized light curve
    :return: Tuple(numpy.array, float); normalized light curve with added noise, standard deviation of observations
    """
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

## Data loading

In [5]:
# Blok 3 - nacitanie dat
data = pd.read_pickle("overcontact_all_parameters.pkl").reset_index()

In [6]:
# Blok 4 - vyselektovanie nahodnej vzorky dat o velkost 300 000 zaznamov
data_sample = data.sample(n=300000)

In [8]:
# Blok 5 - vytvorenie pola kriviek
X = []
for row in data_sample["curve"]:
    X.append(row)
X=np.array(X)

In [9]:
# Blok 6 - vyselektovanie features, ktore budeme predikovat
y = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1/t2"]])

In [10]:
# Blok 7 -zadefinovanie scalera na minmax normalizaciu
scaler = MinMaxScaler()
y_minmax_scaled = scaler.fit_transform(y)
y_minmax_scaled[0]

array([0.79148929, 0.49494949, 0.53031622, 0.53031622, 0.        ])

In [11]:
# Blok 8 - rozdelenie dat na trenovaciu a testovaciu mnozinu v pomere 80/20
X_train1, X_test, y_train1, y_test = train_test_split(X, y_minmax_scaled, test_size=0.2)

In [12]:
# Blok 9 - pridanie sumu do trenovacich normovanych dat
X_train_n = []
y_train_n = []
for i in range(len(X_train1)):
    for j in range(3):
        curve = stochastic_noise_generator(X_train1[i])
        X_train_n.append(curve[0])
        y_train_n.append(y_train1[i])
X_train_n = np.array(X_train_n)
y_train_n=np.array(y_train_n)

In [13]:
# Blok 10 - vypis poctu zaznamov v jednotlivych datovych mnozinach
print("Number of records in dataset: ", len(data),
    "\nNumber of records in sample: ", len(X),
    "\nNumber of train data without noise: ", len(X_train1),
    "\nNumber of train data with noise: ", len(X_train_n),
    "\nNumber of test data without noise: ", len(X_test))

Number of records in dataset:  1212796 
Number of records in sample:  300000 
Number of train data without noise:  240000 
Number of train data with noise:  720000 
Number of test data without noise:  60000


## Model

In [14]:
# Blok 11 - zadefinovanie architektury modelu 
inputs = Input(shape=(400, 1))
b = Conv1D(64, kernel_size = 3, padding = "valid")(inputs)
b = MaxPooling1D(2)(b)
b = Dropout(0.2)(b)
b = LSTM(64, return_sequences=True)(b)
b = Flatten()(b)
b = Dense(64, activation='relu')(b)
x = Dense(32, activation='relu')(b)
output = Dense(5, activation='linear')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='mse', optimizer='adam', metrics=["mae", "mape"])

saved_model = "models/norm_overcontact_selection.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor = 'val_mae', verbose = 1, save_best_only = True, mode = 'min')
early = EarlyStopping(monitor = "val_mae", mode = "min", patience = 25)
callbacks_list = [checkpoint, early]

print(model.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 400, 1)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 398, 64)           256       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 199, 64)           0         
_________________________________________________________________
dropout (Dropout)            (None, 199, 64)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 199, 64)           33024     
_________________________________________________________________
flatten (Flatten)            (None, 12736)             0         
_________________________________________________________________
dense (Dense)                (None, 64)               

In [16]:
# Blok 12 - trenovanie modelu
history = model.fit(X_train_n, y_train_n, validation_split = 0.1, epochs = 10, verbose = 1, callbacks = callbacks_list, batch_size = 64)

Epoch 1/10
Epoch 00001: val_mae improved from inf to 0.04150, saving model to models\norm_overcontact_selection.hdf5
Epoch 2/10
Epoch 00002: val_mae improved from 0.04150 to 0.03797, saving model to models\norm_overcontact_selection.hdf5
Epoch 3/10
Epoch 00003: val_mae improved from 0.03797 to 0.03612, saving model to models\norm_overcontact_selection.hdf5
Epoch 4/10
Epoch 00004: val_mae did not improve from 0.03612
Epoch 5/10
Epoch 00005: val_mae improved from 0.03612 to 0.03433, saving model to models\norm_overcontact_selection.hdf5
Epoch 6/10
Epoch 00006: val_mae did not improve from 0.03433
Epoch 7/10
Epoch 00007: val_mae improved from 0.03433 to 0.03414, saving model to models\norm_overcontact_selection.hdf5
Epoch 8/10
Epoch 00008: val_mae did not improve from 0.03414
Epoch 9/10
Epoch 00009: val_mae did not improve from 0.03414
Epoch 10/10
Epoch 00010: val_mae improved from 0.03414 to 0.03358, saving model to models\norm_overcontact_selection.hdf5


In [14]:
# Blok 13 - nacitanie modelu
model = load_model("models/norm_overcontact_selection.hdf5")

## Model evaluation on normalized test data

In [16]:
# Blok 14 - evaluacia modelu na testovacich normovanych datach bez sumu
scores = model.evaluate(X_test, y_test)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores[0], scores[1]))

Loss: 0.0038, MAE: 0.0322


In [15]:
# Blok 15 - pridanie sumu do testovacich normovanych dat
X_test_n = []
y_test_norm_n = []
for i in range(len(X_test)):
    for j in range(3):
        curve = stochastic_noise_generator(X_test[i])
        X_test_n.append(curve[0])
        y_test_norm_n.append(y_test[i])
        j += 1
X_test_n = np.array(X_test_n)
y_test_norm_n = np.array(y_test_norm_n)

In [17]:
# Blok 16 - evaluacia moodelu na testovacich normovanych datach so sumom
scores_n = model.evaluate(X_test_n, y_test_norm_n)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores_n[0], scores_n[1]))

Loss: 0.0040, MAE: 0.0336


## Prediction on normalized test data without noise + inverse normalization

In [18]:
# Blok 17 - predikcia na datach bez sumu
y_pred_norm = model.predict(X_test)

In [20]:
# Blok 18 - spatna normalizacia predikcii
denorm = scaler.inverse_transform(y_pred_norm)
denorm[0]

array([1.00987  , 1.2422509, 3.814539 , 3.813457 , 1.0802377],
      dtype=float32)

In [21]:
# Blok 19 - vytvorenie df z denormovanych predikcii 
denorm_pred_df = pd.DataFrame(denorm,
                           columns = [
                                "inclination",
                                "mass_ratio",
                                "primary__surface_potential",
                                "secondary__surface_potential",
                                "t1_t2"
                            ])
denorm_pred_df.head()

Unnamed: 0,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2
0,1.00987,1.242251,3.814539,3.813457,1.080238
1,1.434893,0.897172,3.358399,3.383627,1.033528
2,1.009152,0.99171,3.349905,3.248664,1.077547
3,1.329317,0.905684,3.49023,3.525199,1.076381
4,0.887441,0.914283,3.264172,3.233231,1.063057


In [22]:
# Blok 20 - vypocet priemernych hodnot
pred_mean = denorm_pred_df.mean(axis=0)
pred_mean

inclination                     1.207262
mass_ratio                      1.460771
primary__surface_potential      4.067463
secondary__surface_potential    4.059316
t1_t2                           1.045363
dtype: float32

In [23]:
# Blok 21 - vytvorenie df z testovaich dat bez sumu, vypocet priemernych hodnot
y_test_df = pd.DataFrame(y,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"
                            ])
test_mean = y_test_df.mean(axis=0)
test_mean

inclination                     1.222845
mass_ratio                      1.422204
primary__surface_potential      3.987477
secondary__surface_potential    3.987477
t1_t2                           1.048233
dtype: float64

In [24]:
# Blok 22 - vytvorenie df na porovnanie priemernych predikovanych a skutocnych hodnot
eval_pred = pd.DataFrame({'attribute': test_mean.index,
            'avg_true': test_mean.values,
            'avg_pred': pred_mean.values,
            'MAE': abs(test_mean.values - pred_mean.values)})
eval_pred

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.222845,1.207262,0.015583
1,mass_ratio,1.422204,1.460771,0.038567
2,primary__surface_potential,3.987477,4.067463,0.079986
3,secondary__surface_potential,3.987477,4.059316,0.071839
4,t1_t2,1.048233,1.045363,0.002869


## Prediction on normalized test data with noise + inverse normalization

In [25]:
# Blok 23 - predikcia na testovaich datach so sumom
y_pred_norm_n = model.predict(X_test_n)

In [26]:
# Blok 24 - spatna normalizacia predikcii
denorm_n = scaler.inverse_transform(y_pred_norm_n)
denorm_n[0]

array([1.0137995, 1.2389269, 3.8086123, 3.8100085, 1.0807337],
      dtype=float32)

In [27]:
# Blok 25 - vytvorenie df z denormovanych predikcii
denorm_pred_n_df = pd.DataFrame(denorm_n,
                            columns = [
                                "inclination",
                                "mass_ratio",
                                "primary__surface_potential",
                                "secondary__surface_potential",
                                "t1_t2"
                            ])
denorm_pred_n_df.head()

Unnamed: 0,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2
0,1.0138,1.238927,3.808612,3.810009,1.080734
1,1.009084,1.247791,3.822558,3.821386,1.079962
2,1.010949,1.236768,3.806676,3.805421,1.079818
3,1.434906,0.89939,3.361198,3.386827,1.033581
4,1.435269,0.87211,3.324525,3.346521,1.032596


In [28]:
# Blok 26 - vypocet priemernych hodnot denormovanych predikcii na datach so sumom
pred_n_mean = denorm_pred_n_df.mean(axis=0)
pred_n_mean

inclination                     1.207670
mass_ratio                      1.463276
primary__surface_potential      4.071081
secondary__surface_potential    4.063103
t1_t2                           1.045390
dtype: float32

In [30]:
# Blok 27 - spatna normalizacia testovaich dat, vytvorenie df a vypocet priemernych hodnot
y_test_n_denorm = scaler.inverse_transform(y_test_norm_n)
y_test_norm_n_df = pd.DataFrame(y_test_n_denorm,
                            columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"
                            ])
test_mean_n = y_test_norm_n_df.mean(axis=0)
test_mean_n

inclination                     1.223116
mass_ratio                      1.434141
primary__surface_potential      4.002988
secondary__surface_potential    4.002988
t1_t2                           1.048256
dtype: float64

In [31]:
# Blok 28 - vytvorenie df na porovnanie priemernych skutocnych hodnot a predikcii
eval_pred = pd.DataFrame({'attribute': test_mean_n.index,
            'avg_true': test_mean_n.values,
            'avg_pred': pred_n_mean.values,
            'MAE': abs(test_mean_n.values - pred_n_mean.values)})
eval_pred

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.223116,1.20767,0.015446
1,mass_ratio,1.434141,1.463276,0.029135
2,primary__surface_potential,4.002988,4.071081,0.068094
3,secondary__surface_potential,4.002988,4.063103,0.060115
4,t1_t2,1.048256,1.04539,0.002866
