## Predikcia vybranych parametrov
### Detached krivky

### Predikcia inclination, mass ratio, temperature ratio, potentials

In [1]:
# LIBRARIES
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler


In [2]:
from keras.utils import np_utils
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D
from keras.layers import Input, Dense, concatenate, Activation, LSTM, Dropout, Flatten
from keras.models import Model
from keras.layers.merge import Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
np.random.seed(1234)
pd.set_option('display.max_rows', None)

In [4]:
def generate_observation_sigma(space_obs_frac=0.5):
    """
    Draws a standard deviation of noise in light curve points from a "true" value provided in synthetic light curve.
    Noise sigma is drawn from bimodal distribution taking into account contributions from space based and earth based
    observations which have different levels of stochastic noise.

    :param space_obs_frac: ratio between earth based and space based observations
    :return: float; standard deviation of the light curve noise
    """
    earth_based_sigma = 4e-3
    space_based_sigma = 2e-4
    sigma = np.random.choice([earth_based_sigma, space_based_sigma], p=[1-space_obs_frac, space_obs_frac])
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    :param curve: numpy.array; normalized light curve
    :return: Tuple(numpy.array, float); normalized light curve with added noise, standard deviation of observations
    """
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

## Data loading

In [5]:
data = pd.read_pickle("detached_all_parameters.pkl").reset_index()

In [6]:
data_sample = data.sample(n=300000)

In [7]:
list(data_sample.columns)

['index',
 'id',
 'curve',
 'primary__t_eff',
 'secondary__t_eff',
 'inclination',
 'mass_ratio',
 'primary__surface_potential',
 'secondary__surface_potential',
 't1_t2',
 'filter',
 'critical_surface_potential',
 'primary__equivalent_radius',
 'secondary__equivalent_radius',
 'primary__filling_factor',
 'secondary__filling_factor']

In [8]:
X = []
for row in data_sample["curve"]:
    X.append(row)
X=np.array(X)

In [9]:
y = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2"]])

In [10]:
# MinMax Scaler
scaler = MinMaxScaler()
y_minmax_scaled = scaler.fit_transform(y)
y_minmax_scaled[0]

array([0.71584635, 0.05050505, 0.17208073, 0.0020424 , 0.7804878 ])

In [11]:
X_train1, X_test, y_train1, y_test = train_test_split(X, y_minmax_scaled, test_size=0.2)

In [12]:
# Adding noise to train data
X_train_n = []
y_train_n = []
for i in range(len(X_train1)):
    for j in range(3):
        curve = stochastic_noise_generator(X_train1[i])
        X_train_n.append(curve[0])
        y_train_n.append(y_train1[i])
X_train_n = np.array(X_train_n)
y_train_n=np.array(y_train_n)

In [13]:
print("Number of records in dataset: ", len(data),
    "\nNumber of records in sample: ", len(X),
    "\nNumber of train data without noise: ", len(X_train1),
    "\nNumber of train data with noise: ", len(X_train_n),
    "\nNumber of test data without noise: ", len(X_test))

Number of records in dataset:  1300000 
Number of records in sample:  300000 
Number of train data without noise:  240000 
Number of train data with noise:  720000 
Number of test data without noise:  60000


## Model

In [19]:
inputs = Input(shape=(400, 1))
b = Conv1D(64, kernel_size = 3, padding = "valid")(inputs)
b = MaxPooling1D(2)(b)
b = Dropout(0.2)(b)
b = LSTM(64, return_sequences=True)(b)
b = Flatten()(b)
b = Dense(64, activation='relu')(b)
x = Dense(32, activation='relu')(b)
output = Dense(5, activation='linear')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='mse', optimizer='adam', metrics=["mae", "mape"])
print(model.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 400, 1)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 398, 64)           256       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 199, 64)           0         
_________________________________________________________________
dropout (Dropout)            (None, 199, 64)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 199, 64)           33024     
_________________________________________________________________
flatten (Flatten)            (None, 12736)             0         
_________________________________________________________________
dense (Dense)                (None, 64)               

In [20]:
saved_model = "models/norm_detached_selection.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor = 'val_mae', verbose = 1, save_best_only = True, mode = 'min')
early = EarlyStopping(monitor = "val_mae", mode = "min", patience = 25)
callbacks_list = [checkpoint, early]

In [21]:
history = model.fit(X_train_n, y_train_n, validation_split = 0.1, epochs = 10, verbose = 1, callbacks = callbacks_list, batch_size = 64)

Epoch 1/10
Epoch 00001: val_mae improved from inf to 0.07035, saving model to models\norm_detached_selection.hdf5
Epoch 2/10
Epoch 00002: val_mae improved from 0.07035 to 0.06169, saving model to models\norm_detached_selection.hdf5
Epoch 3/10
Epoch 00003: val_mae improved from 0.06169 to 0.06023, saving model to models\norm_detached_selection.hdf5
Epoch 4/10
Epoch 00004: val_mae improved from 0.06023 to 0.05767, saving model to models\norm_detached_selection.hdf5
Epoch 5/10
Epoch 00005: val_mae improved from 0.05767 to 0.05457, saving model to models\norm_detached_selection.hdf5
Epoch 6/10
Epoch 00006: val_mae did not improve from 0.05457
Epoch 7/10
Epoch 00007: val_mae improved from 0.05457 to 0.05361, saving model to models\norm_detached_selection.hdf5
Epoch 8/10
Epoch 00008: val_mae improved from 0.05361 to 0.05269, saving model to models\norm_detached_selection.hdf5
Epoch 9/10
Epoch 00009: val_mae improved from 0.05269 to 0.05211, saving model to models\norm_detached_selection.hdf5

In [14]:
model = load_model("models/norm_detached_selection.hdf5")

## Model evaluation on  normalized test data

In [18]:
# Normalized test data without noise
scores = model.evaluate(X_test, y_test)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores[0], scores[1]))

Loss: 0.0094, MAE: 0.0479


In [19]:
# Adding noise to normalized test data
X_test_n = []
y_test_norm_n = []
for i in range(len(X_test)):
    for j in range(3):
        curve = stochastic_noise_generator(X_test[i])
        X_test_n.append(curve[0])
        y_test_norm_n.append(y_test[i])
        j += 1
X_test_n = np.array(X_test_n)
y_test_norm_n = np.array(y_test_norm_n)

In [20]:
# Normalized test data with noise
scores_n = model.evaluate(X_test_n, y_test_norm_n)
print('Loss: {:.4f}, MAE: {:.4f}'.format(scores_n[0], scores_n[1]))

Loss: 0.0104, MAE: 0.0504


## Prediction on normalized test data without noise + inverse normalization

In [21]:
y_pred_norm = model.predict(X_test)

In [22]:
y_pred_norm[0]

array([ 0.4470291 ,  0.06648329,  0.00868994, -0.0027322 ,  0.07256927],
      dtype=float32)

In [23]:
denorm = scaler.inverse_transform(y_pred_norm)
denorm[0]

array([ 1.1101327,  0.7581846,  2.9582992, -0.6770748,  1.743835 ],
      dtype=float32)

In [25]:
denorm_pred_df = pd.DataFrame(denorm,
                           columns = [
                                "inclination",
                                "mass_ratio",
                                "primary__surface_potential",
                                "secondary__surface_potential",
                                "t1_t2"
                            ])
denorm_pred_df.head()

Unnamed: 0,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2
0,1.110133,0.758185,2.958299,-0.677075,1.743835
1,1.383782,1.111504,5.783891,7.699143,1.429892
2,1.154522,1.166941,2.625816,7.953134,2.057881
3,1.247581,1.320555,5.959751,4.120866,1.3106
4,1.432467,1.215662,12.611739,15.475405,2.412863


In [26]:
pred_mean = denorm_pred_df.mean(axis=0)
pred_mean

inclination                      1.375138
mass_ratio                       1.699958
primary__surface_potential      16.951014
secondary__surface_potential    15.650300
t1_t2                            2.704694
dtype: float32

In [27]:
y_test_df = pd.DataFrame(y,
                        columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"
                            ])
test_mean = y_test_df.mean(axis=0)
test_mean

inclination                      1.375399
mass_ratio                       1.753314
primary__surface_potential      18.080464
secondary__surface_potential    16.713742
t1_t2                            2.653047
dtype: float64

In [28]:
eval_pred = pd.DataFrame({'attribute': test_mean.index,
            'avg_true': test_mean.values,
            'avg_pred': pred_mean.values,
            'MAE': abs(test_mean.values - pred_mean.values)})
eval_pred

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.375399,1.375138,0.000261
1,mass_ratio,1.753314,1.699958,0.053357
2,primary__surface_potential,18.080464,16.951014,1.12945
3,secondary__surface_potential,16.713742,15.6503,1.063442
4,t1_t2,2.653047,2.704694,0.051647


## Prediction on normalized test data with noise + inverse normalization

In [29]:
y_pred_norm_n = model.predict(X_test_n)

In [30]:
denorm_n = scaler.inverse_transform(y_pred_norm_n)
denorm_n[0]

array([ 1.1112685 ,  0.755074  ,  2.9488156 , -0.70702505,  1.7694119 ],
      dtype=float32)

In [31]:
denorm_pred_n_df = pd.DataFrame(denorm_n,
                            columns = [
                                "inclination",
                                "mass_ratio",
                                "primary__surface_potential",
                                "secondary__surface_potential",
                                "t1_t2"
                            ])
denorm_pred_n_df.head()

Unnamed: 0,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2
0,1.111269,0.755074,2.948816,-0.707025,1.769412
1,1.16178,0.932129,4.102746,3.328862,2.261246
2,1.110087,0.757919,2.957381,-0.680879,1.744319
3,1.383999,1.021811,5.351217,7.759612,1.431372
4,1.380568,1.251057,6.230495,8.264846,1.418277


In [32]:
pred_n_mean = denorm_pred_n_df.mean(axis=0)
pred_n_mean

inclination                      1.374322
mass_ratio                       1.706649
primary__surface_potential      16.950588
secondary__surface_potential    15.754042
t1_t2                            2.710346
dtype: float32

In [34]:
y_test_n_denorm = scaler.inverse_transform(y_test_norm_n)
y_test_norm_n_df = pd.DataFrame(y_test_n_denorm,
                            columns = [
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"
                            ])
test_mean_n = y_test_norm_n_df.mean(axis=0)
test_mean_n

inclination                      1.375144
mass_ratio                       1.759717
primary__surface_potential      18.140358
secondary__surface_potential    16.718059
t1_t2                            2.654019
dtype: float64

In [35]:
eval_pred = pd.DataFrame({'attribute': test_mean_n.index,
            'avg_true': test_mean_n.values,
            'avg_pred': pred_n_mean.values,
            'MAE': abs(test_mean_n.values - pred_n_mean.values)})
eval_pred

Unnamed: 0,attribute,avg_true,avg_pred,MAE
0,inclination,1.375144,1.374322,0.000822
1,mass_ratio,1.759717,1.706649,0.053068
2,primary__surface_potential,18.140358,16.950588,1.18977
3,secondary__surface_potential,16.718059,15.754042,0.964018
4,t1_t2,2.654019,2.710346,0.056327
