## Skript pre porovnanie predikcii vsetkych modelov
Modely trenovane na detached krivkach

In [4]:
# Blok 1
# Nacitanie kniznic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import load_model

np.random.seed(1234)
pd.set_option('display.max_rows', None)

In [3]:
# BLOK 2
# Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora.

def generate_observation_sigma(space_obs_frac=0.5):
    """
    Draws a standard deviation of noise in light curve points from a "true" value provided in synthetic light curve.
    Noise sigma is drawn from bimodal distribution taking into account contributions from space based and earth based
    observations which have different levels of stochastic noise.

    :param space_obs_frac: ratio between earth based and space based observations
    :return: float; standard deviation of the light curve noise
    """
    earth_based_sigma = 4e-3
    space_based_sigma = 2e-4
    sigma = np.random.choice([earth_based_sigma, space_based_sigma], p=[1-space_obs_frac, space_obs_frac])
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    :param curve: numpy.array; normalized light curve
    :return: Tuple(numpy.array, float); normalized light curve with added noise, standard deviation of observations
    """
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

In [5]:
# BLOK 3 - MODELY
# 1. Model pre predikciu vsetkych parametrov
## Vystup - vsetky parametre, trenovany na ORIGINAL datach so sumom
det_all_params = load_model("models/detached_allParams.hdf5")

# 2. Model pre predikciu vsetkych parametrov
## Vystup - vsetky parametre, trenovany na NORMOVANYCH datach so sumom
det_all_params_norm = load_model("models/norm_detached_all_params.hdf5")

# 3. Modely pre predikciu jednotlivych parametrov samostatne
## Kazdy model ma na vystupe prislusny 1 parameter, modely trenovane na ORIGINAL datach so sumom
det_inc = load_model("models/detached_inclination.hdf5")        # inclination
det_mass_ratio = load_model("models/detached_mass_ratio.hdf5")  # mass ratio
det_prim_potent = load_model("models/detached_pSP.hdf5")        # primary surface potential
det_sec_potent = load_model("models/detached_sSP.hdf5")         # secondary surface potential
det_temp_ratio = load_model("models/detached_t1_t2.hdf5")       # temperature ratio

# 4. Modely pre predikciu vybranych parametrov - trenovany na normovanych krivkach - 1 model
## Vystup - 5 parametrov - inc, mass, prim. potential, sec. potential, temp. ratio
## vystupom su normovane krivky - potrebne denormovat
det_selected_params_norm = load_model("models/norm_detached_sel_v4.hdf5")

# 5. Multi NN model - trenovany na normovanych krivkach
## vystupom je 5 parametrov - inc, mass, prim. potential, sec. potential, temp. ratio
## vystupom su normovane krivky - potrebne denormovat
det_multi_nn_norm = load_model("models/det_multiNN_norm_model.hdf5")

# 6. Multi NN del - trenovany na original krivkach
## vystupom je 5 parametrov - inc, mass, prim. radius, sec. radius, temp. ratio
## vystupom su normovane krivky - potrebne denormovat
det_multi_radius_nn = load_model("models/det_multiNN_radius_model.hdf5")

In [6]:
# BLOK 4 - Nacitanie dat
data = pd.read_pickle("detached_all_parameters.pkl").reset_index()
data_sample = data.sample(n=10000)
data_sample.head()

Unnamed: 0,index,id,curve,primary__t_eff,secondary__t_eff,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential,t1_t2,filter,critical_surface_potential,primary__equivalent_radius,secondary__equivalent_radius,primary__filling_factor,secondary__filling_factor
647973,647973,10032749,"[0.48835963039886077, 0.4913287663336375, 0.49...",45000,5000,1.334076,0.6,20.601251,4.071076,9.0,Bessell_B,3.063442,0.049981,0.210724,-49.955205,-2.870176
1109932,1109932,16384835,"[0.5932746668156491, 0.594846617739326, 0.5994...",12000,5000,1.310729,1.111111,7.008254,5.425004,2.4,SLOAN_u,3.928447,0.170106,0.25046,-5.570841,-2.707013
731380,731380,10579972,"[0.0559029780103817, 0.05590501277853746, 0.05...",10000,5000,1.427398,1.666667,9.3676,5.812,2.0,Bessell_U,4.772403,0.130062,0.330428,-7.853457,-1.776732
692703,692703,10341530,"[0.9719087814691745, 0.9719390320925176, 0.971...",9000,5000,1.223879,1.666667,9.3676,8.640628,1.8,GaiaDR2,4.772403,0.130062,0.209983,-7.853457,-6.611019
1145336,1145336,16615611,"[0.8654052436804602, 0.8654499986635169, 0.865...",20000,16000,1.180696,0.9,3.676833,19.051127,1.25,Kepler,3.585603,0.373969,0.049982,-0.183357,-31.082753


In [7]:
# BLOK 5 - Svetelne krivky a parametre. Pridanie sumu.
X = []
for row in data_sample["curve"]:
    X.append(row)
X=np.array(X)

y = np.array(data_sample[[
    "primary__t_eff",
    "secondary__t_eff",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2",
    "critical_surface_potential",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "primary__filling_factor",
    "secondary__filling_factor"]])

X_n = []
y_n = []
for i in range(len(X)):
    for j in range(3):
        curve = stochastic_noise_generator(X[i])
        X_n.append(curve[0])
        y_n.append(y[i])
X_n = np.array(X_n)
y_n=np.array(y_n)

In [9]:
# BLOK 6 - vyselektovanie parametrov
parameters = pd.DataFrame(data=y_n, columns=[
    "primary__t_eff",
    "secondary__t_eff",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2",
    "critical_surface_potential",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "primary__filling_factor",
    "secondary__filling_factor"])

In [14]:
# BLOK 7 - Vypocet priemernych skutocnych hodnot pre potreby vyhodnotenia
test_1 = pd.DataFrame(data=parameters[[
                            "inclination",
                            "mass_ratio",
                            "primary__surface_potential",
                            "secondary__surface_potential",
                            "t1_t2"]])
test_mean_1 = test_1.mean(axis=0)
print("Priemerne hodnoty test1: \n", test_mean_1)

test_2 = pd.DataFrame(data=parameters[[
                            "inclination",
                            "mass_ratio",
                            "primary__equivalent_radius",
                            "secondary__equivalent_radius",
                            "t1_t2"]])
test_mean_2 = test_2.mean(axis=0)
print("Priemerne hodnoty test2: \n", test_mean_2)

Priemerne hodnoty test1: 
 inclination                      1.374078
mass_ratio                       1.738379
primary__surface_potential      18.352879
secondary__surface_potential    15.764617
t1_t2                            2.635565
dtype: float64
Priemerne hodnoty test2: 
 inclination                     1.374078
mass_ratio                      1.738379
primary__equivalent_radius      0.176892
secondary__equivalent_radius    0.200262
t1_t2                           2.635565
dtype: float64


## Priprava parametrov pre jednotlive modely

In [15]:
# BLOK 8  - parametre pre 1. Model - "models/detached_allParams.hdf5" - original data so sumom
y_model1 = np.array(parameters[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2",
    "critical_surface_potential",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "primary__filling_factor",
    "secondary__filling_factor"]])

In [16]:
# BLOK 8 - parametre pre 2. model - "models/norm_detached_all_params.hdf5" - original data so sumom + normalizacia

y_model2 = np.array(parameters[[
    "primary__t_eff",
    "secondary__t_eff",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2",
    "critical_surface_potential",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "primary__filling_factor",
    "secondary__filling_factor"]])

scaler_model2 = MinMaxScaler()
y_normed_model2 = scaler_model2.fit_transform(y_model2)

In [17]:
# BLOK 9 - parametre pre 3. Model - oddelene modely - original data so sumom
y_inc = np.array(parameters["inclination"])
y_mass_ratio = np.array(parameters["mass_ratio"])
y_psp = np.array(parameters["primary__surface_potential"])
y_ssp = np.array(parameters["secondary__surface_potential"])
y_t2_t1 = np.array(parameters["t1_t2"])

In [18]:
# BLOK 10 - parametre pre 4. model - "models/norm_detached_sel_v4.hdf5" - original data so sumom + normalizacia
y_model4 = np.array(parameters[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2"]])
scaler_model4 = MinMaxScaler()
y_normed_model4 = scaler_model4.fit_transform(y_model4)

In [19]:
# BLOK 11 - parametre pre 5. model - "models/det_multiNN_norm_model.hdf5" - original data so sumom + normalizacia
y_model5 = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
    "t1_t2"]])

y_inc_model5 = np.array([arr[0] for arr in y_model5])
y_mass_model5 = np.array([arr[1] for arr in y_model5])
y_prim_potent_model5 = np.array([arr[2] for arr in y_model5])
y_sec_potent_model5 = np.array([arr[3] for arr in y_model5])
y_temp_ratio_model5 = np.array([arr[4] for arr in y_model5])

y_inc_model5 = y_inc_model5.reshape(-1,1)
y_mass_model5 = y_mass_model5.reshape(-1,1)
y_prim_potent_model5 = y_prim_potent_model5.reshape(-1,1)
y_sec_potent_model5 = y_sec_potent_model5.reshape(-1,1)
y_temp_ratio_model5 = y_temp_ratio_model5.reshape(-1,1)

inc_model5_scaler = MinMaxScaler()
mass_model5_scaler = MinMaxScaler()
prim_potent_model5_scaler = MinMaxScaler()
sec_potent_model5_scaler = MinMaxScaler()
temp_ratio_model5_scaler = MinMaxScaler()

y_inc_model5_scaled = inc_model5_scaler.fit_transform(y_inc_model5)
y_mass_model5_scaled = mass_model5_scaler.fit_transform(y_mass_model5)
y_prim_potent_model5_scaled = prim_potent_model5_scaler.fit_transform(y_prim_potent_model5)
y_sec_potent_model5_scaled = sec_potent_model5_scaler.fit_transform(y_sec_potent_model5)
y_temp_ratio_model5_scaled = temp_ratio_model5_scaler.fit_transform(y_temp_ratio_model5)

y_inc_model5_scaled = np.array(y_inc_model5_scaled)
y_mass_model5_scaled = np.array(y_mass_model5_scaled)
y_prim_potent_model5_scaled = np.array(y_prim_potent_model5_scaled)
y_sec_potent_model5_scaled = np.array(y_sec_potent_model5_scaled)
y_temp_ratio_model5_scaled = np.array(y_temp_ratio_model5_scaled)

In [20]:
# BLOK 12 - parametre pre 6. model - "models/det_multiNN_radius_model.hdf5" - original data so sumom
y_model6 = np.array(data_sample[[
    "inclination",
    "mass_ratio",
    "primary__equivalent_radius",
    "secondary__equivalent_radius",
    "t1_t2"]])

y_inc_model6 = np.array([arr[0] for arr in y_model6])
y_mass_model6 = np.array([arr[1] for arr in y_model6])
y_prim_radius_model6 = np.array([arr[2] for arr in y_model6])
y_sec_radius_model6 = np.array([arr[3] for arr in y_model6])
y_temp_ratio_model6 = np.array([arr[4] for arr in y_model6])

## Predikcie

In [21]:
# BLOK 13 - predikcie - 1. Model - predikcia vsetkych parametrov
## Vystup - vsetky parametre
model1_pred = det_all_params.predict(X_n)
model1_pred = pd.DataFrame(model1_pred,
                            columns = [
                                "P_inclination",
                                "P_mass_ratio",
                                "P_primary__surface_potential",
                                "P_secondary__surface_potential",
                                "P_t1_t2",
                                "P_critical_surface_potential",
                                "P_primary__equivalent_radius",
                                "P_secondary__equivalent_radius",
                                "P_primary__filling_factor",
                                "P_secondary__filling_factor"])
model1_pred = model1_pred[["P_inclination", "P_mass_ratio","P_primary__surface_potential","P_secondary__surface_potential","P_t1_t2"]]
pred_mean_model1 = model1_pred.mean(axis=0)
pred_mean_model1


P_inclination                      1.379104
P_mass_ratio                       1.578713
P_primary__surface_potential      17.479206
P_secondary__surface_potential    17.599979
P_t1_t2                            2.696397
dtype: float32

In [23]:
# BLOK 14 - 2. Model - predikcia vsetkych parametrov
## Vystup - vsetky parametre - potrebne denormalizovat
model2_pred = det_all_params_norm(X_n)
pred_model2_denorm = scaler_model2.inverse_transform(model2_pred)
denorm_pred_model2 = pd.DataFrame(pred_model2_denorm,
                            columns = [
                                "P_prim__t_eff",
                                "P_sec__t_eff",
                                "P_inclination",
                                "P_mass_ratio",
                                "P_primary__surface_potential",
                                "P_secondary__surface_potential",
                                "P_t1_t2",
                                "P_critical_surface_potential",
                                "P_primary_equivalent_radius",
                                "P_secondary_equivalent_radius",
                                "P_primary_filling_factor",
                                "P_secondary_filling_factor"
                            ])
denorm_pred_model2 = denorm_pred_model2[["P_inclination", "P_mass_ratio","P_primary__surface_potential","P_secondary__surface_potential","P_t1_t2"]]
pred_mean_model2 = denorm_pred_model2.mean(axis=0)
pred_mean_model2


P_inclination                      1.368506
P_mass_ratio                       1.630880
P_primary__surface_potential      17.123560
P_secondary__surface_potential    17.932296
P_t1_t2                            2.588711
dtype: float64

In [33]:
#BLOK 15 - 3. Modely pre predikciu jednotlivych parametrov samostatne
## Vystup - samostatne parametre - original hodnoty bez potreby dalsich uprav
pred_inc = det_inc.predict(X_n).flatten()
pred_q = det_mass_ratio.predict(X_n).flatten()
pred_pp = det_prim_potent.predict(X_n).flatten()
pred_sp = det_sec_potent.predict(X_n).flatten()
pred_tr = det_temp_ratio.predict(X_n).flatten()

pred_model3 = pd.DataFrame()
pred_model3['P_inclination']=pd.Series(pred_inc)
pred_model3['P_mass_ratio']=pd.Series(pred_q)
pred_model3['P_primary__surface_potential']=pd.Series(pred_pp)
pred_model3['P_secondary__surface_potential']=pd.Series(pred_sp)
pred_model3['P_t1_t2']=pd.Series(pred_tr)

pred_mean_model3 = pred_model3.mean(axis=0)
pred_mean_model3

P_inclination                      1.375742
P_mass_ratio                       1.630156
P_primary__surface_potential      17.775438
P_secondary__surface_potential    15.362234
P_t1_t2                            2.616299
dtype: float32

In [34]:
# BLOK 16 -  4. Model pre predikciu vybranych parametrov - trenovany na normovanych krivkach
## Vystup - 5 parametrov - inc, mass, prim. potential, sec. potential, temp. ratio - vystupom su normovane krivky - potrebne denormovat
pred_model_4 = det_selected_params_norm.predict(X_n)
pred_model4_denorm = scaler_model4.inverse_transform(pred_model_4)
denorm_pred_model4 = pd.DataFrame(pred_model4_denorm,
                            columns = [
                                "P_inclination",
                                "P_mass_ratio",
                                "P_primary__surface_potential",
                                "P_secondary__surface_potential",
                                "P_t1_t2"])
pred_mean_model4 = denorm_pred_model4.mean(axis=0)
pred_mean_model4

P_inclination                      1.360300
P_mass_ratio                       1.652796
P_primary__surface_potential      18.847944
P_secondary__surface_potential    19.050194
P_t1_t2                            2.499725
dtype: float32

In [35]:
# BLOK 17 - 5. Multi NN model - trenovany na normovanych krivkach
## vystupom je 5 parametrov - inc, mass, prim. potential, sec. potential, temp. ratio - vystupom su normovane krivky - potrebne denormovat
pred_model_5 = det_multi_nn_norm.predict(X_n)

pred_inc_model5 = np.array([arr[0] for arr in pred_model_5])
pred_mass_model5 = np.array([arr[1] for arr in pred_model_5])
pred_prim_potent_model5 = np.array([arr[2] for arr in pred_model_5])
pred_sec_potent_model5 = np.array([arr[3] for arr in pred_model_5])
pred_temp_ratio_model5 = np.array([arr[4] for arr in pred_model_5])

pred_inc_model5 = pred_inc_model5.reshape(-1,1)
pred_mass_model5 = pred_mass_model5.reshape(-1,1)
pred_prim_potent_model5 = pred_prim_potent_model5.reshape(-1,1)
pred_sec_potent_model5 = pred_sec_potent_model5.reshape(-1,1)
pred_temp_ratio_model5 = pred_temp_ratio_model5.reshape(-1,1)

pred_inc_model5 = inc_model5_scaler.inverse_transform(pred_inc_model5).flatten()
pred_mass_model5 = mass_model5_scaler.inverse_transform(pred_mass_model5).flatten()
pred_prim_potent_model5 = prim_potent_model5_scaler.inverse_transform(pred_prim_potent_model5).flatten()
pred_sec_potent_model5 = sec_potent_model5_scaler.inverse_transform(pred_sec_potent_model5).flatten()
pred_temp_ratio_model5 = temp_ratio_model5_scaler.inverse_transform(pred_temp_ratio_model5).flatten()

denorm_pred_model5 = pd.DataFrame()
denorm_pred_model5['P_inclination']=pd.Series(pred_inc_model5)
denorm_pred_model5['P_mass_ratio']=pd.Series(pred_mass_model5)
denorm_pred_model5['P_primary__surface_potential']=pd.Series(pred_prim_potent_model5)
denorm_pred_model5['P_secondary__surface_potential']=pd.Series(pred_sec_potent_model5)
denorm_pred_model5['P_t1_t2']=pd.Series(pred_temp_ratio_model5)

pred_mean_model5 = denorm_pred_model5.mean(axis=0)
pred_mean_model5

P_inclination                       1.375397
P_mass_ratio                        7.677978
P_primary__surface_potential       84.673218
P_secondary__surface_potential    763.246277
P_t1_t2                             8.845905
dtype: float32

In [36]:
#BLOK 18 - 6. Multi NN del - trenovany na original krivkach - potrebne vyhodnotit samostatne na 2. test mnozine, ktora obsahuje primarny a sekundarny radius
## vystupom je 5 parametrov - inc, mass, prim. radius, sec. radius, temp. ratio
pred_model_6 = det_multi_radius_nn.predict(X_n)
pred_model6_df = pd.DataFrame(pred_model_6,
                        columns = [
                            "P_inclination",
                            "P_mass_ratio",
                            "P_primary__equivalent_radius",
                            "P_secondary__equivalent_radius",
                            "P_t1_t2"
                            ])
pred_mean_model6 = pred_model6_df.mean(axis=0)
pred_mean_model6

P_inclination                     1.347707
P_mass_ratio                      1.348775
P_primary__equivalent_radius      1.347275
P_secondary__equivalent_radius    1.348610
P_t1_t2                           1.348568
dtype: float32

## Porovnanie - modely ktore predikovali inc, q, primary potential, secondary potential, temperature ratio

In [37]:
#BLOK 19 - porovnanie AVG skutocnych hodnot a predikovanych
eval_pred = pd.DataFrame({'attribute': test_mean_1.index,
            'AVG test data': test_mean_1.values,
            'AVG model 1': pred_mean_model1.values,
            'AVG model 2': pred_mean_model2.values,
            'AVG model 3': pred_mean_model3.values,
            'AVG model 4': pred_mean_model4.values,
            'AVG model 5': pred_mean_model5.values})
eval_pred

Unnamed: 0,attribute,AVG test data,AVG model 1,AVG model 2,AVG model 3,AVG model 4,AVG model 5
0,inclination,1.374078,1.379104,1.368506,1.375742,1.3603,1.375397
1,mass_ratio,1.738379,1.578713,1.63088,1.630156,1.652796,7.677978
2,primary__surface_potential,18.352879,17.479206,17.12356,17.775438,18.847944,84.673218
3,secondary__surface_potential,15.764617,17.599979,17.932296,15.362234,19.050194,763.246277
4,t1_t2,2.635565,2.696397,2.588711,2.616299,2.499725,8.845905


In [38]:
# BLOK 20 - porovnanie MAE jednotlivych modelov
eval_pred_mae = pd.DataFrame({'attribute': test_mean_1.index,
            'avg_true': test_mean_1.values,
            'MAE model 1': abs(test_mean_1.values - pred_mean_model1.values),
            'MAE model 2': abs(test_mean_1.values - pred_mean_model2.values),
            'MAE model 3': abs(test_mean_1.values - pred_mean_model3.values),
            'MAE model 4': abs(test_mean_1.values - pred_mean_model4.values),
            'MAE model 5': abs(test_mean_1.values - pred_mean_model5.values),})
eval_pred_mae

Unnamed: 0,attribute,avg_true,MAE model 1,MAE model 2,MAE model 3,MAE model 4,MAE model 5
0,inclination,1.374078,0.005026,0.005572,0.001664,0.013778,0.001319
1,mass_ratio,1.738379,0.159666,0.107499,0.108223,0.085583,5.939599
2,primary__surface_potential,18.352879,0.873673,1.229319,0.57744,0.495066,66.320339
3,secondary__surface_potential,15.764617,1.835363,2.16768,0.402383,3.285577,747.48166
4,t1_t2,2.635565,0.060832,0.046854,0.019266,0.13584,6.210341


## Porovnanie - model ktory predikoval inc, q, primary radius, secondary radius, temperature ratio

In [40]:
#BLOK 21 - porovnanie AVG skutocnych hodnot a predikovanych - model, ktory predikoval radiusy
eval_pred_2 = pd.DataFrame({'attribute': test_mean_2.index,
            'AVG test data': test_mean_2.values,
            'AVG model 1': pred_mean_model6.values})
eval_pred_2

Unnamed: 0,attribute,AVG test data,AVG model 1
0,inclination,1.374078,1.347707
1,mass_ratio,1.738379,1.348775
2,primary__equivalent_radius,0.176892,1.347275
3,secondary__equivalent_radius,0.200262,1.34861
4,t1_t2,2.635565,1.348568


In [41]:
# BLOK 22 - porovnanie MAE - model, ktory predikoval radiusy
eval_pred_mae_2 = pd.DataFrame({'attribute': test_mean_2.index,
            'avg_true': test_mean_2.values,
            'MAE model 1': abs(test_mean_2.values - pred_mean_model6.values)})
eval_pred_mae_2

Unnamed: 0,attribute,avg_true,MAE model 1
0,inclination,1.374078,0.026371
1,mass_ratio,1.738379,0.389604
2,primary__equivalent_radius,0.176892,1.170383
3,secondary__equivalent_radius,0.200262,1.148347
4,t1_t2,2.635565,1.286997
