# Find best model

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [162]:
import pandas as pd
from sklearn.metrics import accuracy_score
import plotly.express as px
import joblib
import os

from sklearn.feature_selection import mutual_info_classif

# Deep Learning API for creating Neural Networks (Runs on TensorFlow)
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow import math
from keras.utils.vis_utils import plot_model

from my_utils.visualization import show_history_graph

# Data

In [105]:
train_df = pd.read_feather('./data/train_processed.ftr')
val_df = pd.read_feather('./data/val_processed.ftr')
train_ensemble_df = pd.read_feather('./data/ensemble_train_df.ftr')
val_ensemble_df = pd.read_feather('./data/ensemble_val_df.ftr')

standard_cols = list(train_df.columns)
ensemble_cols = list(train_ensemble_df.columns)

In [106]:
standard_cols[:-1]

['is_g734s',
 'CryoSleep',
 'VIP',
 'Europa',
 'Mars',
 'PSO J318.5-22',
 'TRAPPIST-1e',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'segment']

In [107]:
train_df = pd.merge(
    train_df.reset_index(),
    train_ensemble_df.reset_index(),
    on='index'
)
cols = standard_cols[:-1] + ensemble_cols
train_df = train_df[cols]

In [108]:
val_df = pd.merge(
    val_df.reset_index(),
    val_ensemble_df.reset_index(),
    on='index'
)
cols = standard_cols[:-1] + ensemble_cols
val_df = val_df[cols]

In [110]:
train_x = train_df[cols[:-1]].to_numpy()
val_x = val_df[cols[:-1]].to_numpy()
train_y = train_df[cols[-1]].astype(int).to_numpy()
val_y = val_df[cols[-1]].astype(int).to_numpy()

# Mean prediction 

In [22]:
mean_pred = val_ensemble_df['mean'].apply(lambda x: int(round(x, 0))).to_numpy()
mean_acc = round(accuracy_score(mean_pred, val_y), 3)

print(f'Prediction based on probabilities mean has accuracy = {mean_acc}')

Prediction based on probabilities mean has accuracy = 0.796


# Feature selection

In [171]:
mutual_info = mutual_info_classif(train_x, train_y)
mutual_df = pd.DataFrame({
    'Feature': cols[:-1],
    'MutualInfo': mutual_info
})
mutual_df.sort_values(
    by='MutualInfo',
    ascending=False,
    ignore_index=True,
    inplace=True
)
mutual_df['MutualInfo'] = mutual_df['MutualInfo'].round(2)

In [175]:
mutual_df

Unnamed: 0,Feature,MutualInfo
0,ada_boost,0.51
1,mean,0.33
2,lgbm,0.28
3,neural,0.27
4,svc,0.27
5,CryoSleep,0.11
6,segment,0.1
7,Spa,0.07
8,RoomService,0.07
9,VRDeck,0.06


In [176]:
tresh = 0.10
filter_tresh = mutual_df['MutualInfo'] >= tresh
best_cols = mutual_df['Feature'].to_list()

In [178]:
train_x = train_df[best_cols].to_numpy()
val_x = val_df[best_cols].to_numpy()

# Baseline neural model

In [179]:
def get_accuracy(x, y, name, model):
    y_pred = model.predict(x)
    y_pred = [1 if y>=0.5 else 0 for y in y_pred.reshape(1,-1)[0]]
    acc = accuracy_score(y, y_pred)
    print(f'Accuracy on {name} data: {round(acc, 3)}')

In [180]:
layers = [
  Dense(4, activation=keras.activations.relu), # hidden layer 1, ReLU activation
  Dense(4, activation=keras.activations.relu), # hidden layer 2, ReLU activation
  Dense(1, activation=keras.activations.sigmoid) # ouput layer, sigmoid activation
]

base_model = keras.Sequential(layers)

In [181]:
base_model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [182]:
history = base_model.fit(
    train_x,
    train_y,
    epochs=100,
    validation_data=(val_x, val_y)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [183]:
get_accuracy(train_x, train_y, 'train', base_model)

Accuracy on train data: 0.92


In [184]:
get_accuracy(val_x, val_y, 'train', base_model)

Accuracy on train data: 0.756


In [185]:
history_df = pd.DataFrame(history.history)

show_history_graph(history_df, 'loss', 'val_loss')
show_history_graph(history_df, 'accuracy', 'val_accuracy')

# Final model

In [224]:
relu = keras.activations.relu
layers = [
    Dense(16, activation=relu),
    Dropout(0.1),
    Dense(8, activation=relu),
    Dense(4, activation=relu),
    Dense(1, activation=keras.activations.sigmoid)
]

final_model = keras.Sequential(layers)

In [225]:
final_model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy']
)

early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.001,
    patience=20,
    verbose=0,
    restore_best_weights=True
)
callbacks = [
    early_stopping_callback
]

In [226]:
final_history = final_model.fit(
    train_x,
    train_y,
    epochs=100,
    validation_data=(val_x, val_y),
    callbacks=callbacks
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


In [227]:
get_accuracy(train_x, train_y, 'train', final_model)

Accuracy on train data: 0.825


In [230]:
get_accuracy(val_x, val_y, 'train', final_model)

Accuracy on train data: 0.803


In [229]:
history_df = pd.DataFrame(final_history.history)

show_history_graph(history_df, 'loss', 'val_loss')
show_history_graph(history_df, 'accuracy', 'val_accuracy')

# Save model

In [192]:
model_name = 'final_model'
path = os.path.join(
    os.getcwd(),
    'models',
    model_name
)
final_model.save(path)

INFO:tensorflow:Assets written to: c:\PROJEKTY\KAGGLE\SpaceshipTytanic\models\final_model\assets
