# Imputation Research Project <img src="https://chroniclesofai.com/content/images/2021/05/file-20201210-18-elk4m.jpg" alt="Alt text image not displaying" width="360" align="right" />
## Notebook 3.0: Autoencoder Model

**Author:** Chike Odenigbo

**Date:** November 25th, 2022

**Notebook Structure:**

* 1.0 Preprocessing

* **1.1 Exploratory Data Analysis**

* 1.2 Masking

* 2.* Models


Water Sugar lutein_zeaxanthin Alcohol

In [1]:
import pandas as pd
import os
from src.visualization.visualize import histogram, box_plot, bar_plot
from pathlib import Path
from notebook_config import ROOT_DIR  # setup.py file changed the root of the project so it is set in the config file

ROOT_DIR = ROOT_DIR.as_posix()  # convert root path to windows readable path (i.e. change backslash to forward slash)
from joblib import load
import tensorflow as tf
from keras import backend as K
import keras
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import pickle

In [2]:
notebook_nm = "3.0-autoencoder"
fig_dir = f"{ROOT_DIR}/reports/figures/"
model_dir = f"{ROOT_DIR}/models/autoencoders/"
scaler_dir = f'{ROOT_DIR}/models/scalers/'
output_prefix = notebook_nm

In [3]:
pd.set_option('display.max_columns', None)

# Ground Truth Included
water_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water.csv')
sugars_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars.csv')
lutein_df = pd.read_csv(f'{ROOT_DIR}/data/processed/lutein.csv')

# Scaled Data without Ground Truth to prevent Data Leakage, rows are included with NaN
water_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mcar_scaled.csv', index_col = 0)
water_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mar_scaled.csv', index_col = 0)

lutein_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/lutein_mar_scaled.csv', index_col = 0)
lutein_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/utein_mcar_scaled.csv', index_col = 0)

sugars_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars_mar_scaled.csv', index_col = 0)
sugars_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars_mcar_scaled.csv', index_col = 0)

water_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mcar_scaled.csv', index_col = 0)
water_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mar_scaled.csv', index_col = 0)

# Scalers to return back to origin scale for model evaluation
scaler_lutein_mar = load(f'{scaler_dir}/scaler_lutein_mar.joblib')
scaler_lutein_mcar = load(f'{scaler_dir}/scaler_lutein_mcar.joblib')

scaler_sugars_mar = load(f'{scaler_dir}/scaler_sugars_mar.joblib')
scaler_sugars_mcar = load(f'{scaler_dir}/scaler_sugars_mcar.joblib')

scaler_water_mcar = load(f'{scaler_dir}/scaler_water_mcar.joblib')
scaler_water_mar = load(f'{scaler_dir}/scaler_water_mar.joblib')

In [4]:
def scale_ground_truth(scaler,raw_df,target_col,non_target_na_col):
    raw_filtered_df = raw_df[raw_df[target_col].isnull()].drop(['name',target_col, non_target_na_col], axis = 'columns')
    scaled_ground_truth_df = pd.DataFrame(scaler.transform(raw_filtered_df),columns = raw_filtered_df.columns)
    return scaled_ground_truth_df

def train_test_split_scaled(scaled_df,drop_cols = ['name','dataset_type']):
    train_df = scaled_df[scaled_df.dataset_type == 'training'].drop(drop_cols, axis = 'columns')
    val_df = scaled_df[scaled_df.dataset_type == 'validation'].drop(drop_cols, axis = 'columns')
    return train_df, val_df

def autoencoder(data, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.0001):
    enc_input = keras.Input(shape = (input_shape,),name = 'input_data')
    latent_layer = keras.layers.Dense(latent_shape, activation = 'relu')(enc_input)
    encoder = keras.Model(enc_input,latent_layer, name = "encoder")
    dec_output = keras.layers.Dense(input_shape)(latent_layer)
    opt = keras.optimizers.Adam(lr = lr)
    auto_encoder = keras.Model(enc_input,dec_output, name="auto_encoder")
    print(auto_encoder.summary())
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    auto_encoder.compile(opt, loss = "mse")
    history = auto_encoder.fit(data, data, epochs = 1000, validation_split = 0.1, callbacks=[callback])
    return auto_encoder, history

def save_autoencoder(model,model_name,path = model_dir):
    model.save(f'{path}/{model_name}.h5')
    with open(f'{path}/{model_name}_history.pkl', 'wb') as file_pi:
        pickle.dump(model.history, file_pi)

In [5]:
water_mcar_train_df, water_mcar_val_df = train_test_split_scaled(water_mcar_scaled_df,drop_cols = ['name','dataset_type'])
water_mar_train_df, water_mar_val_df = train_test_split_scaled(water_mar_scaled_df,drop_cols = ['name','dataset_type'])

sugars_mcar_train_df, sugars_mcar_val_df = train_test_split_scaled(sugars_mcar_scaled_df,drop_cols = ['name','dataset_type'])
sugars_mar_train_df, sugars_mar_val_df = train_test_split_scaled(sugars_mar_scaled_df,drop_cols = ['name','dataset_type'])

lutein_mcar_train_df, lutein_mcar_val_df = train_test_split_scaled(lutein_mcar_scaled_df,drop_cols = ['name','dataset_type'])
lutein_mar_train_df, lutein_mar_val_df = train_test_split_scaled(lutein_mar_scaled_df,drop_cols = ['name','dataset_type'])


## Train Models

In [6]:
sugars_mcar_model, sugars_mcar_history = autoencoder(sugars_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense (Dense)               (None, 40)                2920      
                                                                 
 dense_1 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


None
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000


In [7]:
sugars_mar_model, sugars_mar_history = autoencoder(sugars_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_2 (Dense)             (None, 40)                2920      
                                                                 
 dense_3 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None


  super().__init__(name, **kwargs)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000


In [8]:
water_mar_model, water_mar_history = autoencoder(water_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_4 (Dense)             (None, 40)                2920      
                                                                 
 dense_5 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None


  super().__init__(name, **kwargs)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000


In [9]:
water_mcar_model, water_mcar_history = autoencoder(water_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_6 (Dense)             (None, 40)                2920      
                                                                 
 dense_7 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None


  super().__init__(name, **kwargs)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000


In [10]:
lutein_mar_model, lutein_mar_history = autoencoder(lutein_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_8 (Dense)             (None, 40)                2920      
                                                                 
 dense_9 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None


  super().__init__(name, **kwargs)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000


In [11]:
lutein_mcar_model, lutein_mcar_history = autoencoder(lutein_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_10 (Dense)            (None, 40)                2920      
                                                                 
 dense_11 (Dense)            (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None


  super().__init__(name, **kwargs)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000


In [12]:
save_autoencoder(model = sugars_mcar_model, model_name = 'sugars_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:08         1872
metadata.json                                  2022-11-28 17:04:08           64
variables.h5                                   2022-11-28 17:04:08        36504


In [13]:
save_autoencoder(model = sugars_mar_model, model_name = 'sugars_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:08         1878
metadata.json                                  2022-11-28 17:04:08           64
variables.h5                                   2022-11-28 17:04:08        36504


In [14]:
save_autoencoder(model = water_mar_model, model_name = 'water_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:08         1878
metadata.json                                  2022-11-28 17:04:08           64
variables.h5                                   2022-11-28 17:04:08        36504


In [15]:
save_autoencoder(model = water_mcar_model, model_name = 'water_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:08         1878
metadata.json                                  2022-11-28 17:04:08           64
variables.h5                                   2022-11-28 17:04:08        36504


In [16]:
save_autoencoder(model = lutein_mar_model, model_name = 'lutein_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:09         1878
metadata.json                                  2022-11-28 17:04:09           64
variables.h5                                   2022-11-28 17:04:09        36504


In [17]:
save_autoencoder(model = lutein_mcar_model, model_name = 'lutein_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 17:04:09         1884
metadata.json                                  2022-11-28 17:04:09           64
variables.h5                                   2022-11-28 17:04:09        36504


In [18]:
max_water_mcar = water_mcar_train_df.water_mcar.max()
min_water_mcar = water_mcar_train_df.water_mcar.min()

In [19]:
water_mcar_test_df['water_mcar'] = 0#np.random.uniform(min_water_mcar, max_water_mcar, size=len(water_mcar_test_df))

NameError: name 'water_mcar_test_df' is not defined

In [None]:
water_mcar_test_df

In [None]:
water_mcar_pred = pd.DataFrame(auto_encoder.predict(water_mcar_test_df), columns = water_mcar_test_df.columns)

In [None]:
water_mcar_true = pd.DataFrame(scaler_water_mcar.transform(water_df[water_df.water_mcar.isnull()].drop(['name','water_mcar', 'water_mar'], axis = 'columns')),columns = water_df.drop(['name','water_mcar', 'water_mar'], axis = 'columns').columns)# a = np.array(x) # your x
# b = np.array(y) # your y
# mses = ((a-b)**2).mean(axis=1)

In [None]:
from sklearn.metrics import mean_squared_error
pd.DataFrame({'columns':water_mcar_true.columns,
              'mse':list(mean_squared_error(water_mcar_true, water_mcar_pred, multioutput='raw_values'))}).nlargest(10,'mse')