# Imputation Research Project <img src="https://chroniclesofai.com/content/images/2021/05/file-20201210-18-elk4m.jpg" alt="Alt text image not displaying" width="360" align="right" />
## Notebook 3.0: Autoencoder Model

**Author:** Chike Odenigbo

**Date:** November 25th, 2022

**Notebook Structure:**

* 1.0 Preprocessing

* **1.1 Exploratory Data Analysis**

* 1.2 Masking

* 2.* Models


Water Sugar lutein_zeaxanthin Alcohol

In [2]:
import pandas as pd
import os
from src.visualization.visualize import histogram, box_plot, bar_plot
from pathlib import Path
from notebook_config import ROOT_DIR  # setup.py file changed the root of the project so it is set in the config file

ROOT_DIR = ROOT_DIR.as_posix()  # convert root path to windows readable path (i.e. change backslash to forward slash)
from joblib import load
import tensorflow as tf
from keras import backend as K
import keras
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import pickle

In [38]:
notebook_nm = "3.0-autoencoder"
fig_dir = f"{ROOT_DIR}/reports/figures/"
model_dir = f"{ROOT_DIR}/models/autoencoders/"
scaler_dir = f'{ROOT_DIR}/models/scalers/'
output_prefix = notebook_nm

In [37]:
pd.set_option('display.max_columns', None)

# Ground Truth Included
water_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water.csv')
sugars_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars.csv')
lutein_df = pd.read_csv(f'{ROOT_DIR}/data/processed/lutein.csv')

# Scaled Data without Ground Truth to prevent Data Leakage, rows are included with NaN
water_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mcar_scaled.csv', index_col = 0)
water_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mar_scaled.csv', index_col = 0)

lutein_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/lutein_mar_scaled.csv', index_col = 0)
lutein_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/utein_mcar_scaled.csv', index_col = 0)

sugars_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars_mar_scaled.csv', index_col = 0)
sugars_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/sugars_mcar_scaled.csv', index_col = 0)

water_mcar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mcar_scaled.csv', index_col = 0)
water_mar_scaled_df = pd.read_csv(f'{ROOT_DIR}/data/processed/water_mar_scaled.csv', index_col = 0)

# Scalers to return back to origin scale for model evaluation
scaler_lutein_mar = load(f'{scaler_dir}/scaler_lutein_mar.joblib')
scaler_lutein_mcar = load(f'{scaler_dir}/scaler_lutein_mcar.joblib')

scaler_sugars_mar = load(f'{scaler_dir}/scaler_sugars_mar.joblib')
scaler_sugars_mcar = load(f'{scaler_dir}/scaler_sugars_mcar.joblib')

scaler_water_mcar = load(f'{scaler_dir}/scaler_water_mcar.joblib')
scaler_water_mar = load(f'{scaler_dir}/scaler_water_mar.joblib')

In [52]:
def scale_ground_truth(scaler,raw_df,target_col,non_target_na_col):
    raw_filtered_df = raw_df[raw_df[target_col].isnull()].drop(['name',target_col, non_target_na_col], axis = 'columns')
    scaled_ground_truth_df = pd.DataFrame(scaler.transform(raw_filtered_df),columns = raw_filtered_df.columns)
    return scaled_ground_truth_df

def train_test_split_scaled(scaled_df,drop_cols = ['name','dataset_type','serving_size']):
    train_df = scaled_df[scaled_df.dataset_type == 'training'].drop(drop_cols, axis = 'columns')
    val_df = scaled_df[scaled_df.dataset_type == 'validation'].drop(drop_cols, axis = 'columns')
    return train_df, val_df

def autoencoder(data, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.0001):
    enc_input = keras.Input(shape = (input_shape,),name = 'input_data')
    latent_layer = keras.layers.Dense(latent_shape, activation = 'relu')(enc_input)
    encoder = keras.Model(enc_input,latent_layer, name = "encoder")
    dec_output = keras.layers.Dense(input_shape)(latent_layer)
    opt = keras.optimizers.Adam(lr = lr)
    auto_encoder = keras.Model(enc_input,dec_output, name="auto_encoder")
    print(auto_encoder.summary())
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    auto_encoder.compile(opt, loss = "mse")
    history = auto_encoder.fit(data, data, epochs = 1000, validation_split = 0.1, callbacks=[callback])
    return auto_encoder, history

def save_autoencoder(model,model_name,path = model_dir):
    model.save(f'{path}/{model_name}.h5')
    with open(f'{path}/{model_name}_history.pkl', 'wb') as file_pi:
        pickle.dump(model.history, file_pi)

In [14]:
water_mcar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_water_mcar,raw_df = water_df,target_col = 'water_mcar',non_target_na_col = 'water_mar')
water_mar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_water_mar,raw_df = water_df,target_col = 'water_mar',non_target_na_col = 'water_mcar')

sugars_mcar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_sugars_mcar,raw_df = sugars_df,target_col = 'sugars_mcar',non_target_na_col = 'sugars_mar')
sugars_mar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_sugars_mar,raw_df = sugars_df,target_col = 'sugars_mar',non_target_na_col = 'sugars_mcar')

lutein_mcar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_lutein_mcar,raw_df = lutein_df,target_col = 'lutein_zeaxanthin_mcar',non_target_na_col = 'lutein_zeaxanthin_mar')
lutein_mar_ground_truth_scaled_df = scale_ground_truth(scaler = scaler_lutein_mar,raw_df = lutein_df,target_col = 'lutein_zeaxanthin_mar',non_target_na_col = 'lutein_zeaxanthin_mcar')


Feature names unseen at fit time:
- water
Feature names seen at fit time, yet now missing:
- water_mcar

Feature names unseen at fit time:
- water
Feature names seen at fit time, yet now missing:
- water_mar

Feature names unseen at fit time:
- sugars
Feature names seen at fit time, yet now missing:
- sugars_mcar

Feature names unseen at fit time:
- sugars
Feature names seen at fit time, yet now missing:
- sugars_mar

Feature names unseen at fit time:
- lutein_zeaxanthin
Feature names seen at fit time, yet now missing:
- lutein_zeaxanthin_mcar

Feature names unseen at fit time:
- lutein_zeaxanthin
Feature names seen at fit time, yet now missing:
- lutein_zeaxanthin_mar



In [25]:
water_mcar_train_df, water_mcar_val_df = train_test_split_scaled(water_mcar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])
water_mar_train_df, water_mar_val_df = train_test_split_scaled(water_mar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])

sugars_mcar_train_df, sugars_mcar_val_df = train_test_split_scaled(sugars_mcar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])
sugars_mar_train_df, sugars_mar_val_df = train_test_split_scaled(sugars_mar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])

lutein_mcar_train_df, lutein_mcar_val_df = train_test_split_scaled(lutein_mcar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])
lutein_mar_train_df, lutein_mar_val_df = train_test_split_scaled(lutein_mar_scaled_df,drop_cols = ['name','dataset_type','serving_size'])


In [30]:
sugars_mcar_model, sugars_mcar_history = autoencoder(sugars_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_4 (Dense)             (None, 40)                2920      
                                                                 
 dense_5 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1000


  super().__init__(name, **kwargs)


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000


In [31]:
sugars_mar_model, sugars_mar_history = autoencoder(sugars_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_6 (Dense)             (None, 40)                2920      
                                                                 
 dense_7 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


None
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000


In [32]:
water_mar_model, water_mar_history = autoencoder(water_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_8 (Dense)             (None, 40)                2920      
                                                                 
 dense_9 (Dense)             (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


None
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000


In [33]:
water_mcar_model, water_mcar_history = autoencoder(water_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_10 (Dense)            (None, 40)                2920      
                                                                 
 dense_11 (Dense)            (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1000


  super().__init__(name, **kwargs)


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000


In [34]:
lutein_mar_model, lutein_mar_history = autoencoder(lutein_mar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_12 (Dense)            (None, 40)                2920      
                                                                 
 dense_13 (Dense)            (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


None
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000


In [35]:
lutein_mcar_model, lutein_mcar_history = autoencoder(lutein_mcar_train_df, input_shape = 72, latent_shape = 40, patience = 2, lr = 0.001)

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_data (InputLayer)     [(None, 72)]              0         
                                                                 
 dense_14 (Dense)            (None, 40)                2920      
                                                                 
 dense_15 (Dense)            (None, 72)                2952      
                                                                 
Total params: 5,872
Trainable params: 5,872
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1000


  super().__init__(name, **kwargs)


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000


In [47]:
import json
import pickle
lutein_mcar_model.save(f'{model_dir}/lutein_mcar_model.h5')
with open(f'{model_dir}/lutein_mcar_model.pkl', 'wb') as file_pi:
    pickle.dump(lutein_mcar_model.history, file_pi)
# json.dump(lutein_mcar_model.history, open(f'{model_dir}/lutein_mcar_model.json', 'w'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:26:47         1884
metadata.json                                  2022-11-28 14:26:47           64
variables.h5                                   2022-11-28 14:26:47        36504


In [60]:
save_autoencoder(model = sugars_mcar_model, model_name = 'sugars_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:02         1878
metadata.json                                  2022-11-28 14:43:02           64
variables.h5                                   2022-11-28 14:43:02        36504


In [61]:
save_autoencoder(model = sugars_mar_model, model_name = 'sugars_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:14         1878
metadata.json                                  2022-11-28 14:43:14           64
variables.h5                                   2022-11-28 14:43:14        36504


In [62]:
save_autoencoder(model = water_mar_model, model_name = 'water_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:19         1878
metadata.json                                  2022-11-28 14:43:19           64
variables.h5                                   2022-11-28 14:43:19        36504


In [63]:
save_autoencoder(model = water_mcar_model, model_name = 'water_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:25         1884
metadata.json                                  2022-11-28 14:43:25           64
variables.h5                                   2022-11-28 14:43:25        36504


In [64]:
save_autoencoder(model = lutein_mar_model, model_name = 'lutein_mar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:30         1884
metadata.json                                  2022-11-28 14:43:30           64
variables.h5                                   2022-11-28 14:43:30        36504


In [65]:
save_autoencoder(model = lutein_mcar_model, model_name = 'lutein_mcar_model',path = model_dir)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:34         1884
metadata.json                                  2022-11-28 14:43:34           64
variables.h5                                   2022-11-28 14:43:34        36504


In [72]:
tes = pd.read_pickle(f'{model_dir}/lutein_mcar_model_history.pkl').history['val_loss']

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2022-11-28 14:43:34         1884
metadata.json                                  2022-11-28 14:43:34           64
variables.h5                                   2022-11-28 14:43:34        36504
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...vars


In [69]:
test = keras.models.load_model(f'{model_dir}/lutein_mcar_model.h5')
test.history

In [34]:
max_water_mcar = water_mcar_train_df.water_mcar.max()
min_water_mcar = water_mcar_train_df.water_mcar.min()

In [67]:
water_mcar_test_df['water_mcar'] = 0#np.random.uniform(min_water_mcar, max_water_mcar, size=len(water_mcar_test_df))

In [37]:
water_mcar_test_df

Unnamed: 0,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,pantothenic_acid,riboflavin,thiamin,vitamin_a,vitamin_a_rae,carotene_alpha,carotene_beta,cryptoxanthin_beta,lutein_zeaxanthin,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,magnesium,manganese,phosphorous,potassium,selenium,zink,protein,alanine,arginine,aspartic_acid,cystine,glutamic_acid,glycine,histidine,hydroxyproline,isoleucine,leucine,lysine,methionine,phenylalanine,proline,serine,threonine,tryptophan,tyrosine,valine,carbohydrate,fiber,sugars,fructose,galactose,glucose,lactose,maltose,sucrose,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water_mcar
1,-0.619342,-0.551556,-0.366113,-0.419401,-0.264566,-0.377104,-0.198124,-0.121463,0.295053,0.016684,0.315431,-0.165265,-0.119859,-0.06443,-0.101149,-0.039304,-0.039796,-0.312098,0.387865,-0.120460,-0.125774,-0.224649,-0.224649,-0.106650,0.565155,1.057292,0.776284,2.507023,1.367768,1.442629,0.469791,-0.355989,0.378058,0.005471,0.247723,-0.195034,-0.083019,0.785573,0.862210,-0.076787,-0.062212,-0.367017,0.059323,0.257674,-0.452719,0.554252,0.543382,0.213058,0.379865,0.157415,0.193964,0.252622,0.283687,1.776759,1.304084,-0.352894,0.311358,-0.062457,0.372412,-0.07869,-0.150615,0.023213,-0.620268,-0.544198,-0.549419,-0.273112,-0.366113,-0.024432,0.265352,-0.032626,-0.083373,-0.003348
2,-0.643570,-0.435218,-0.358304,-0.373641,-0.361717,-0.346635,-0.198124,-0.838286,-0.244720,-0.411286,-0.437932,-0.154655,-0.105630,-0.06443,-0.105163,0.004479,-0.109427,-0.283870,-0.564302,-0.070570,-0.125774,-0.241400,-0.241400,-0.137364,-0.136664,-0.267279,-0.475648,-0.467465,-0.075670,-0.778567,-0.580093,-0.455583,-0.510558,-1.183777,-0.859682,-0.839409,-0.841020,-0.883861,-0.933870,-0.779530,-0.854175,-0.367017,-0.891171,-0.904661,-0.834197,-0.856819,-0.915290,-0.882456,-0.907343,-0.882938,-0.868764,-0.882973,-0.905697,0.257549,-0.189534,1.322267,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,-0.643289,-0.440772,-0.557515,-0.454119,-0.358304,-0.024432,-0.642443,-0.032626,-0.083373,1.408102
3,-0.746538,-0.595182,-0.366113,-0.395175,0.296750,0.057075,-0.198124,-0.741841,0.088316,-0.502818,-0.390198,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.116508,-0.312098,-0.230317,0.925066,-0.125774,-0.224649,-0.224649,0.113204,-0.314904,-0.248647,-0.428847,-0.349161,-0.053148,-0.755727,0.063794,-0.486491,-0.569799,-1.103844,-0.687716,-0.732221,-0.677402,-0.742383,-0.796040,-0.674929,-0.706833,-0.367017,-0.756470,-0.789298,-0.614036,-0.790881,-0.779454,-0.765315,-0.729369,-0.727905,-0.715853,-0.756520,-0.688973,-0.646582,-0.033484,-0.347678,0.814953,-0.062457,0.520483,-0.07869,-0.173220,-0.156322,-0.747491,-0.590602,-0.625578,-0.463069,-0.366113,-0.024432,-0.476551,-0.032626,-0.083373,1.651227
13,-0.710196,-0.580640,-0.366113,-0.388446,-0.500247,-0.232377,-0.198124,-0.495732,-0.325912,-0.282648,0.292602,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.117689,-0.312098,-0.456431,-0.120460,-0.125774,-0.243793,-0.243793,-0.137364,-0.270344,2.032935,-0.164755,-0.197057,0.005816,-0.750017,0.974116,-0.483057,-0.253846,-0.160244,-0.271142,-0.325903,0.762798,0.509692,-0.504827,-0.204961,-0.220078,-0.367017,-0.084863,-0.208130,-0.233572,-0.385363,0.027205,0.237806,0.052889,0.037060,1.057909,-0.007722,0.132847,0.175776,-0.479340,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,-0.709930,-0.577219,-0.598133,-0.436950,-0.366113,-0.024432,0.094852,-0.032626,-0.083373,-1.192960
20,-0.740481,-0.595182,-0.366113,-0.426130,-0.500247,-0.377104,-0.198124,-0.851971,-0.413117,-0.651247,-0.493967,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.117689,-0.312098,-0.612014,-0.120460,-0.125774,-0.243793,-0.243793,-0.137364,-0.326044,-0.314706,-0.499049,-0.602669,-0.077391,-1.006968,-0.884591,-0.489926,-0.645966,-1.291004,-0.859682,-0.839409,-0.841020,-0.883861,-0.933870,-0.779530,-0.854175,-0.367017,-0.891171,-0.904661,-0.834197,-0.856819,-0.915290,-0.882456,-0.907343,-0.882938,-0.868764,-0.882973,-0.905697,2.547208,0.055687,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,-0.740221,-0.601075,-0.617482,-0.426356,-0.366113,-0.024432,-0.227714,-0.032626,-0.083373,1.330437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7166,-0.213525,-0.115289,0.321081,-0.357491,1.439172,-0.308549,-0.198124,0.961156,0.004869,-0.292543,-0.338314,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.117689,0.072238,0.609829,-0.120460,-0.081150,-0.143288,-0.143288,-0.113116,-0.348324,-0.163956,-0.178126,-0.197057,-0.075670,0.243526,0.231902,0.609044,0.851987,1.466686,1.690166,1.440208,1.541135,1.698103,1.343811,1.758889,1.521714,3.132008,1.550517,1.544079,1.591637,1.572992,1.419004,1.343219,1.398041,1.422158,0.553304,1.351026,1.528554,-0.823283,-0.479340,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,-0.216185,-0.108383,-0.134591,-0.406265,0.321081,-0.024432,-0.319876,-0.032626,-0.083373,-0.744752
7167,0.628393,0.728159,0.570970,-0.338648,-0.500247,-0.232377,-0.198124,0.583847,0.045465,-0.032792,-0.348691,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.117689,0.250292,-0.383826,-0.120460,-0.125774,-0.243793,-0.243793,-0.137364,-0.298194,-0.075877,-0.092881,-0.163256,-0.073793,0.192135,0.101856,0.794495,1.114340,1.740603,1.912537,1.462643,1.689038,1.740546,1.485931,1.458345,1.737465,-0.367017,1.954619,1.728007,1.951808,1.774102,1.730382,1.268974,1.484959,1.832179,1.906562,1.708069,2.003614,-0.823283,-0.479340,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,0.607731,0.730226,0.691223,-0.176489,0.570970,-0.024432,-0.310660,-0.032626,-0.083373,0.690429
7170,0.446684,0.655448,0.305463,-0.339994,0.863463,-0.331401,-0.198124,0.157880,0.147706,-0.186169,-0.356992,-0.160964,-0.111559,-0.06443,-0.106167,-0.039304,-0.117689,0.315434,0.128560,-0.120460,-0.081150,-0.219863,-0.219863,-0.111499,-0.353894,-0.180894,-0.087867,-0.247758,-0.075514,0.089355,0.171637,0.413290,1.788559,1.141105,1.249873,1.207136,1.226842,0.969494,1.181847,0.897034,1.271759,1.635792,1.121751,1.279615,1.378577,1.480679,1.067919,0.808661,1.075205,1.356881,1.279629,1.249367,1.048292,-0.823283,-0.479340,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,0.468392,0.652693,0.670502,-0.322793,0.305463,-0.024432,-0.393606,-0.032626,-0.083373,1.537408
7186,-0.437633,-0.347965,0.125856,-0.339994,1.273656,-0.316166,-0.198124,-0.062380,-0.413117,-0.082268,-0.263600,-0.167846,-0.119859,-0.06443,-0.106167,-0.039304,-0.117689,0.424004,0.055955,-0.120460,-0.125774,-0.205506,-0.205506,-0.137364,-0.392884,-0.114835,-0.009308,-0.213957,-0.077391,0.289206,0.044762,0.361776,1.562879,1.233710,1.460384,1.335513,1.545757,1.082676,1.292862,1.034047,1.342799,1.046731,1.645377,1.587612,1.632219,1.566398,1.431543,0.924152,1.242832,1.558831,1.271983,1.470040,1.521619,-0.823283,-0.479340,-0.490007,-0.162020,-0.062457,-0.142311,-0.07869,-0.173220,-0.156322,-0.436098,-0.343601,-0.272363,-0.398593,0.125856,-0.024432,-0.200066,-0.032626,-0.083373,0.186553


In [69]:
water_mcar_pred = pd.DataFrame(auto_encoder.predict(water_mcar_test_df), columns = water_mcar_test_df.columns)



In [71]:
water_mcar_true = pd.DataFrame(scaler_water_mcar.transform(water_df[water_df.water_mcar.isnull()].drop(['name','water_mcar', 'water_mar'], axis = 'columns')),columns = water_df.drop(['name','water_mcar', 'water_mar'], axis = 'columns').columns).drop('serving_size',axis = 'columns')# a = np.array(x) # your x
# b = np.array(y) # your y
# mses = ((a-b)**2).mean(axis=1)

Feature names unseen at fit time:
- water
Feature names seen at fit time, yet now missing:
- water_mcar



In [77]:
from sklearn.metrics import mean_squared_error
pd.DataFrame({'columns':water_mcar_true.columns,
              'mse':list(mean_squared_error(water_mcar_true, water_mcar_pred, multioutput='raw_values'))}).nlargest(10,'mse')

Unnamed: 0,columns,mse
31,selenium,1.593192
57,galactose,0.66808
58,glucose,0.558205
29,phosphorous,0.404909
6,folic_acid,0.256179
56,fructose,0.250337
71,water,0.208576
5,folate,0.201098
33,protein,0.189536
17,vitamin_b12,0.161144
