## Evaluating all data imputation techniques

#### Considered methods:
- mean imputation
- Last observation carried forward imputation
- linear imputation
- k-nearest neighbours
- CNN based imputation

#### 1. Configuring setup

In [None]:
pip install tensorflow tensorflow-datasets

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model, load_model
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
## Reading the traffic data-frames
traffic_data = pd.read_csv('Output_Data/traffic_data.csv').drop(columns = ['Hour'])
traffic_data_5 = pd.read_csv('Output_Data/traffic_data_mapped_5.csv').drop(columns = ['Hour'])
traffic_data_15 = pd.read_csv('Output_Data/traffic_data_mapped_15.csv').drop(columns = ['Hour'])
traffic_data_25 = pd.read_csv('Output_Data/traffic_data_mapped_25.csv').drop(columns = ['Hour'])
hour = pd.read_csv('Output_Data/traffic_data.csv')['Hour']

In [4]:
## Subsetting the data-frame to only account for the testing data
def creating_test(data):
    data_1 = data.iloc[38400:38600, 0:200]
    data_2 = data.iloc[38400:38600, 50:250]; data_2.columns = data_1.columns
    data_3 = data.iloc[38400:38600, 100:300]; data_3.columns = data_1.columns
    data_4 = data.iloc[38400:38600, 150:350]; data_4.columns = data_1.columns
    data_5 = data.iloc[38400:38600, 200:400]; data_5.columns = data_1.columns
    data_6 = data.iloc[38700:38900, 0:200]; data_6.columns = data_1.columns
    data_7 = data.iloc[38700:38900, 50:250]; data_7.columns = data_1.columns
    data_8 = data.iloc[38700:38900, 100:300]; data_8.columns = data_1.columns
    data_9 = data.iloc[38700:38900, 150:350]; data_9.columns = data_1.columns
    data_10 = data.iloc[38700:38900, 200:400]; data_10.columns = data_1.columns
    
    return pd.concat([data_1, data_2, data_3, data_4, data_5, data_6, data_7, data_8, data_9, data_10], axis = 0, ignore_index = True)

test = creating_test(traffic_data)
test_5 = creating_test(traffic_data_5)
test_15 = creating_test(traffic_data_15)
test_25 = creating_test(traffic_data_25)

In [5]:
## Saving the original dimensions
n = test.shape[0]; p = test.shape[1]

## Scaling the values to be between 0 and 1
test = test / 255.
test_5 = test_5 / 255.
test_15 = test_15 / 255.
test_25 = test_25 / 255.

## Flattening the data-frames for easier computation
test_flat = np.array(test).flatten()
test_5_flat = np.array(test_5).flatten()
test_15_flat = np.array(test_15).flatten()
test_25_flat = np.array(test_25).flatten()

In [6]:
## Recording the indices where each data-frame has missing values
missing_idx_5 = np.argwhere(test_5_flat == 1.0)
missing_idx_15 = np.argwhere(test_15_flat == 1.0)
missing_idx_25 = np.argwhere(test_25_flat == 1.0)

## Extracting the true values for these indices
Y_true_5 = test_flat[missing_idx_5]
Y_true_15 = test_flat[missing_idx_15]
Y_true_25 = test_flat[missing_idx_25]

In [7]:
## Defining MSE and MAE functions
def imputed_mse(Y_true, Y_pred):
    return mean_squared_error(Y_true, Y_pred, squared = False)

def imputed_mae(Y_true, Y_pred):
    return mean_absolute_error(Y_true, Y_pred)

def evaluation(Y_true, Y_pred):
    MSE = imputed_mse(Y_true, Y_pred); MAE = imputed_mae(Y_true, Y_pred)
    return MSE, MAE

#### 2. Mean Imputation

In [8]:
def mean_imputer(col):
    col = np.where(col == 1.0, np.mean(col), col)
    return col

mean_imputed_5 = np.array(test_5.copy().apply(mean_imputer, axis = 0)).flatten()
mean_imputed_15 = np.array(test_15.copy().apply(mean_imputer, axis = 0)).flatten()
mean_imputed_25 = np.array(test_25.copy().apply(mean_imputer, axis = 0)).flatten()

mean_imputed_5 = mean_imputed_5[missing_idx_5]
mean_imputed_15 = mean_imputed_15[missing_idx_15]
mean_imputed_25 = mean_imputed_25[missing_idx_25]

mean_imputer_mse_5, mean_imputer_mae_5 = evaluation(Y_true_5, mean_imputed_5)
mean_imputer_mse_15, mean_imputer_mae_15 = evaluation(Y_true_15, mean_imputed_15)
mean_imputer_mse_25, mean_imputer_mae_25 = evaluation(Y_true_25, mean_imputed_25)

In [9]:
print(mean_imputer_mse_5, mean_imputer_mae_5)
print(mean_imputer_mse_15, mean_imputer_mae_15)
print(mean_imputer_mse_25, mean_imputer_mae_25)

0.07323306937038288 0.06267482610738885
0.1904966550508372 0.18340013290942975
0.37528951894229434 0.37093955306196313


#### 3. Last Observation Carried Forward

In [10]:
def locf_imputer(data, missing_idx):
    
    for idx in missing_idx:
        i = 1
        while np.isnan(data[idx-i]):
            i += 1
        data[idx] = data[idx-i]
    return data

locf_imputed_5 = locf_imputer(test_5_flat, missing_idx_5)[missing_idx_5]
locf_imputed_15 = locf_imputer(test_15_flat, missing_idx_15)[missing_idx_15]
locf_imputed_25 = locf_imputer(test_25_flat, missing_idx_25)[missing_idx_25]

locf_imputer_mse_5, locf_imputer_mae_5 = evaluation(Y_true_5, locf_imputed_5)
locf_imputer_mse_15, locf_imputer_mae_15 = evaluation(Y_true_15, locf_imputed_15)
locf_imputer_mse_25, locf_imputer_mae_25 = evaluation(Y_true_25, locf_imputed_25)

In [11]:
print(locf_imputer_mse_5, locf_imputer_mae_5)
print(locf_imputer_mse_15, locf_imputer_mae_15)
print(locf_imputer_mse_25, locf_imputer_mae_25)

0.04809841142147701 0.028607450289030476
0.04889635259149022 0.028722079757440802
0.04996210766678489 0.028991109543814245


#### 4. Linear imputation

In [12]:
def linear_imputer(data, missing_idx):
    
    for idx in missing_idx:
        a = 1
        while np.isnan(data[idx-a]):
            a += 1
        if idx < 1999:
            b = 1
            while np.isnan(data[idx+b]):
                b += 1
        else: b = -1
        data[idx] = (data[idx-a] + data[idx+b]) / 2
    return data

linear_imputed_5 = linear_imputer(test_5_flat, missing_idx_5)[missing_idx_5]
linear_imputed_15 = linear_imputer(test_15_flat, missing_idx_15)[missing_idx_15]
linear_imputed_25 = linear_imputer(test_25_flat, missing_idx_25)[missing_idx_25]

linear_imputer_mse_5, linear_imputer_mae_5 = evaluation(Y_true_5, linear_imputed_5)
linear_imputer_mse_15, linear_imputer_mae_15 = evaluation(Y_true_15, linear_imputed_15)
linear_imputer_mse_25, linear_imputer_mae_25 = evaluation(Y_true_25, linear_imputed_25)

In [13]:
print(linear_imputer_mse_5, linear_imputer_mae_5)
print(linear_imputer_mse_15, linear_imputer_mae_15)
print(linear_imputer_mse_25, linear_imputer_mae_25)

0.04809404130623528 0.028601608612834593
0.04888810757855967 0.028715928254446986
0.04995207058453482 0.02898364163798717


#### 5. K-Nearest Neighbours

In [14]:
def knn_imputer(data):
    return KNNImputer(n_neighbors = 5).fit_transform(data)

knn_test_5 = pd.DataFrame(np.where(test_5 == 1.0, np.nan, test_5))
knn_test_15 = pd.DataFrame(np.where(test_15 == 1.0, np.nan, test_15))
knn_test_25 = pd.DataFrame(np.where(test_25 == 1.0, np.nan, test_25))

knn_imputed_5 = np.array(knn_imputer(knn_test_5)).flatten()
knn_imputed_15 = np.array(knn_imputer(knn_test_15)).flatten()
knn_imputed_25 = np.array(knn_imputer(knn_test_25)).flatten()

knn_imputed_5 = knn_imputed_5[missing_idx_5]
knn_imputed_15 = knn_imputed_15[missing_idx_15]
knn_imputed_25 = knn_imputed_25[missing_idx_25]

knn_imputer_mse_5, knn_imputer_mae_5 = evaluation(Y_true_5, knn_imputed_5)
knn_imputer_mse_15, knn_imputer_mae_15 = evaluation(Y_true_15, knn_imputed_15)
knn_imputer_mse_25, knn_imputer_mae_25 = evaluation(Y_true_25, knn_imputed_25)

In [15]:
print(knn_imputer_mse_5, knn_imputer_mae_5)
print(knn_imputer_mse_15, knn_imputer_mae_15)
print(knn_imputer_mse_25, knn_imputer_mae_25)

0.016465334677439974 0.007034292489167325
0.016426942770228416 0.007178376626211829
0.017240141503471737 0.007726060209106818


#### CNN based imputation

In [17]:
## Reading the testing image data sets
testing_5 = tf.keras.utils.image_dataset_from_directory('Image_Data/Testing/Missing/five', labels = None, color_mode = 'grayscale', 
                                                      batch_size = None, seed = 42, image_size = (200, 200), shuffle = False)
testing_15 = tf.keras.utils.image_dataset_from_directory('Image_Data/Testing/Missing/fifteen', labels = None, color_mode = 'grayscale', 
                                                      batch_size = None, seed = 42, image_size = (200, 200), shuffle = False)
testing_25 = tf.keras.utils.image_dataset_from_directory('Image_Data/Testing/Missing/twenty-five', labels = None, color_mode = 'grayscale', 
                                                      batch_size = None, seed = 42, image_size = (200, 200), shuffle = False)
testing_target = tf.keras.utils.image_dataset_from_directory('Image_Data/Testing/Filled', labels = None, color_mode = 'grayscale', 
                                                      batch_size = None, seed = 42, image_size = (200, 200), shuffle = False)

Found 10 files belonging to 1 classes.
Found 10 files belonging to 1 classes.
Found 10 files belonging to 1 classes.
Found 10 files belonging to 1 classes.


In [18]:
## Scaling the data to be between 0 and 1
testing_5 = testing_5.map(lambda x:(x/255))
testing_15 = testing_15.map(lambda x:(x/255))
testing_25 = testing_25.map(lambda x:(x/255))
testing_target = testing_target.map(lambda x:(x/255))

In [20]:
## Converting tf.Dataset objects to numpy arrays
def dataset_to_numpy(ds):
    images = []
    for i, image in enumerate(tfds.as_numpy(ds)): 
        images.append(image)
    return np.array(images)

X_test_5 = dataset_to_numpy(testing_5)
X_test_15 = dataset_to_numpy(testing_15)
X_test_25 = dataset_to_numpy(testing_25)
Y_test = dataset_to_numpy(testing_target)

In [21]:
## Loading the CNN models
autoencoder_5 = load_model(os.path.join('Models', 'autoencoder_missing_5.h5'))
autoencoder_15 = load_model(os.path.join('Models', 'autoencoder_missing_15.h5'))
autoencoder_25 = load_model(os.path.join('Models', 'autoencoder_missing_25.h5'))

## Predicting on the test sets
testing_preds_5 = autoencoder_5.predict(X_test_5)
testing_preds_15 = autoencoder_15.predict(X_test_15)
testing_preds_25 = autoencoder_25.predict(X_test_25)



In [22]:
## Defining MSE and MAE functions specifically for CNN predictions
def imputed_mse(Y_true, Y_pred):
    return mean_squared_error(Y_true, Y_pred, squared = False)

def imputed_mae(Y_true, Y_pred):
    return mean_absolute_error(Y_true, Y_pred)

def evaluation(X, Y, preds):
    mse = list(); mae = list()
    
    for i in range(len(preds)):
        
        ## Flattening the arrays 
        X_flat = np.squeeze(X[i]).flatten()
        Y_flat = np.squeeze(Y[i]).flatten()
        preds_flat = np.squeeze(preds[i]).flatten()
        
        ## Finding the index of imputed values
        missing_idx = np.argwhere(X_flat == 1.0)
        
        ## Extracting the Y_true and Y_pred values
        Y_true = Y_flat[missing_idx]
        Y_pred = preds_flat[missing_idx]
        
        ## Computing the metrics:
        mse.append(imputed_mse(Y_true, Y_pred))
        mae.append(imputed_mae(Y_true, Y_pred))
        
    return mse, mae

CNN_imputer_mse_5, CNN_imputer_mae_5 = evaluation(X_test_5, Y_test, testing_preds_5)
CNN_imputer_mse_15, CNN_imputer_mae_15 = evaluation(X_test_15, Y_test, testing_preds_15)
CNN_imputer_mse_25, CNN_imputer_mae_25 = evaluation(X_test_25, Y_test, testing_preds_25)

In [23]:
print(np.mean(CNN_imputer_mse_5), np.mean(CNN_imputer_mae_5))
print(np.mean(CNN_imputer_mse_15), np.mean(CNN_imputer_mae_15))
print(np.mean(CNN_imputer_mse_25), np.mean(CNN_imputer_mae_25))

0.05831934 0.041224577
0.05640576 0.04095196
0.04808302 0.031264655
