In [17]:
# Installed with pip
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Included in python 3.10.0 (except numpy, which is installed with pip)
import numpy as np
import random
import math

In [18]:
# Benchmark data sets as (name, relative CSV path) pairs.
# The name is used as the key in `results` and inside the SVD result file names.
data_path_pairs = [
    ('iris', '../data/iris.csv'),
    ('wine', '../data/wine.csv'),
    ('winequality-red', '../data/winequality-red.csv'),
    ('winequality-white', '../data/winequality-white.csv'),
    ('abalone', '../data/abalone.csv'),
]

In [19]:
# Imputers under comparison as (name, estimator) pairs.
# The name becomes the key of each imputer's metrics in the results dictionary.
imputer_pairs = [
    ('mean', SimpleImputer(strategy='mean')),
    ('median', SimpleImputer(strategy='median')),
    ('most_frequent', SimpleImputer(strategy='most_frequent')),
    ('knn', KNNImputer()),
    # Fixed random_state keeps the iterative imputer reproducible between runs.
    ('iterative', IterativeImputer(random_state=42, max_iter=250)),
]

In [20]:
# Fractions of all cells that get replaced by NaN in each experiment.
# Each value is also interpolated as-is into the file name of the SVD result CSV
# that is read later, so its printed form (e.g. 0.1, 0.15) must match those files.
missing_percentages = [0.10, 0.15, 0.20, 0.25]

In [21]:
def getValueListFromIndexListArray(array, indexes):
    """Collect the entries of ``array`` located at the given positions.

    Parameters
    ----------
    array
        Object supporting two-dimensional lookup ``array[row, col]``
        (for example a NumPy array).
    indexes
        Iterable of ``(row, col)`` pairs.

    Returns
    -------
    list
        The looked-up values, in the same order as ``indexes``.
    """
    return [array[row_ix, col_ix] for row_ix, col_ix in indexes]

def GetResultDictionaryForImputation(imputed_array, ix, true_values):
    """Score an imputed array against reference values.

    Parameters
    ----------
    imputed_array
        Array produced by an imputer; read at every position listed in ``ix``.
    ix
        Iterable of ``(row, col)`` positions to evaluate.
    true_values
        Reference values, ordered like ``ix``.

    Returns
    -------
    dict
        ``{'mae': ..., 'rmse': ...}`` where ``mae`` is the mean absolute error and
        ``rmse`` is the square root of the mean squared error.
    """
    # Pull the imputed values out in the same order as the reference values.
    predicted = getValueListFromIndexListArray(imputed_array, ix)

    return {
        'mae': mean_absolute_error(true_values, predicted),
        'rmse': math.sqrt(mean_squared_error(true_values, predicted)),
    }

In [22]:
# Benchmark driver: every data set x missing fraction x imputer.
# Layout: results[data_name][missing_percentage][imputer_name] -> {'mae': ..., 'rmse': ...}
scaler = MinMaxScaler()
results = {}

for data_name, data_path in data_path_pairs:
    # Load the CSV and min-max scale it; the scaler is re-fitted for each data set.
    data_true = pd.read_csv(data_path).to_numpy()
    data_scaled = scaler.fit_transform(data_true)

    # Every (row, col) position of the scaled array, rows outermost.
    n_rows, n_cols = data_scaled.shape
    ix = [(row, col) for row in range(n_rows) for col in range(n_cols)]

    # Reference values used for scoring.
    # NOTE(review): this covers all cells, not only the ones blanked out below, so
    # observed cells are part of the error averages -- confirm this is intended.
    true_values = getValueListFromIndexListArray(data_scaled, ix)

    results_per_dataset = {}
    for missing_percentage in missing_percentages:
        # Work on a copy so the scaled reference data stays intact.
        data_dropped = data_scaled.copy()

        # Blank out a random subset of cells; re-seeding makes the draw repeatable.
        n_dropped = int(round(missing_percentage * len(ix)))
        random.seed(42)
        for row, col in random.sample(ix, n_dropped):
            data_dropped[row, col] = np.nan

        # # Save dropped df locally
        # pd.DataFrame(data_dropped).to_csv(f'../data/svd/{data_name}_{missing_percentage}_dropped.csv', index=None)

        # Fit each imputer on the incomplete data and score the array it returns.
        results_per_percentage = {}
        for imputer_name, imputer in imputer_pairs:
            imputed_array = imputer.fit_transform(data_dropped)
            results_per_percentage[imputer_name] = GetResultDictionaryForImputation(
                imputed_array, ix, true_values
            )

        # The SVD-imputed array is not computed here; it is read from a CSV file
        # and scored with the same metric helper as the other imputers.
        svd_path = f'../data/svd/{data_name}_{missing_percentage}_imputed.csv'
        svd_imputed_array = pd.read_csv(svd_path, index_col=0).to_numpy()
        results_per_percentage['svd'] = GetResultDictionaryForImputation(
            svd_imputed_array, ix, true_values
        )

        results_per_dataset[missing_percentage] = results_per_percentage

    results[data_name] = results_per_dataset


In [26]:
results['iris']

{0.1: {'mean': {'mae': 0.021708126417943313, 'rmse': 0.0844547082111173},
  'median': {'mae': 0.02362445072190835, 'rmse': 0.09319324465803813},
  'most_frequent': {'mae': 0.02461785938480854, 'rmse': 0.108065503281979},
  'knn': {'mae': 0.007312460765850597, 'rmse': 0.03198552443223404},
  'iterative': {'mae': 0.007959949887017906, 'rmse': 0.031932083795739044},
  'svd': {'mae': 0.01305158391548032, 'rmse': 0.06251261822500481}},
 0.15: {'mean': {'mae': 0.0312846372663869, 'rmse': 0.09874734783902572},
  'median': {'mae': 0.033265458254865043, 'rmse': 0.10723451201918283},
  'most_frequent': {'mae': 0.03962688323917137, 'rmse': 0.13501179708135153},
  'knn': {'mae': 0.01368047708725675, 'rmse': 0.04682911081741574},
  'iterative': {'mae': 0.012863322683347323, 'rmse': 0.042501139635590136},
  'svd': {'mae': 0.023182909911519767, 'rmse': 0.09114978788372181}},
 0.2: {'mean': {'mae': 0.04188951663527935, 'rmse': 0.11488468840147235},
  'median': {'mae': 0.04418628374136849, 'rmse': 0.12