# 1. Comparisson of Imputers using RMSE
## 1.1. Import all needed packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from fireman_imputation.src import utils
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn import metrics

import matplotlib.pyplot as plt

## 1.2. Main
### 1.2.1 Load the data

In [2]:
# load and scale the data
data_orig = pd.read_csv('data/spam.csv',index_col=False)
data = data_orig.values

# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)
data_scaled = scaler.transform(data)

# create missing data mask
data_scaled_missing, mask = utils.MCARgen(data_scaled, 0.2)

# divide the data to train/test
# by default shuffles data, if pandas is passed the index shows shuffle result
data_missing_train, data_missing_test, data_train, data_test = train_test_split(data_scaled_missing, data_scaled, train_size=0.9)

### 1.2.2 Run sklearn imputers

In [3]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_knn = KNNImputer(n_neighbors=2)
imp_iter = IterativeImputer(max_iter=10, random_state=0)

imp_mean.fit(data_missing_train)
imp_median.fit(data_missing_train)
imp_most_frequent.fit(data_missing_train)
imp_knn.fit(data_missing_train)
imp_iter.fit(data_missing_train)

data_mean = imp_mean.transform(data_missing_test)
data_median = imp_median.transform(data_missing_test)
data_most_frequent = imp_most_frequent.transform(data_missing_test)
data_knn = imp_knn.transform(data_missing_test)
data_iter = imp_iter.transform(data_missing_test)

data_mean_rmse = metrics.mean_squared_error(data_test, data_mean, squared=True)
data_median_rmse = metrics.mean_squared_error(data_test, data_median, squared=True)
data_most_frequent_rmse = metrics.mean_squared_error(data_test, data_most_frequent, squared=True)
data_knn_rmse = metrics.mean_squared_error(data_test, data_knn, squared=True)
data_iter_rmse = metrics.mean_squared_error(data_test, data_iter, squared=True)



In [41]:
print('RMSE between test and imputed data:\nMean = {}\nMedian = {}\nMost frequent = {}\n'
      'KNN = {}\nIterative = {}'.format(data_mean_rmse, data_median_rmse, 
                                        data_most_frequent_rmse, data_knn_rmse, data_iter_rmse))

RMSE between test and imputed data:
Mean = 0.0007025419362472685
Median = 0.0007663804477698816
Most frequent = 0.0008034067442840705
KNN = 0.000815383624897717
Iterative = 0.0006423308784567793
