In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from sklearn import preprocessing as pr
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost

ModuleNotFoundError: No module named 'xgboost'

In [9]:
#importing dataset into dataframe
set_clean = pd.read_csv("./creditcard.csv")

#only imputing and removing attribute values
set_clean.drop("Class", axis=1, inplace=True)

#copying dataset to keep clean set for evaluation
set = set_clean.copy()

#removing 5% of each column ranodmly for fair imputation analysis
for col in set.columns:
    set.loc[set.sample(frac=0.05, random_state=1).index, col] = np.nan

In [10]:
#Since KNN and XGBoost imputation methods require data to be normalized, clean and imputed datasets are normalized for 
#fair comparison between diferent approaches
scaler = MinMaxScaler()
set_norm = set.copy(deep=True)
set_norm = pd.DataFrame(scaler.fit_transform(set_norm), columns = set_norm.columns)
set_clean_norm = set_clean.copy(deep=True)
set_clean_norm = pd.DataFrame(scaler.fit_transform(set_clean_norm), columns = set_clean_norm.columns)

In [12]:
#First will try a simple imputation method: filling with the last observable value
set_ffill_norm = set_norm.copy(deep=True)
set_ffill_norm.fillna(method='ffill',inplace=True)
rmse = np.sqrt(mse(set_ffill_norm, set_clean_norm))
print("Norm. ffill RMSE: {0:.5g}".format(rmse))
del set_ffill_norm
del rmse

Norm. ffill RMSE: 0.012244


In [13]:
#Trying linear interpolation, predicted to be bad since the datais not linear
set_lin_norm = set_norm.copy(deep=True)
set_lin_norm.interpolate(limit_direction="both",inplace=True)
rmse = np.sqrt(mse(set_lin_norm, set_clean_norm))
print("Norm. linear interpolation RMSE: {0:.5g}".format(rmse))
del set_lin_norm
del rmse

Norm. linear interpolation RMSE: 0.010612


In [None]:
#Trying KNN imputation algorithm
#here, n_neighbors is a hyperparameter.
#Running this code for different values gave the same result.
set_knn_norm = set_norm.copy(deep=True)
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
set_knn_norm = knn_imputer.fit_transform(set_knn_norm)
rmse = np.sqrt(mse(set_knn_norm, set_clean_norm))
print("KNN RMSE: {0:.5g}".format(rmse))
del set_knn_norm
del knn_imputer
del rmse


In [None]:
#Now attempting XGB regressor.
#Default hyperparameters gave the best result on a smaller version of the dataset, compared to tuned versions.
iimp = IterativeImputer(
    estimator = xgboost.XGBRegressor(),
    random_state = 1,
    verbose = 2,
)
set_xgb_norm = set_norm.copy(deep=True)
set1_xgb_norm = iimp.fit_transform(set_xgb_norm)

rmse = np.sqrt(mse(set_xgb_norm, set_clean_norm))
print("XGBoost RMSE: {0:.5g}".format(rmse))
del set_xgb_norm
del rmse