In [1]:
from IPython.display import Markdown, Math, Latex, IFrame
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import mercury as mr

# Sidebar toggle that lets the viewer reveal or hide the notebook's code cells.
show_code = mr.Checkbox(label="Show code", value=False)

# The checkbox state is forwarded to the app; previously `show_code` was
# hard-coded to False, so toggling the checkbox had no effect.
app = mr.App(title="[DEMO] DEV Project 2: Missing Data Imputation",
             description="""This notebook shows the impact of missing data on model performance and 
             investigates popular techniques of data imputation to mitigate impact of missing data on model performance""",
             show_code=show_code.value,
             show_sidebar=True)

# Text widget for the random seed (kept as text so non-numeric seeds are allowed).
seed_widget = mr.Text(value="42", label="Random Seed", rows=1)

mercury.Checkbox

mercury.Text

In [3]:
import zlib

# Resolve the seed typed into the sidebar into an integer in [0, 2**32 - 1],
# the range accepted by np.random.seed.
RANDOM_SEED = 42  # used when the seed field is left empty
seed = seed_widget.value
range_max = 4294967295  # 2**32 - 1
if seed != "":
    try:
        # Numeric input: fold into the valid range so negative or oversized
        # integers do not make np.random.seed raise later on.
        RANDOM_SEED = int(seed) % (range_max + 1)
    except (TypeError, ValueError):
        # Non-numeric input: the built-in hash() of a str is salted per
        # interpreter process, so it would yield a different seed after every
        # kernel restart. CRC32 is stable and already lies in [0, 2**32 - 1].
        RANDOM_SEED = zlib.crc32(str(seed).encode("utf-8"))

42


In [6]:
from sklearn.datasets import load_breast_cancer, load_wine

# Dataset picker shown in the sidebar; any choice other than "Breast Cancer"
# loads the Wine dataset.
dataset = mr.Select(
    label="Choose a dataset",
    choices=["Breast Cancer", "Wine"],
    value="Breast Cancer",
)

load_selected = load_breast_cancer if dataset.value == "Breast Cancer" else load_wine
data = load_selected()

mercury.Select

In [9]:
# Feature matrix and target vector of the selected dataset.
X, y = data["data"], data["target"]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the split seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [20]:
# Baseline forest trained on the complete (no missing values) training data.
# Seeded like rfc1/rfc2 further down so a re-run with the same seed gives
# the same model.
rfc = RandomForestClassifier(random_state=RANDOM_SEED)
rfc.fit(X_train, y_train)

print(dataset.value)

(142, 13)


In [13]:
# Labels of the imputation strategies offered in the sidebar; all of them are
# selected by default.
imputation_choices = [
    "Simple imputation by mean",
    "Simple imputation by median",
    "Simple imputation by mode",
    "MICE",
]

# Two independent missing-data ratios (fractions in [0, 1]) to compare.
missing_ratio_slider_1 = mr.Slider(label="Missing Data", value=0.03, min=0, max=1, step=0.01)
missing_ratio_slider_2 = mr.Slider(label="Missing Data", value=0.40, min=0, max=1, step=0.01)

imputation_method = mr.MultiSelect(
    label="Imputation method(s)",
    value=list(imputation_choices),
    choices=list(imputation_choices),
)

mercury.Slider

mercury.Slider

mercury.MultiSelect

In [14]:
missing_ratio_1 = missing_ratio_slider_1.value
missing_ratio_2 = missing_ratio_slider_2.value


def _draw_missing_indices(ratio):
    """Pick flat indices into X_train to blank out for the given ratio.

    The global NumPy generator is re-seeded with RANDOM_SEED before the draw,
    so every call starts from the same generator state.
    """
    np.random.seed(RANDOM_SEED)
    return np.random.choice(X_train.size, int(X_train.size * ratio), replace=False)


def _mask_cells(flat_indices):
    """Return a copy of X_train with the given flat positions set to NaN."""
    masked = X_train.copy()
    masked.flat[flat_indices] = np.nan
    return masked


missing_indices_1 = _draw_missing_indices(missing_ratio_1)
missing_indices_2 = _draw_missing_indices(missing_ratio_2)

X_train_missing_1 = _mask_cells(missing_indices_1)
X_train_missing_2 = _mask_cells(missing_indices_2)

In [15]:
from sklearn.metrics import confusion_matrix

In [16]:
# impute missing data using simple imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy="mean")
median_imputer = SimpleImputer(strategy="median")
mode_imputer = SimpleImputer(strategy="most_frequent")
mice_imputer = IterativeImputer()

# Widget label -> imputer. A single lookup replaces the per-method branches,
# so both missing-data scenarios are always imputed with the SAME strategy.
# (The former "MICE" branch imputed the second scenario with the mode imputer.)
# An unknown label now raises KeyError instead of silently reusing the
# matrices left over from the previous iteration.
imputers = {
    "Simple imputation by mean": mean_imputer,
    "Simple imputation by median": median_imputer,
    "Simple imputation by mode": mode_imputer,
    "MICE": mice_imputer,
}

# One classifier per missing-data scenario; each is refit for every method.
rfc1 = RandomForestClassifier(random_state=RANDOM_SEED)
rfc2 = RandomForestClassifier(random_state=RANDOM_SEED)

# results[method] holds the test accuracy (in %, rounded to 2 decimals) and
# the confusion matrix for scenario 1 and scenario 2.
results = {}

for method in imputation_method.value:
    imputer = imputers[method]
    # fit_transform refits the imputer on each matrix, so the two scenarios
    # do not share fitted statistics.
    X_train_imputed_1 = imputer.fit_transform(X_train_missing_1)
    X_train_imputed_2 = imputer.fit_transform(X_train_missing_2)

    rfc1.fit(X_train_imputed_1, y_train)
    rfc2.fit(X_train_imputed_2, y_train)

    # Predict once per model and reuse for both metrics.
    y_pred_1 = rfc1.predict(X_test)
    y_pred_2 = rfc2.predict(X_test)

    results[method] = {
        "accuracy_1": round(accuracy_score(y_test, y_pred_1) * 100, 2),
        "confusion_matrix_1": confusion_matrix(y_test, y_pred_1),
        "accuracy_2": round(accuracy_score(y_test, y_pred_2) * 100, 2),
        "confusion_matrix_2": confusion_matrix(y_test, y_pred_2),
    }

In [17]:
# Heading for the first missing-data scenario, followed by one number tile per
# imputation method.
display(Markdown(f"# Accuracy in % (missing data: {round(missing_ratio_1 * 100, 2)}%)"))
accuracy_tiles_1 = [
    mr.NumberBox(data=scores["accuracy_1"], title=f"({method_name})")
    for method_name, scores in results.items()
]
mr.NumberBox(accuracy_tiles_1)

# Accuracy in % (missing data: 3.0%)

In [18]:
# Heading for the second missing-data scenario, followed by one number tile per
# imputation method. Each tile also shows the accuracy difference relative to
# the first scenario, truncated to a whole number by int().
display(Markdown(f"# Accuracy in % (missing data: {round(missing_ratio_2 * 100, 2)}%)"))
accuracy_tiles_2 = [
    mr.NumberBox(
        data=scores["accuracy_2"],
        percent_change=int(scores["accuracy_2"] - scores["accuracy_1"]),
        title=f"({method_name})",
    )
    for method_name, scores in results.items()
]
mr.NumberBox(accuracy_tiles_2)

# Accuracy in % (missing data: 40.0%)