In [3]:
import sys
sys.path.append("../")
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer as SklearnSimpleImputer
from sklearn.datasets import load_diabetes
from preprocessing.SimpleImputer import SimpleImputer  # Adjust import path as needed

def test_simple_imputer():
    """Compare the project's mean ``SimpleImputer`` against scikit-learn's.

    Loads the diabetes dataset, blanks out a reproducible set of
    (row, column) cells with NaN, imputes the result with both
    implementations using the ``'mean'`` strategy, prints the per-feature
    missing-value counts before and after, and checks that the two imputed
    arrays agree.

    Raises:
        AssertionError: If either imputer leaves missing values behind or
            the two imputed results differ.
    """
    data = load_diabetes()
    # Work on a copy so the array held by the loaded dataset bundle is not
    # modified when NaNs are injected below.
    X = data.data.copy()

    # Introduce missing values randomly
    rng = np.random.default_rng(seed=42)
    n_missing = int(X.shape[0] * 0.1)
    missing_samples = rng.choice(X.shape[0], size=n_missing, replace=False)
    # Paired integer indexing needs exactly one column index per selected row,
    # so the size is tied to the row selection rather than recomputed.
    missing_features = rng.choice(X.shape[1], size=missing_samples.size)
    X[missing_samples, missing_features] = np.nan

    # Convert to DataFrame for easier manipulation and visualization
    df = pd.DataFrame(X, columns=data.feature_names)
    print("Number of missing values per feature before imputation:\n", df.isna().sum())

    # Each imputer gets its own copy of the input: if one of them were to
    # fill values in place, the other would otherwise receive already-imputed
    # data and the comparison below would pass trivially.

    # Custom SimpleImputer
    custom_imputer = SimpleImputer(strategy='mean')
    X_custom_imputed = custom_imputer.fit_transform(X.copy())

    # Sklearn SimpleImputer
    sklearn_imputer = SklearnSimpleImputer(strategy='mean')
    X_sklearn_imputed = sklearn_imputer.fit_transform(X.copy())

    # Convert the imputed arrays back to DataFrame to check missing values after imputation
    df_custom_imputed = pd.DataFrame(X_custom_imputed, columns=data.feature_names)
    df_sklearn_imputed = pd.DataFrame(X_sklearn_imputed, columns=data.feature_names)

    custom_missing = df_custom_imputed.isna().sum()
    sklearn_missing = df_sklearn_imputed.isna().sum()
    print("Number of missing values per feature after custom imputation:\n", custom_missing)
    print("Number of missing values per feature after sklearn imputation:\n", sklearn_missing)

    # Imputation must actually remove every NaN; checking this explicitly
    # means the value comparison below never has to treat NaN == NaN as a match.
    assert custom_missing.sum() == 0, "Custom imputer left missing values"
    assert sklearn_missing.sum() == 0, "Sklearn imputer left missing values"

    # Check if both imputed datasets are identical
    assert np.allclose(X_custom_imputed, X_sklearn_imputed), "Mismatch in imputed results"

    print("Custom SimpleImputer tests passed successfully!")
    print("Imputed data comparison done.")

# Run the comparison immediately when this cell/script is executed.
test_simple_imputer()


Number of missing values per feature before imputation:
 age    3
sex    4
bmi    4
bp     2
s1     5
s2     6
s3     6
s4     7
s5     7
s6     0
dtype: int64
Number of missing values per feature after custom imputation:
 age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64
Number of missing values per feature after sklearn imputation:
 age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64
Custom SimpleImputer tests passed successfully!
Imputed data comparison done.
