In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer, fetch_california_housing


In [2]:
def inject_missing(X, missing_rate=0.1, random_state=42):
    """Return a copy of ``X`` with randomly chosen entries replaced by NaN.

    Each entry is blanked independently: a uniform draw in [0, 1) is made per
    entry and the entry becomes NaN when its draw is below ``missing_rate``.
    The input itself is never modified.

    Parameters
    ----------
    X : array-like with ``.copy()`` and ``.shape``
        Data to corrupt. Arrays whose dtype cannot hold NaN (for example
        integer or boolean arrays) are converted to float first; arrays that
        are already floating point keep their dtype.
    missing_rate : float, default 0.1
        Per-entry probability of being replaced by NaN.
    random_state : int, default 42
        Seed passed to ``np.random.default_rng`` so the mask is reproducible.

    Returns
    -------
    Same container type as ``X``
        The corrupted copy.
    """
    rng = np.random.default_rng(random_state)
    X_missing = X.copy()

    # NaN cannot be stored in integer/boolean arrays: assigning it there
    # raises ValueError. Promote such inputs to float, but leave dtypes that
    # already support NaN (float16/32/64, complex) untouched.
    dtype = getattr(X_missing, "dtype", None)
    if dtype is not None and not np.issubdtype(dtype, np.inexact):
        X_missing = X_missing.astype(float)

    mask = rng.random(X_missing.shape) < missing_rate
    X_missing[mask] = np.nan
    return X_missing


In [3]:
# Pima Indians Diabetes CSV; column names are supplied here via `names=`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = [
    "Preg", "Glucose", "BP", "Skin", "Insulin",
    "BMI", "DPF", "Age", "Outcome",
]
df = pd.read_csv(url, names=cols)

# "Outcome" is the target; every other column is a feature.
y = df["Outcome"].values
X = df.drop(columns=["Outcome"]).values

# Knock out feature entries at a 15% rate so the imputers have gaps to fill.
X_missing = inject_missing(X, missing_rate=0.15)
print("Missing values:", np.count_nonzero(np.isnan(X_missing)))

# One instance of each imputation strategy, keyed by its display name.
imputers = {
    "SimpleImputer": SimpleImputer(strategy="median"),
    "KNNImputer": KNNImputer(n_neighbors=5),
    "IterativeImputer": IterativeImputer(random_state=42),
}

# Fit every imputer on the corrupted matrix and report how many NaNs remain.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X_missing)
    print(f"{name} → Missing after imputation:", np.count_nonzero(np.isnan(X_imp)))


Missing values: 941
SimpleImputer → Missing after imputation: 0
KNNImputer → Missing after imputation: 0
IterativeImputer → Missing after imputation: 0


In [7]:
# Breast-cancer dataset bundled with scikit-learn: features and target.
data = load_breast_cancer()
X, y = data.data, data.target

# Corrupt the feature matrix at a 10% rate and report how many entries were lost.
X_missing = inject_missing(X, missing_rate=0.1)
print("Missing values:", np.count_nonzero(np.isnan(X_missing)))

# Re-fit each imputer from the `imputers` dict defined in an earlier cell
# and report how many NaNs remain after imputation.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X_missing)
    print(f"{name} → Missing after imputation:", np.count_nonzero(np.isnan(X_imp)))


Missing values: 1653
SimpleImputer → Missing after imputation: 0
KNNImputer → Missing after imputation: 0
IterativeImputer → Missing after imputation: 0




In [8]:
from sklearn.datasets import make_blobs

# Synthetic data: 300 samples, 3 features, drawn around 5 centers.
# The generated labels are not needed here and are discarded.
X, _ = make_blobs(n_samples=300, centers=5, n_features=3, random_state=42)

# Corrupt the matrix at a 20% rate and report how many entries were lost.
X_missing = inject_missing(X, missing_rate=0.2)
print("Missing values:", np.count_nonzero(np.isnan(X_missing)))

# Re-fit each imputer from the `imputers` dict defined in an earlier cell
# and report how many NaNs remain after imputation.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X_missing)
    print(f"{name} → Missing after imputation:", np.count_nonzero(np.isnan(X_imp)))


Missing values: 198
SimpleImputer → Missing after imputation: 0
KNNImputer → Missing after imputation: 0
IterativeImputer → Missing after imputation: 0




In [9]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# --- helper: inject missing values ---
def inject_missing(X, missing_rate=0.12, random_state=42):
    """Return a float copy of ``X`` with random entries replaced by NaN.

    One uniform draw in [0, 1) is made per entry from a generator seeded with
    ``random_state``; entries whose draw is below ``missing_rate`` become NaN.
    The caller's array is left untouched.
    """
    generator = np.random.default_rng(random_state)
    draws = generator.random(X.shape)
    drop_mask = draws < missing_rate

    corrupted = X.astype(float).copy()
    corrupted[drop_mask] = np.nan
    return corrupted

# --- "House pricing" stand-in: synthetic regression data, generated offline ---
# The eight anonymous features can be read as area, rooms, location score,
# age of house, and so on.
X, y = make_regression(n_samples=2000, n_features=8, noise=15.0, random_state=42)

# Corrupt the features at a 15% rate so there is something to impute.
X_missing = inject_missing(X, missing_rate=0.15, random_state=42)
print("House Pricing → Missing values BEFORE:", np.count_nonzero(np.isnan(X_missing)))

# Imputation strategies to compare, keyed by display label.
imputers = {
    "SimpleImputer(median)": SimpleImputer(strategy="median"),
    "KNNImputer(k=5)": KNNImputer(n_neighbors=5),
    "IterativeImputer": IterativeImputer(random_state=42, max_iter=10),
}

# Fit each imputer on the corrupted matrix and report how many NaNs remain.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X_missing)
    remaining = np.count_nonzero(np.isnan(X_imp))
    print(f"{name:22s} → Missing AFTER: {remaining}")


House Pricing → Missing values BEFORE: 2379
SimpleImputer(median)  → Missing AFTER: 0
KNNImputer(k=5)        → Missing AFTER: 0
IterativeImputer       → Missing AFTER: 0


In [10]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# --- helper: inject missing values ---
def inject_missing(X, missing_rate=0.10, random_state=42):
    """Blank out random entries of ``X`` and return the result as floats.

    A float copy of ``X`` is made, then every entry whose seeded uniform draw
    in [0, 1) is below ``missing_rate`` is overwritten with NaN. ``X`` itself
    is not modified.
    """
    result = X.astype(float).copy()
    uniform_draws = np.random.default_rng(random_state).random(result.shape)
    result[uniform_draws < missing_rate] = np.nan
    return result

# --- Car Evaluation dataset from OpenML, loaded as a DataFrame ---
X_raw, y_raw = fetch_openml(name="car", version=1, as_frame=True, return_X_y=True)

# Work on a copy so the raw frame stays intact.
df = X_raw.copy()

# Category label -> numeric code, one lookup table per column.
ordinal_codes = {
    "buying":   {"low": 0, "med": 1, "high": 2, "vhigh": 3},
    "maint":    {"low": 0, "med": 1, "high": 2, "vhigh": 3},
    "doors":    {"2": 2, "3": 3, "4": 4, "5more": 5},
    "persons":  {"2": 2, "4": 4, "more": 5},
    "lug_boot": {"small": 0, "med": 1, "big": 2},
    "safety":   {"low": 0, "med": 1, "high": 2},
}
for column, codes in ordinal_codes.items():
    df[column] = df[column].map(codes)

X = df.values

# Corrupt the encoded features at a 12% rate so there is something to impute.
X_missing = inject_missing(X, missing_rate=0.12, random_state=42)
print("Car Evaluation → Missing values BEFORE:", np.count_nonzero(np.isnan(X_missing)))

# Imputation strategies to compare, keyed by display label.
imputers = {
    "SimpleImputer(most_frequent)": SimpleImputer(strategy="most_frequent"),
    "KNNImputer(k=5)": KNNImputer(n_neighbors=5),
    "IterativeImputer": IterativeImputer(random_state=42, max_iter=10),
}

# Fit each imputer on the corrupted matrix and report how many NaNs remain.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X_missing)
    remaining = np.count_nonzero(np.isnan(X_imp))
    print(f"{name:28s} → Missing AFTER: {remaining}")


Car Evaluation → Missing values BEFORE: 1244
SimpleImputer(most_frequent) → Missing AFTER: 0
KNNImputer(k=5)              → Missing AFTER: 0
IterativeImputer             → Missing AFTER: 0


  warn(
