# IterativeImputer (Multivariate Regression-based Imputation)
- IterativeImputer predicts each missing value from the values of the other features.
- It treats every column that contains missing values as a regression target and iteratively re-predicts the missing entries.
- It can capture correlations between multiple features, and it can impute missing values using various regression models (RandomForest, LinearRegression, etc.).
- Because the missing values are re-estimated over repeated iterations, more accurate imputation is possible.

In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters
# input_dim is the column count of the synthetic data matrix created further
# down with np.random.rand(1000, input_dim).
input_dim = 5  # Number of features

# Utility function to introduce missing values
def introduce_missing_values(data, missing_rate=0.2):
    """Return a copy of ``data`` with randomly chosen entries set to NaN.

    Every entry is kept independently with probability ``1 - missing_rate``.
    The draw uses NumPy's global random state, so call ``np.random.seed``
    beforehand when reproducible output is needed.

    Args:
        data: Object exposing ``.copy()`` and ``.shape`` (for example a NumPy
            array). The input itself is left unmodified.
        missing_rate: Probability that any single entry is replaced by NaN.

    Returns:
        A ``(data_with_missing, mask)`` tuple. ``mask`` is an integer array
        with the same shape as ``data`` holding 1 where the value was kept
        and 0 where it was replaced by NaN.
    """
    data_with_missing = data.copy()

    # NaN cannot be stored in integer or boolean arrays: integer arrays raise
    # ValueError on assignment and boolean arrays silently store True.
    # Promote those dtypes to float first; float input is left untouched.
    if (
        isinstance(data_with_missing, np.ndarray)
        and data_with_missing.dtype.kind in "biu"
    ):
        data_with_missing = data_with_missing.astype(float)

    # 1 = keep the original value, 0 = blank it out.
    mask = np.random.binomial(1, 1 - missing_rate, data.shape)
    data_with_missing[mask == 0] = np.nan
    return data_with_missing, mask

# Build a reproducible 1000 x input_dim matrix of uniform samples, then blank
# out entries with missing_rate=0.2.
np.random.seed(0)
data = np.random.rand(1000, input_dim)
data_with_missing, mask = introduce_missing_values(data, missing_rate=0.2)

# DataFrame versions: the untouched ground truth and the copy holding NaNs.
original_df = pd.DataFrame(data)
df = pd.DataFrame(data_with_missing)

# IterativeImputer configured with a RandomForestRegressor as its estimator.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(),
    max_iter=10,
    random_state=0,
)

# Fit on the incomplete frame, fill the gaps, and re-wrap the result in a
# DataFrame that carries the same column labels as df.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Show the first five rows of each stage.
print("Original Data:")
print(data[:5])
print("Data with Missing Values:")
print(data_with_missing[:5])
print("Imputed Data:")
print(df_imputed.head(5))

Original Data:
[[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.64589411 0.43758721 0.891773   0.96366276 0.38344152]
 [0.79172504 0.52889492 0.56804456 0.92559664 0.07103606]
 [0.0871293  0.0202184  0.83261985 0.77815675 0.87001215]
 [0.97861834 0.79915856 0.46147936 0.78052918 0.11827443]]
Data with Missing Values:
[[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.64589411 0.43758721 0.891773   0.96366276 0.38344152]
 [       nan 0.52889492 0.56804456 0.92559664 0.07103606]
 [0.0871293  0.0202184  0.83261985 0.77815675        nan]
 [0.97861834 0.79915856 0.46147936 0.78052918 0.11827443]]
Imputed Data:
          0         1         2         3         4
0  0.548814  0.715189  0.602763  0.544883  0.423655
1  0.645894  0.437587  0.891773  0.963663  0.383442
2  0.510192  0.528895  0.568045  0.925597  0.071036
3  0.087129  0.020218  0.832620  0.778157  0.510086
4  0.978618  0.799159  0.461479  0.780529  0.118274




In [2]:
from sklearn.metrics import mean_squared_error

# MSE calculation function
def calculate_mse(true_values, imputed_values):
    """Return the mean squared error between the true and imputed values.

    Both arguments are passed straight through to
    ``sklearn.metrics.mean_squared_error``.
    """
    return mean_squared_error(true_values, imputed_values)

# Score the imputed frame against the ground-truth frame.
# NOTE: despite its name, knn_mse holds the IterativeImputer score; the name
# is kept unchanged so that any later reference to it still resolves.
knn_mse = calculate_mse(true_values=original_df, imputed_values=df_imputed)

print(f"IterativeImputer MSE: {knn_mse}")

IterativeImputer MSE: 0.01865274644098053
