# Multiple Imputation using MICE

**Date:** 9 October 2025

**Topic:** Multiple Imputation by Chained Equations for missing data handling

This notebook covers:
1. Setup and Dependencies
2. Introduction to MICE Algorithm
3. Titanic Dataset Analysis
4. Life Expectancy Dataset Analysis
5. Planets Dataset Application
6. Comparative Analysis and Validation
7. Best Practices and Recommendations (including limitations)

## 1. Setup and Dependencies

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy import stats
import warnings
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide scikit-learn's ConvergenceWarning, which IterativeImputer
# raises when it stops at max_iter before the imputations have stabilised --
# exactly the signal needed to judge whether max_iter is sufficient.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set random seed for reproducibility
np.random.seed(42)

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")

## 2. Introduction to MICE Algorithm

### 2.1 MICE Methodology Overview

In [None]:
# Static overview of the MICE procedure. The text is assembled as a list of
# output lines and emitted with a single print; a leading "\n" on a heading
# produces the blank line that separates sections.
mice_overview_lines = [
    "MULTIPLE IMPUTATION BY CHAINED EQUATIONS (MICE)",
    "=" * 55,
    # --- how the algorithm proceeds ---
    "\nAlgorithm Steps:",
    "1. Replace missing values with initial estimates (mean/mode)",
    "2. For each variable with missing data:",
    "   a. Set current estimates to missing",
    "   b. Regress variable on other variables",
    "   c. Predict missing values using regression",
    "   d. Update estimates with new predictions",
    "3. Repeat step 2 for specified iterations",
    "4. Output imputed dataset",
    # --- why one would use it ---
    "\nKey Advantages:",
    "- Accounts for uncertainty in imputations",
    "- Preserves relationships between variables",
    "- Flexible for different variable types",
    "- Better than single imputation methods",
    # --- where it falls short ---
    "\nLimitations:",
    "- Assumes data is Missing at Random (MAR)",
    "- Computationally intensive",
    "- Requires careful model specification",
    "- May not work well with high missingness rates",
]
print("\n".join(mice_overview_lines))

## 3. Titanic Dataset Analysis

### 3.1 Data Loading and Exploration

In [None]:
# Load the Titanic passenger table through seaborn
titanic = sns.load_dataset('titanic')

print("Titanic Dataset Information:")
print(f"Shape: {titanic.shape}")
print(f"\nColumns: {list(titanic.columns)}")
print("\nFirst few rows:")
display(titanic.head())

# Per-column missingness: absolute count and share of all rows
print("\nMissing Values Analysis:")
missing_counts = titanic.isna().sum()
missing_percentages = missing_counts / len(titanic) * 100

# Keep only the columns that actually have gaps, worst offenders first
missing_summary = (
    pd.DataFrame({
        'Column': missing_counts.index,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': missing_percentages.values,
    })
    .loc[lambda frame: frame['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print(missing_summary)

# Dataset-wide missingness measured over every cell of the table
total_missing = missing_counts.sum()
total_cells = titanic.size
print(f"\nTotal missing values: {total_missing}")
print(f"Overall missingness rate: {(total_missing/total_cells)*100:.2f}%")

### 3.2 Data Preprocessing for MICE

In [None]:
# Narrow the table to a few predictors that are numeric or trivially encodable,
# then one-hot encode `sex` (dropping the first level to avoid a redundant column).
titanic_subset = titanic.loc[:, ['age', 'fare', 'pclass', 'sex', 'survived']].copy()
titanic_encoded = pd.get_dummies(titanic_subset, columns=['sex'], drop_first=True)

print("Preprocessed Titanic Dataset:")
print(f"Shape: {titanic_encoded.shape}")
print(f"Columns: {list(titanic_encoded.columns)}")
display(titanic_encoded.head())

# Which of the retained columns still contain gaps?
print("\nMissing values in subset:")
subset_missing = titanic_encoded.isna().sum()
print(subset_missing.loc[subset_missing > 0])

# Show a few of the rows the imputer will have to fill in
print("\nSample rows with missing age:")
age_is_missing = titanic_encoded['age'].isna()
missing_age_sample = titanic_encoded.loc[age_is_missing].head(10)
display(missing_age_sample)

### 3.3 MICE Imputation on Titanic Dataset

In [None]:
# Apply MICE imputation with a linear model per column.
# min_value=0 stops the unbounded linear predictor from proposing negative
# values; every column fed to the imputer (age, fare, pclass, survived and the
# sex dummy) is non-negative, so the bound never alters a plausible estimate.
mice_imputer = IterativeImputer(
    max_iter=10,
    random_state=42,
    estimator=LinearRegression(),
    min_value=0
)

# Fit and transform the data. fit_transform returns a bare ndarray, so the
# source index is re-attached explicitly: the boolean-mask lookups below align
# on index labels and must not depend on the index happening to be 0..n-1.
titanic_imputed_array = mice_imputer.fit_transform(titanic_encoded)
titanic_imputed = pd.DataFrame(
    titanic_imputed_array,
    columns=titanic_encoded.columns,
    index=titanic_encoded.index
)

print("MICE Imputation Results - Titanic:")
print("=" * 40)

# Verify no missing values remain
remaining_missing = titanic_imputed.isnull().sum()
print(f"Remaining missing values: {remaining_missing.sum()}")

# Compare original and imputed data.
# The "original" statistics skip NaN (pandas default), i.e. observed ages only.
print("\nComparison - Original vs Imputed Data:")
comparison_data = pd.DataFrame({
    'Original_Age_Mean': titanic_encoded['age'].mean(),
    'Imputed_Age_Mean': titanic_imputed['age'].mean(),
    'Original_Age_Std': titanic_encoded['age'].std(),
    'Imputed_Age_Std': titanic_imputed['age'].std()
}, index=[0])

display(comparison_data.round(3))

# Display sample of imputed values (rows whose age was missing before imputation)
print("\nSample of imputed age values:")
original_missing_mask = titanic_encoded['age'].isnull()
imputed_values = titanic_imputed.loc[original_missing_mask, 'age'].head(10)
print(imputed_values.round(2).tolist())

### 3.4 Titanic Imputation Visualization

In [None]:
# Four-panel view of the Titanic age imputation:
#   (1) observed ages, (2) observed vs. completed ages,
#   (3) box plots of both, (4) the imputed values on their own.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

observed_ages = titanic_encoded['age'].dropna()       # ages present in the raw data
completed_ages = titanic_imputed['age']               # observed + imputed
imputed_ages = titanic_imputed.loc[original_missing_mask, 'age']  # imputed rows only

# Plot 1: Age distribution before imputation
ax1.hist(observed_ages, bins=30, alpha=0.7, color='blue',
         density=True, label='Original Data')
ax1.set_xlabel('Age')
ax1.set_ylabel('Density')
ax1.set_title('Age Distribution - Before Imputation')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Age distribution after imputation
ax2.hist(observed_ages, bins=30, alpha=0.7, color='blue',
         density=True, label='Original Data')
ax2.hist(completed_ages, bins=30, alpha=0.7, color='red',
         density=True, label='Imputed Data')
ax2.set_xlabel('Age')
ax2.set_ylabel('Density')
ax2.set_title('Age Distribution - After Imputation')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Plot 3: Box plots comparison.
# Tick labels are set on the axes rather than through boxplot(labels=...):
# that keyword is deprecated in Matplotlib 3.9 (renamed tick_labels), while
# set_xticklabels behaves the same on old and new releases.
age_data = [observed_ages, completed_ages]
ax3.boxplot(age_data)
ax3.set_xticklabels(['Original', 'Imputed'])
ax3.set_ylabel('Age')
ax3.set_title('Age Distribution Comparison (Box Plot)')
ax3.grid(True, alpha=0.3)

# Plot 4: Imputed values only
ax4.hist(imputed_ages, bins=20, alpha=0.7, color='green',
         edgecolor='black', density=True)
ax4.set_xlabel('Age')
ax4.set_ylabel('Density')
ax4.set_title('Distribution of Imputed Age Values Only')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistical comparison: the same three statistics for each view of the data
print("Statistical Comparison:")
for heading, ages in (
    ("Original age statistics (excluding missing):", observed_ages),
    ("\nImputed dataset age statistics:", completed_ages),
    ("\nImputed values only statistics:", imputed_ages),
):
    print(heading)
    print(f"  Mean: {ages.mean():.2f}")
    print(f"  Median: {ages.median():.2f}")
    print(f"  Std: {ages.std():.2f}")

## 4. Life Expectancy Dataset Analysis

### 4.1 Dataset Loading and Exploration

In [None]:
# Build a synthetic "life expectancy" panel (n_countries x n_years rows) and
# then knock out values at random so there is something to impute.
# In real scenarios, you would load from the actual WHO dataset.
# The global NumPy RNG is re-seeded here, so the draws below are reproducible
# as long as the statements run in this exact order.
np.random.seed(42)
n_countries = 500
n_years = 5
total_rows = n_countries * n_years

# Generate synthetic life expectancy data.
# NOTE: every numeric column is drawn independently of the others, so the
# columns carry no cross-variable relationship for an imputer to exploit.
life_expectancy_data = {
    'Country': np.repeat([f'Country_{i}' for i in range(n_countries)], n_years),
    'Year': np.tile(range(2015, 2020), n_countries),
    'Life_expectancy': np.random.normal(70, 10, total_rows),
    'Adult_Mortality': np.random.exponential(150, total_rows),
    'Alcohol': np.random.exponential(5, total_rows),
    'Hepatitis_B': np.random.normal(85, 15, total_rows),
    'GDP': np.random.exponential(10000, total_rows),
    'Schooling': np.random.normal(12, 3, total_rows)
}

# Create DataFrame (no index given, so rows are labelled 0..total_rows-1)
life_expectancy_df = pd.DataFrame(life_expectancy_data)

# Introduce missing values randomly.
# Each column gets its own missing fraction: missing_rate scaled by a factor
# drawn from U(0.5, 1.5), i.e. between 7.5% and 22.5% of rows.
missing_rate = 0.15
numeric_cols = ['Life_expectancy', 'Adult_Mortality', 'Alcohol', 'Hepatitis_B', 'GDP', 'Schooling']

for col in numeric_cols[1:]:  # Don't make target variable missing
    n_missing = int(len(life_expectancy_df) * missing_rate * np.random.uniform(0.5, 1.5))
    # Positions are sampled without replacement; they double as .loc labels
    # only because the frame still has its default 0..n-1 index.
    missing_indices = np.random.choice(len(life_expectancy_df), n_missing, replace=False)
    life_expectancy_df.loc[missing_indices, col] = np.nan

print("Synthetic Life Expectancy Dataset:")
print(f"Shape: {life_expectancy_df.shape}")
print(f"\nFirst few rows:")
display(life_expectancy_df.head(10))

# Analyze missing patterns: count and percentage of NaN per column,
# displaying only the columns that have at least one missing value
print("\nMissing Values Analysis:")
missing_analysis = life_expectancy_df.isnull().sum()
missing_percentages = (missing_analysis / len(life_expectancy_df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_analysis.index,
    'Missing_Count': missing_analysis.values,
    'Missing_Percentage': missing_percentages.values
})
display(missing_df[missing_df['Missing_Count'] > 0])

### 4.2 Life Expectancy MICE Imputation

In [None]:
# Only the numeric columns go through the imputer; Country and Year are
# re-attached afterwards.
life_expectancy_numeric = life_expectancy_df.loc[:, numeric_cols].copy()

print("Pre-imputation summary:")
print(life_expectancy_numeric.describe())

# Chained-equation imputer with a small random forest as the per-column model
forest_estimator = RandomForestRegressor(n_estimators=10, random_state=42)
mice_imputer_life = IterativeImputer(
    estimator=forest_estimator,
    max_iter=20,
    random_state=42,
)

# Fit, fill the gaps, and rebuild a labelled frame with the identifier columns
life_expectancy_imputed_array = mice_imputer_life.fit_transform(life_expectancy_numeric)
life_expectancy_imputed = pd.DataFrame(
    life_expectancy_imputed_array,
    columns=numeric_cols,
).assign(
    Country=life_expectancy_df['Country'],
    Year=life_expectancy_df['Year'],
)

print("\nPost-imputation verification:")
print(f"Remaining missing values: {life_expectancy_imputed.isnull().sum().sum()}")

print("\nPost-imputation summary:")
print(life_expectancy_imputed[numeric_cols].describe())

### 4.3 Life Expectancy Imputation Validation

In [None]:
# One histogram panel per numeric column: observed values (blue) overlaid with
# the completed column after imputation (red).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

shared_hist_style = dict(bins=30, alpha=0.6, density=True)

for ax, col in zip(axes, numeric_cols):
    observed = life_expectancy_numeric[col].dropna()
    completed = life_expectancy_imputed[col]

    ax.hist(observed, color='blue', label='Original', **shared_hist_style)
    ax.hist(completed, color='red', label='Imputed', **shared_hist_style)

    ax.set(title=f'{col} Distribution Comparison', xlabel=col, ylabel='Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


def _moment_comparison(col):
    """Return mean/std of the observed vs. completed values of `col` and their absolute gaps."""
    observed = life_expectancy_numeric[col].dropna()
    completed = life_expectancy_imputed[col]
    return {
        'Variable': col,
        'Original_Mean': observed.mean(),
        'Imputed_Mean': completed.mean(),
        'Original_Std': observed.std(),
        'Imputed_Std': completed.std(),
        'Mean_Diff': abs(observed.mean() - completed.mean()),
        'Std_Diff': abs(observed.std() - completed.std()),
    }


# Statistical comparison table, one row per numeric column
comparison_stats = [_moment_comparison(col) for col in numeric_cols]

comparison_df = pd.DataFrame(comparison_stats)
print("\nStatistical Comparison - Life Expectancy Dataset:")
display(comparison_df.round(3))

## 5. Planets Dataset Application

### 5.1 Planets Dataset Analysis

In [None]:
# Load the planets table through seaborn
planets_df = sns.load_dataset("planets")

print("Planets Dataset Information:")
print(f"Shape: {planets_df.shape}")
print(f"\nColumns: {list(planets_df.columns)}")
display(planets_df.head())

# Per-column missing counts and percentages; show only columns with gaps
print("\nMissing Values in Planets Dataset:")
planets_missing = planets_df.isna().sum()
planets_missing_df = pd.DataFrame({
    'Column': planets_missing.index,
    'Missing_Count': planets_missing.values,
    'Missing_Percentage': planets_missing.values / len(planets_df) * 100,
})
has_missing = planets_missing_df['Missing_Count'] > 0
display(planets_missing_df.loc[has_missing])

# The three numeric measurements that will be passed to the imputer
numeric_planets = planets_df.loc[:, ['orbital_period', 'mass', 'distance']].copy()
print("\nNumeric subset for imputation:")
print(f"Shape: {numeric_planets.shape}")
display(numeric_planets.head(10))

# A peek at rows where at least one of the three values is absent
print("\nRows with missing values (sample):")
row_has_gap = numeric_planets.isna().any(axis='columns')
missing_rows = numeric_planets.loc[row_has_gap]
display(missing_rows.head(10))

### 5.2 Planets MICE Imputation

In [None]:
# Chained-equation imputer for the planets measurements, using a 20-tree
# random forest as the per-column regressor
planets_forest = RandomForestRegressor(n_estimators=20, random_state=42)
mice_imputer_planets = IterativeImputer(
    estimator=planets_forest,
    max_iter=15,
    random_state=42,
)

# Fit and fill, then restore the column names on the returned array
planets_imputed_array = mice_imputer_planets.fit_transform(numeric_planets)
planets_imputed = pd.DataFrame(planets_imputed_array, columns=numeric_planets.columns)

print("Planets MICE Imputation Results:")
print("=" * 40)

# Sanity check: gaps before vs. after
print(f"Original missing values: {numeric_planets.isnull().sum().sum()}")
print(f"Remaining missing values: {planets_imputed.isnull().sum().sum()}")

# Summary statistics on either side of the imputation
print("\nBefore Imputation:")
display(numeric_planets.describe())

print("\nAfter Imputation:")
display(planets_imputed.describe())

# A handful of filled-in values for the columns of interest
print("\nExample imputed values:")
mass_missing_mask = numeric_planets['mass'].isna()
if mass_missing_mask.any():
    sample_mass = planets_imputed.loc[mass_missing_mask, 'mass'].head().round(3).tolist()
    print(f"Sample imputed mass values: {sample_mass}")

distance_missing_mask = numeric_planets['distance'].isna()
if distance_missing_mask.any():
    sample_distance = planets_imputed.loc[distance_missing_mask, 'distance'].head().round(2).tolist()
    print(f"Sample imputed distance values: {sample_distance}")

### 5.3 Planets Imputation Visualization

In [None]:
# Create visualization for planets imputation: one panel per variable with the
# observed values overlaid by the completed (observed + imputed) column.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

variables = ['orbital_period', 'mass', 'distance']
colors = ['blue', 'green', 'orange']   # per-variable colour of the imputed histogram
ORIGINAL_COLOR = 'gray'                # neutral colour so it never collides with `colors`

for i, (var, color) in enumerate(zip(variables, colors)):
    ax = axes[i]

    # Original data (non-missing only)
    original_data = numeric_planets[var].dropna()

    # Use log scale for better visualization due to wide ranges.
    # The observed histogram is drawn in ORIGINAL_COLOR: drawing it in blue made
    # it indistinguishable from the blue imputed histogram in the first panel.
    if len(original_data) > 0:
        ax.hist(np.log10(original_data + 1), bins=25, alpha=0.6,
                color=ORIGINAL_COLOR, density=True, label='Original')

    ax.hist(np.log10(planets_imputed[var] + 1), bins=25, alpha=0.6,
            color=color, density=True, label='Imputed')

    ax.set_title(f'{var} Distribution (Log Scale)')
    ax.set_xlabel(f'log10({var} + 1)')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation analysis
print("\nCorrelation Analysis:")
print("Original data correlations:")
original_corr = numeric_planets.corr()
print(original_corr.round(3))

print("\nImputed data correlations:")
imputed_corr = planets_imputed.corr()
print(imputed_corr.round(3))

# Correlation preservation is judged on the off-diagonal entries only: the
# diagonal difference is always 0 (each variable correlates 1.0 with itself)
# and would otherwise drag the reported average down.
print("\nCorrelation preservation:")
corr_diff = (original_corr - imputed_corr).abs()
off_diagonal = ~np.eye(len(corr_diff), dtype=bool)
pairwise_diff = corr_diff.to_numpy()[off_diagonal]
print(f"Maximum correlation difference: {np.nanmax(pairwise_diff):.4f}")
print(f"Average correlation difference: {np.nanmean(pairwise_diff):.4f}")

## 6. Comparative Analysis and Validation

### 6.1 Comparison with Simple Imputation Methods

In [None]:
# Compare MICE with simple imputation methods using Titanic dataset
print("IMPUTATION METHODS COMPARISON")
print("=" * 40)

# Prepare test data (Titanic age column plus two predictors for MICE)
test_data = titanic_encoded[['age', 'fare', 'pclass']].copy()
original_age = test_data['age'].copy()

# Method 1: Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
age_mean_imputed = mean_imputer.fit_transform(test_data[['age']])[:, 0]

# Method 2: Median imputation
median_imputer = SimpleImputer(strategy='median')
age_median_imputed = median_imputer.fit_transform(test_data[['age']])[:, 0]

# Method 3: MICE imputation (default estimator; age is column 0 of test_data)
mice_imputer_comp = IterativeImputer(max_iter=10, random_state=42)
age_mice_imputed = mice_imputer_comp.fit_transform(test_data)[:, 0]

# Create comparison DataFrame.
# Every method is summarised from a plain float array with the SAME estimator
# (sample standard deviation, ddof=1). Mixing pandas .std() (ddof=1) with
# np.std() on arrays (ddof=0) would make the Std column incomparable.
summary_inputs = {
    'Original': original_age.dropna().to_numpy(dtype=float),
    'Mean Imputation': age_mean_imputed,
    'Median Imputation': age_median_imputed,
    'MICE': age_mice_imputed,
}
comparison_results = pd.DataFrame({
    'Method': list(summary_inputs),
    'Mean': [values.mean() for values in summary_inputs.values()],
    'Std': [values.std(ddof=1) for values in summary_inputs.values()],
    'Min': [values.min() for values in summary_inputs.values()],
    'Max': [values.max() for values in summary_inputs.values()],
})

print("\nStatistical Comparison of Imputation Methods:")
display(comparison_results.round(3))

# Visualize comparison: one histogram per method on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

methods_data = {
    'Original (no missing)': original_age.dropna(),
    'Mean Imputation': age_mean_imputed,
    'Median Imputation': age_median_imputed,
    'MICE': age_mice_imputed
}

colors = ['blue', 'red', 'green', 'purple']

for ax, color, (method, data) in zip(axes.ravel(), colors, methods_data.items()):
    # Convert to ndarray first so the title statistics use the same ddof=1
    # estimator for the pandas Series and the NumPy arrays alike.
    values = np.asarray(data, dtype=float)
    ax.hist(values, bins=25, alpha=0.7, color=color, density=True)
    ax.set_title(f'{method}\nMean: {values.mean():.1f}, Std: {values.std(ddof=1):.1f}')
    ax.set_xlabel('Age')
    ax.set_ylabel('Density')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 6.2 Validation and Quality Assessment

In [None]:
# Perform validation using artificial missingness: hide known mass values,
# impute them, and compare the imputations with the hidden truth.
print("MICE VALIDATION ANALYSIS")
print("=" * 30)

# Create complete dataset for validation.
# dropna() leaves gaps in the index, so it is reset to 0..n-1; all row
# selection below is positional (iloc / ndarray indexing) so the rows that are
# masked are exactly the rows that are read back from the imputed array.
validation_data = (
    planets_df[['orbital_period', 'mass', 'distance']]
    .dropna()
    .reset_index(drop=True)
)
print(f"Complete cases for validation: {len(validation_data)}")

if len(validation_data) > 50:  # Ensure we have enough data for validation
    # Introduce artificial missingness
    validation_data_missing = validation_data.copy()
    n_samples = len(validation_data_missing)
    mass_pos = validation_data_missing.columns.get_loc('mass')  # column position of `mass`

    # Randomly remove 20% of mass values (row positions, sampled without replacement)
    np.random.seed(42)
    missing_indices = np.random.choice(n_samples, int(n_samples * 0.2), replace=False)
    true_values = validation_data_missing.iloc[missing_indices, mass_pos].to_numpy(copy=True)
    validation_data_missing.iloc[missing_indices, mass_pos] = np.nan

    # Apply MICE imputation and read back the same positions that were masked
    mice_validator = IterativeImputer(max_iter=10, random_state=42)
    validation_imputed = mice_validator.fit_transform(validation_data_missing)
    imputed_values = validation_imputed[missing_indices, mass_pos]

    # Calculate validation metrics
    mae = mean_absolute_error(true_values, imputed_values)
    mse = mean_squared_error(true_values, imputed_values)
    rmse = np.sqrt(mse)

    # Calculate correlation between hidden truth and imputations
    correlation = np.corrcoef(true_values, imputed_values)[0, 1]

    print("\nValidation Results (Artificial Missingness):")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Root Mean Square Error: {rmse:.4f}")
    print(f"Correlation (true vs imputed): {correlation:.4f}")

    # Visualization of validation
    plt.figure(figsize=(12, 5))

    # Scatter plot of true vs imputed, with the y = x reference line
    plt.subplot(1, 2, 1)
    plt.scatter(true_values, imputed_values, alpha=0.6)
    plt.plot([true_values.min(), true_values.max()],
             [true_values.min(), true_values.max()], 'r--', lw=2)
    plt.xlabel('True Values')
    plt.ylabel('Imputed Values')
    plt.title(f'True vs Imputed Values\nCorrelation: {correlation:.3f}')
    plt.grid(True, alpha=0.3)

    # Residuals plot
    plt.subplot(1, 2, 2)
    residuals = true_values - imputed_values
    plt.scatter(true_values, residuals, alpha=0.6)
    plt.axhline(0, color='r', linestyle='--', lw=2)
    plt.xlabel('True Values')
    plt.ylabel('Residuals (True - Imputed)')
    plt.title(f'Imputation Residuals\nRMSE: {rmse:.3f}')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print("Insufficient complete cases for validation.")

## 7. Best Practices and Recommendations

### 7.1 Summary and Guidelines

In [None]:
print("MICE IMPLEMENTATION SUMMARY")
print("=" * 40)

print("\nDATASETS PROCESSED:")
# (name, number of rows, label for the affected variables, missing-cell count)
datasets_summary = [
    ('Titanic', titanic_encoded.shape[0], 'age', titanic_encoded['age'].isnull().sum()),
    ('Life Expectancy', life_expectancy_df.shape[0], 'multiple', life_expectancy_numeric.isnull().sum().sum()),
    ('Planets', planets_df.shape[0], 'mass/distance', numeric_planets.isnull().sum().sum())
]

# Number of cells each missing count above was taken over. The rate must be
# divided by this, not by the row count: the Life Expectancy and Planets counts
# are summed across several columns, so rows alone would overstate the rate.
cells_examined = {
    'Titanic': titanic_encoded['age'].size,
    'Life Expectancy': life_expectancy_numeric.size,
    'Planets': numeric_planets.size,
}

for name, n_rows, missing_vars, n_missing in datasets_summary:
    n_cells = cells_examined[name]
    # Named missing_pct (not missing_rate) so the 0.15 rate configured for the
    # synthetic dataset is not overwritten by this loop.
    missing_pct = (n_missing / n_cells) * 100 if n_cells else 0.0
    print(f"{name:15}: {n_rows:4} rows, {missing_vars:15}, {n_missing:3} missing ({missing_pct:5.1f}%)")

# Closing notes, printed section by section; a leading "\n" on a heading
# produces the blank line between sections.
summary_sections = [
    ("\nKEY FINDINGS:", [
        "1. MICE preserves distributional properties better than simple imputation",
        "2. Random Forest estimator often performs better than Linear Regression",
        "3. Iterative process allows for complex variable relationships",
        "4. Correlation structure is generally well-preserved",
        "5. Performance depends on missingness mechanism and rate",
    ]),
    ("\nBEST PRACTICES:", [
        "• Analyze missingness patterns before imputation",
        "• Choose appropriate estimator (Linear vs Tree-based)",
        "• Set sufficient iterations (typically 10-20)",
        "• Validate with artificial missingness when possible",
        "• Consider multiple imputation for uncertainty quantification",
        "• Monitor convergence of imputed values",
    ]),
    ("\nLIMITATIONS TO CONSIDER:", [
        "• Assumes Missing at Random (MAR) mechanism",
        "• Computationally intensive for large datasets",
        "• May not work well with high missingness rates (>50%)",
        "• Requires careful handling of categorical variables",
        "• Can introduce artificial precision",
    ]),
    ("\nALTERNATIVE APPROACHES:", [
        "• K-Nearest Neighbors (KNN) imputation",
        "• Matrix factorization methods",
        "• Deep learning-based imputation",
        "• Domain-specific imputation strategies",
        "• Multiple imputation with proper pooling",
    ]),
]

for section_heading, section_items in summary_sections:
    print(section_heading)
    for section_item in section_items:
        print(section_item)