In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
# enable_iterative_imputer must be imported before IterativeImputer (experimental API)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
# Read the raw diagnostic-test table from disk.
# The path is relative to the notebook's working directory; adjust it if the file lives elsewhere.
input_path = '../data/raw/uc_diagnostic_tests.csv'
# decimal=',' tells pandas to treat a comma as the decimal separator when parsing numbers
df = pd.read_csv(input_path, decimal=',')

# Quick sanity summary: table dimensions and the total number of empty cells
print("Original data shape:", df.shape)
print("Missing values before imputation:", df.isnull().to_numpy().sum())

Original data shape: (252, 56)
Missing values before imputation: 3939


In [4]:
# Initialize MICE (IterativeImputer)
# max_iter=20 caps the number of imputation rounds;
# random_state=42 makes repeated runs produce the same result
mice_imputer = IterativeImputer(max_iter=20, random_state=42)

# Only numeric columns are passed to the imputer
numeric_cols = df.select_dtypes(include=[np.number]).columns

# With its default settings IterativeImputer drops features that have no observed
# values at fit time, so the transformed array would have fewer columns than
# numeric_cols and the assignment below would fail with a shape mismatch.
# Impute only the columns that contain at least one observed value.
has_observed = df[numeric_cols].notna().any(axis=0).to_numpy(dtype=bool)
imputable_cols = numeric_cols[has_observed]
skipped_cols = numeric_cols[~has_observed]

# Work on a copy so the raw dataframe `df` stays untouched
df_mice = df.copy()

# Run the imputation (fit_transform) and assign values back to the dataframe
print("Running MICE imputation...")
if len(imputable_cols) > 0:
    df_mice[imputable_cols] = mice_imputer.fit_transform(df[imputable_cols])
if len(skipped_cols) > 0:
    # These columns carry no information to impute from; they are left as-is
    print("Skipped entirely-empty numeric columns:", list(skipped_cols))

print("Imputation complete.")
# Counts remaining NaNs across ALL columns (non-numeric and skipped columns included)
print("Missing values after imputation:", df_mice.isna().sum().sum())

Running MICE imputation...
Imputation complete.
Missing values after imputation: 0


In [5]:
# Destination for the imputed table (relative to the notebook's working directory)
output_path = '../data/processed/uc_diagnostic_tests_mice.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout where data/processed is absent)
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the dataframe's row index out of the written file
df_mice.to_csv(output_path, index=False)

print(f"File successfully saved to: {output_path}")

File successfully saved to: ../data/processed/uc_diagnostic_tests_mice.csv
