# Multiple imputation by chained equations (MICE)

In [2]:
# Load libraries
import seaborn as sns
import pandas as pd
# IterativeImputer is exposed through scikit-learn's experimental namespace:
# this enabling import has to run before the `sklearn.impute` import below,
# even though the imported name is never referenced directly.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
# Load the Titanic example dataset through seaborn
titanic = sns.load_dataset('titanic')

In [4]:
# Show the first rows of the dataset
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Count the missing values in every column
missing_per_column = titanic.isna().sum()
print(missing_per_column)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [6]:
# Goal: impute the missing values of the 'age' column.
# Work on a small subset of columns for this example.
feature_columns = ['age', 'fare', 'pclass', 'sex']

# One-hot encode the categorical column ('sex'); drop_first=True keeps a single
# indicator column (sex_male) instead of one column per category.
data = pd.get_dummies(titanic[feature_columns], drop_first=True)

data.head(20)

Unnamed: 0,age,fare,pclass,sex_male
0,22.0,7.25,3,True
1,38.0,71.2833,1,False
2,26.0,7.925,3,False
3,35.0,53.1,1,False
4,35.0,8.05,3,True
5,,8.4583,3,True
6,54.0,51.8625,1,True
7,2.0,21.075,3,True
8,27.0,11.1333,3,False
9,14.0,30.0708,2,False


In [7]:
# Build the MICE-style imputer (IterativeImputer).
# random_state pins the seed so repeated runs give the same imputations.
imputer = IterativeImputer(
    random_state=0,
    max_iter=10,
)

In [8]:
# Impute the missing values.
# fit_transform hands back a plain array, so the DataFrame is rebuilt with the
# original column labels AND the original index. Without `index=` the result
# falls back to 0..n-1 and would silently misalign with `data` if its index
# were ever filtered or reordered.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [9]:
# Show the original data, which still contains the missing values
print("Datos originales (con valores faltantes):")
data.head(30)

Datos originales (con valores faltantes):


Unnamed: 0,age,fare,pclass,sex_male
0,22.0,7.25,3,True
1,38.0,71.2833,1,False
2,26.0,7.925,3,False
3,35.0,53.1,1,False
4,35.0,8.05,3,True
5,,8.4583,3,True
6,54.0,51.8625,1,True
7,2.0,21.075,3,True
8,27.0,11.1333,3,False
9,14.0,30.0708,2,False


In [10]:
# Show the data after MICE imputation.
# NOTE(review): only one imputer is fitted in this notebook, so a single
# completed dataset is produced; the word "múltiple" in the message below is
# loose — confirm the intended wording.
print("\nDatos después de la imputación múltiple (MICE):")
data_imputed.head(30)


Datos después de la imputación múltiple (MICE):


Unnamed: 0,age,fare,pclass,sex_male
0,22.0,7.25,3.0,1.0
1,38.0,71.2833,1.0,0.0
2,26.0,7.925,3.0,0.0
3,35.0,53.1,1.0,0.0
4,35.0,8.05,3.0,1.0
5,26.082704,8.4583,3.0,1.0
6,54.0,51.8625,1.0,1.0
7,2.0,21.075,3.0,1.0
8,27.0,11.1333,3.0,0.0
9,14.0,30.0708,2.0,0.0


# Exercise 1
Use MICE (scikit-learn's `IterativeImputer`) on the WHO Life Expectancy dataset to fill in its missing values.

Dataset: https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who

In [12]:
# Mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import os
import shutil

# Install the Kaggle API token under ~/.kaggle, where the kaggle CLI reads it.
# The directory, the copy target and the permission change all derive from the
# same path, so they cannot disagree when the home directory is not /root.
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)
shutil.copy('/content/drive/MyDrive/kaggle/kaggle.json', kaggle_token)
# Restrict the token to owner read/write only (equivalent to `chmod 600`).
os.chmod(kaggle_token, 0o600)

In [15]:
# Download the life-expectancy dataset archive with the Kaggle CLI
!kaggle datasets download -d kumarajarshi/life-expectancy-who

Dataset URL: https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who
License(s): other
Downloading life-expectancy-who.zip to /content
  0% 0.00/119k [00:00<?, ?B/s]
100% 119k/119k [00:00<00:00, 50.9MB/s]


In [16]:
import zipfile

# Unpack the downloaded archive into the 'life_expectancy_data' directory
with zipfile.ZipFile('life-expectancy-who.zip', mode='r') as archive:
    archive.extractall(path='life_expectancy_data')

In [19]:
# Read the CSV from the directory the archive was extracted into. The path is
# relative, like the extraction target, so loading keeps working when the
# working directory is not /content.
data_path = 'life_expectancy_data/Life Expectancy Data.csv'
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [22]:
# Count the missing values in every column
missing_per_column = data.isna().sum()
print(missing_per_column)

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [29]:
# Numeric columns fed to the imputation model.
# 'Life expectancy ' is written with its trailing space on purpose: the lookup
# needs the exact column label.  NOTE(review): presumably the raw CSV header
# carries that space — confirm before "fixing" it.
subset_columns = ['Life expectancy ', 'GDP', 'Adult Mortality', 'Schooling', 'Population', 'Total expenditure', 'Alcohol', 'Hepatitis B', 'Income composition of resources']
data_subset = data[subset_columns]
imputer = IterativeImputer(max_iter=10, random_state=0)
# Rebuild the DataFrame with the original column labels and the original index
# so the imputed rows stay aligned with `data_subset` (and with `data`).
data_imputed = pd.DataFrame(
    imputer.fit_transform(data_subset),
    columns=data_subset.columns,
    index=data_subset.index,
)

In [30]:
# Show the selected columns before imputation (missing values still present)
print("Original Data (with missing values):")
data_subset.head(30)

Original Data (with missing values):


Unnamed: 0,Life expectancy,GDP,Adult Mortality,Schooling,Population,Total expenditure,Alcohol,Hepatitis B,Income composition of resources
0,65.0,584.25921,263.0,10.1,33736494.0,8.16,0.01,65.0,0.479
1,59.9,612.696514,271.0,10.0,327582.0,8.18,0.01,62.0,0.476
2,59.9,631.744976,268.0,9.9,31731688.0,8.13,0.01,64.0,0.47
3,59.5,669.959,272.0,9.8,3696958.0,8.52,0.01,67.0,0.463
4,59.2,63.537231,275.0,9.5,2978599.0,7.87,0.01,68.0,0.454
5,58.8,553.32894,279.0,9.2,2883167.0,9.2,0.01,66.0,0.448
6,58.6,445.893298,281.0,8.9,284331.0,9.42,0.01,63.0,0.434
7,58.1,373.361116,287.0,8.7,2729431.0,8.33,0.03,64.0,0.433
8,57.5,369.835796,295.0,8.4,26616792.0,6.73,0.02,63.0,0.415
9,57.3,272.56377,295.0,8.1,2589345.0,7.43,0.03,64.0,0.405


In [31]:
# Show the same columns after imputation
print("\nData after MICE Imputation:")
data_imputed.head(30)


Data after MICE Imputation:


Unnamed: 0,Life expectancy,GDP,Adult Mortality,Schooling,Population,Total expenditure,Alcohol,Hepatitis B,Income composition of resources
0,65.0,584.25921,263.0,10.1,33736494.0,8.16,0.01,65.0,0.479
1,59.9,612.696514,271.0,10.0,327582.0,8.18,0.01,62.0,0.476
2,59.9,631.744976,268.0,9.9,31731688.0,8.13,0.01,64.0,0.47
3,59.5,669.959,272.0,9.8,3696958.0,8.52,0.01,67.0,0.463
4,59.2,63.537231,275.0,9.5,2978599.0,7.87,0.01,68.0,0.454
5,58.8,553.32894,279.0,9.2,2883167.0,9.2,0.01,66.0,0.448
6,58.6,445.893298,281.0,8.9,284331.0,9.42,0.01,63.0,0.434
7,58.1,373.361116,287.0,8.7,2729431.0,8.33,0.03,64.0,0.433
8,57.5,369.835796,295.0,8.4,26616792.0,6.73,0.02,63.0,0.415
9,57.3,272.56377,295.0,8.1,2589345.0,7.43,0.03,64.0,0.405


In [33]:
# Confirm that no missing values remain after imputation
remaining_missing = data_imputed.isna().sum()
print(remaining_missing)

Life expectancy                    0
GDP                                0
Adult Mortality                    0
Schooling                          0
Population                         0
Total expenditure                  0
Alcohol                            0
Hepatitis B                        0
Income composition of resources    0
dtype: int64


# Exercise 2
Use MICE on the Planets dataset (loaded with `sns.load_dataset('planets')`) to fill in its missing values.

In [35]:
# Load the Planets example dataset through seaborn and preview the first rows
planets = sns.load_dataset('planets')
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [36]:
# Count the missing values in every column
missing_per_column = planets.isna().sum()
print(missing_per_column)

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64


In [38]:
# Impute the three columns that contain missing values. All of them are
# numeric, so no one-hot encoding (pd.get_dummies) is needed before imputing.
data_subset = planets[['orbital_period', 'mass', 'distance']]
imputer = IterativeImputer(max_iter=10, random_state=0)
# Rebuild the DataFrame with the original column labels and the original index
# so the imputed rows stay aligned with `planets`.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data_subset),
    columns=data_subset.columns,
    index=data_subset.index,
)

In [39]:
# Show the selected columns before imputation (missing values still present)
print("Original Data (with missing values):")
data_subset.head(30)

Original Data (with missing values):


Unnamed: 0,orbital_period,mass,distance
0,269.3,7.1,77.4
1,874.774,2.21,56.95
2,763.0,2.6,19.84
3,326.03,19.4,110.62
4,516.22,10.5,119.47
5,185.84,4.8,76.39
6,1773.4,4.64,18.15
7,798.5,,21.41
8,993.3,10.3,73.1
9,452.8,1.99,74.79


In [40]:
# Show the same columns after imputation
print("\nData after MICE Imputation:")
data_imputed.head(30)


Data after MICE Imputation:


Unnamed: 0,orbital_period,mass,distance
0,269.3,7.1,77.4
1,874.774,2.21,56.95
2,763.0,2.6,19.84
3,326.03,19.4,110.62
4,516.22,10.5,119.47
5,185.84,4.8,76.39
6,1773.4,4.64,18.15
7,798.5,1.83916,21.41
8,993.3,10.3,73.1
9,452.8,1.99,74.79


In [41]:
# Confirm that no missing values remain after imputation
remaining_missing = data_imputed.isna().sum()
print(remaining_missing)

orbital_period    0
mass              0
distance          0
dtype: int64
