In [2]:
import seaborn as sns
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
# Pull seaborn's Titanic sample dataset and preview its first five rows.
titanic = sns.load_dataset("titanic")
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Count the missing entries in every column of the Titanic frame.
print(titanic.isna().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [5]:
# Keep the four columns used for imputation and one-hot encode the categorical
# one in a single step; with drop_first=True only `sex_male` remains for `sex`
# (see the preview rendered by this cell).
data = pd.get_dummies(
    titanic[["age", "fare", "pclass", "sex"]],
    drop_first=True,
)
data.head()

Unnamed: 0,age,fare,pclass,sex_male
0,22.0,7.25,3,True
1,38.0,71.2833,1,False
2,26.0,7.925,3,False
3,35.0,53.1,1,False
4,35.0,8.05,3,True


In [6]:
# Iterative (MICE-style) imputer with a fixed seed.
imputer = IterativeImputer(
    max_iter=10,
    random_state=0,
)


In [7]:
# Fit the imputer on `data` and fill its missing values in one pass.
# The transformed result is wrapped back into a DataFrame, so both the column
# labels AND the row index of `data` are passed along. Without `index=` the
# new frame would get a fresh 0..n-1 index and would no longer line up
# row-for-row with `data` if `data` were ever filtered or re-indexed.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [8]:
# Before/after check: the first 30 rows of the original and the imputed frame.
print("Datos originales:", data.head(30), sep="\n")
print("\nDatos imputados:", data_imputed.head(30), sep="\n")

Datos originales:
     age      fare  pclass  sex_male
0   22.0    7.2500       3      True
1   38.0   71.2833       1     False
2   26.0    7.9250       3     False
3   35.0   53.1000       1     False
4   35.0    8.0500       3      True
5    NaN    8.4583       3      True
6   54.0   51.8625       1      True
7    2.0   21.0750       3      True
8   27.0   11.1333       3     False
9   14.0   30.0708       2     False
10   4.0   16.7000       3     False
11  58.0   26.5500       1     False
12  20.0    8.0500       3      True
13  39.0   31.2750       3      True
14  14.0    7.8542       3     False
15  55.0   16.0000       2     False
16   2.0   29.1250       3      True
17   NaN   13.0000       2      True
18  31.0   18.0000       3     False
19   NaN    7.2250       3     False
20  35.0   26.0000       2      True
21  34.0   13.0000       2      True
22  15.0    8.0292       3     False
23  28.0   35.5000       1      True
24   8.0   21.0750       3     False
25  38.0   31.3875  

Ahora, usar MICE en el dataset «Life Expectancy» disponible en Kaggle.

In [27]:
# Location of the Life Expectancy CSV.
# NOTE(review): absolute path, presumably a Colab session upload — adjust it
# when running the notebook elsewhere.
LIFE_CSV_PATH = '/content/Life Expectancy Data.csv'
life = pd.read_csv(LIFE_CSV_PATH)

In [28]:
# Count the missing entries in every column of the life-expectancy frame.
print(life.isna().sum())

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [29]:
# One-hot encode `Status`. get_dummies returns a new frame, so `life` itself
# is left untouched and no explicit copy is needed.
life_numeric = pd.get_dummies(life, columns=['Status'], drop_first=True)

# Columns handed to the imputer: every column that the null count reported as
# having missing values, plus the `Status_Developing` dummy created above.
# Left out on purpose:
#   - `Country` and `Year` (identifier / time-series columns);
#   - the columns with no missing values: 'infant deaths',
#     'percentage expenditure', 'Measles', 'under-five deaths', 'HIV/AIDS'.
# The stray spaces inside some labels are intentional: they must match the
# column labels of the loaded frame character for character.
columns_for_imputation = [
    'Life expectancy ',
    'Adult Mortality',
    'Alcohol',
    'Hepatitis B',
    ' BMI ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
    'Status_Developing',
]
data_for_mice = life_numeric[columns_for_imputation]

# Preview the selection and its per-column missing counts before imputing.
print("Selected columns for MICE imputation:", data_for_mice.head(), sep="\n")
print(
    "\nMissing values in data_for_mice before imputation:",
    data_for_mice.isna().sum(),
    sep="\n",
)

Selected columns for MICE imputation:
   Life expectancy   Adult Mortality  Alcohol  Hepatitis B   BMI   Polio  \
0              65.0            263.0     0.01         65.0   19.1    6.0   
1              59.9            271.0     0.01         62.0   18.6   58.0   
2              59.9            268.0     0.01         64.0   18.1   62.0   
3              59.5            272.0     0.01         67.0   17.6   67.0   
4              59.2            275.0     0.01         68.0   17.2   68.0   

   Total expenditure  Diphtheria          GDP  Population  \
0               8.16         65.0  584.259210  33736494.0   
1               8.18         62.0  612.696514    327582.0   
2               8.13         64.0  631.744976  31731688.0   
3               8.52         67.0  669.959000   3696958.0   
4               7.87         68.0   63.537231   2978599.0   

    thinness  1-19 years   thinness 5-9 years  \
0                   17.2                 17.3   
1                   17.5                

In [30]:
# Fresh imputer for the life-expectancy data.
# NOTE: this rebinds `imputer`, replacing the instance fitted on the Titanic
# frame earlier in the notebook.
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit and impute in one pass, then wrap the result back into a DataFrame.
# Both the column labels and the row index of `data_for_mice` are carried
# over; without `index=` the imputed frame would get a fresh 0..n-1 index and
# could misalign with the source frame if that frame were ever filtered.
data_for_mice_imputed = pd.DataFrame(
    imputer.fit_transform(data_for_mice),
    columns=data_for_mice.columns,
    index=data_for_mice.index,
)

# Sanity check: report the remaining missing values per column.
print("Missing values in data_for_mice after imputation:")
print(data_for_mice_imputed.isnull().sum())

Missing values in data_for_mice after imputation:
Life expectancy                    0
Adult Mortality                    0
Alcohol                            0
Hepatitis B                        0
 BMI                               0
Polio                              0
Total expenditure                  0
Diphtheria                         0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
Status_Developing                  0
dtype: int64


In [31]:
# Before/after check: the first 10 rows of the original and the imputed frame.
print(
    "Original data_for_mice (first 10 rows with potential NaN values):",
    data_for_mice.head(10),
    sep="\n",
)
print(
    "\nImputed data_for_mice_imputed (first 10 rows):",
    data_for_mice_imputed.head(10),
    sep="\n",
)

Original data_for_mice (first 10 rows with potential NaN values):
   Life expectancy   Adult Mortality  Alcohol  Hepatitis B   BMI   Polio  \
0              65.0            263.0     0.01         65.0   19.1    6.0   
1              59.9            271.0     0.01         62.0   18.6   58.0   
2              59.9            268.0     0.01         64.0   18.1   62.0   
3              59.5            272.0     0.01         67.0   17.6   67.0   
4              59.2            275.0     0.01         68.0   17.2   68.0   
5              58.8            279.0     0.01         66.0   16.7   66.0   
6              58.6            281.0     0.01         63.0   16.2   63.0   
7              58.1            287.0     0.03         64.0   15.7   64.0   
8              57.5            295.0     0.02         63.0   15.2   63.0   
9              57.3            295.0     0.03         64.0   14.7   58.0   

   Total expenditure  Diphtheria          GDP  Population  \
0               8.16         65.0  5

Posteriormente, usar MICE en el dataset «planets» de seaborn.

In [32]:
# NOTE(review): the `sn` alias bound here is not used by any visible cell; the
# call below relies on the `sns` name imported in the notebook's first cell.
# Consider dropping this import once it is confirmed that nothing uses `sn`.
import seaborn as sn

# Load seaborn's planets sample dataset and count missing entries per column.
planets = sns.load_dataset("planets")
print(planets.isna().sum())

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64


In [33]:
# One-hot encode `method`. get_dummies returns a new frame, so `planets`
# itself is left untouched and no explicit copy is needed.
planets_numeric = pd.get_dummies(planets, columns=['method'], drop_first=True)

# Columns handed to the imputer: the three columns with missing values plus
# every column whose label contains 'method_' (the dummies created above).
method_dummy_columns = list(planets_numeric.filter(like='method_').columns)
columns_for_imputation_planets = ['orbital_period', 'mass', 'distance'] + method_dummy_columns

data_for_mice_planets = planets_numeric[columns_for_imputation_planets]

# Preview the selection and its per-column missing counts before imputing.
print(
    "Selected columns for MICE imputation (Planets dataset):",
    data_for_mice_planets.head(),
    sep="\n",
)
print(
    "\nMissing values in data_for_mice_planets before imputation:",
    data_for_mice_planets.isna().sum(),
    sep="\n",
)

Selected columns for MICE imputation (Planets dataset):
   orbital_period   mass  distance  method_Eclipse Timing Variations  \
0         269.300   7.10     77.40                             False   
1         874.774   2.21     56.95                             False   
2         763.000   2.60     19.84                             False   
3         326.030  19.40    110.62                             False   
4         516.220  10.50    119.47                             False   

   method_Imaging  method_Microlensing  method_Orbital Brightness Modulation  \
0           False                False                                 False   
1           False                False                                 False   
2           False                False                                 False   
3           False                False                                 False   
4           False                False                                 False   

   method_Pulsar Timing  metho

In [34]:
# Dedicated imputer for the planets data (fixed seed).
imputer_planets = IterativeImputer(max_iter=10, random_state=0)

# Fit and impute in one pass, then wrap the result back into a DataFrame.
# Both the column labels and the row index of `data_for_mice_planets` are
# carried over; without `index=` the imputed frame would get a fresh 0..n-1
# index and could misalign with the source frame if it were ever filtered.
data_for_mice_planets_imputed = pd.DataFrame(
    imputer_planets.fit_transform(data_for_mice_planets),
    columns=data_for_mice_planets.columns,
    index=data_for_mice_planets.index,
)

# Sanity check: report the remaining missing values per column.
print("Missing values in data_for_mice_planets after imputation:")
print(data_for_mice_planets_imputed.isnull().sum())

Missing values in data_for_mice_planets after imputation:
orbital_period                          0
mass                                    0
distance                                0
method_Eclipse Timing Variations        0
method_Imaging                          0
method_Microlensing                     0
method_Orbital Brightness Modulation    0
method_Pulsar Timing                    0
method_Pulsation Timing Variations      0
method_Radial Velocity                  0
method_Transit                          0
method_Transit Timing Variations        0
dtype: int64




In [35]:
# Before/after check: the first 10 rows of the original and the imputed frame.
print(
    "Original data_for_mice_planets (first 10 rows with potential NaN values):",
    data_for_mice_planets.head(10),
    sep="\n",
)
print(
    "\nImputed data_for_mice_planets_imputed (first 10 rows):",
    data_for_mice_planets_imputed.head(10),
    sep="\n",
)

Original data_for_mice_planets (first 10 rows with potential NaN values):
   orbital_period   mass  distance  method_Eclipse Timing Variations  \
0         269.300   7.10     77.40                             False   
1         874.774   2.21     56.95                             False   
2         763.000   2.60     19.84                             False   
3         326.030  19.40    110.62                             False   
4         516.220  10.50    119.47                             False   
5         185.840   4.80     76.39                             False   
6        1773.400   4.64     18.15                             False   
7         798.500    NaN     21.41                             False   
8         993.300  10.30     73.10                             False   
9         452.800   1.99     74.79                             False   

   method_Imaging  method_Microlensing  method_Orbital Brightness Modulation  \
0           False                False               