# **Missing value imputation methods in Scikit-Learn library**

1. **SimpleImputer**
2. **KNNImputer**
3. **IterativeImputer**
4. **MissingIndicator** (does not impute anything; it produces boolean flags marking which values are missing)

## Simple_imputer

In [8]:
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer


# Pull the Titanic passenger table that ships with seaborn.
titanic_data = sns.load_dataset('titanic')

# Baseline: how many nulls each original column has before imputing anything.
print("Missing values in the date:\n", titanic_data.isnull().sum())

# Columns handed to the imputers; work on a copy of just those columns.
columns_with_nulls = ['age', 'fare']
data = titanic_data[columns_with_nulls].copy()

# One SimpleImputer per strategy (mean / median / most frequent value).
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on the same working copy and fill its gaps.
mean_imputed_data = mean_imputer.fit_transform(data)
median_imputed_data = median_imputer.fit_transform(data)
most_frequent_imputed_data = most_frequent_imputer.fit_transform(data)

# Store every result under '<column>_<strategy>' so the original 'age' and
# 'fare' columns stay in the frame for comparison.
imputed_by_strategy = {
    'mean': mean_imputed_data,
    'median': median_imputed_data,
    'most_frequent': most_frequent_imputed_data,
}
for strategy, imputed_values in imputed_by_strategy.items():
    new_columns = [f'{column}_{strategy}' for column in columns_with_nulls]
    titanic_data[new_columns] = imputed_values

# Visual separator between the "before" and "after" reports.
print("==============================================================\n")

# Null counts again, now including the newly added imputed columns.
print("Missing values in the imputed_data:\n", titanic_data.isnull().sum())

# Print the modified DataFrame
# print(titanic_data.head())

Missing values in the date:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Missing values in the imputed_data:
 survived                0
pclass                  0
sex                     0
age                   177
sibsp                   0
parch                   0
fare                    0
embarked                2
class                   0
who                     0
adult_male              0
deck                  688
embark_town             2
alive                   0
alone                   0
age_mean                0
fare_mean               0
age_median              0
fare_median             0
age_most_frequent       0
fare_most_frequent      0
dtype: int64


## Iterative_imputer

In [11]:
import pandas as pd
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Reload the Titanic table so this cell starts from the untouched dataset.
titanic_data = sns.load_dataset('titanic')

# Columns handed to the imputer; work on a copy of just those columns.
columns_with_nulls = ['age', 'fare']
data = titanic_data[columns_with_nulls].copy()

# IterativeImputer with its default settings, fitted and applied in one step.
iterative_imputer = IterativeImputer()
iterative_imputed_data = iterative_imputer.fit_transform(data)

# Keep the originals and add the imputed values as '<column>_iterative'.
iterative_columns = [f'{column}_iterative' for column in columns_with_nulls]
titanic_data[iterative_columns] = iterative_imputed_data

# Visual separator before the report.
print("==============================================================\n")

# Null counts per column, including the newly added imputed columns.
print("Missing values in the imputed_data:\n", titanic_data.isnull().sum())



Missing values in the imputed_data:
 survived            0
pclass              0
sex                 0
age               177
sibsp               0
parch               0
fare                0
embarked            2
class               0
who                 0
adult_male          0
deck              688
embark_town         2
alive               0
alone               0
age_iterative       0
fare_iterative      0
dtype: int64


## KNN_imputer

In [12]:
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer

# Reload the Titanic table so this cell starts from the untouched dataset.
titanic_data = sns.load_dataset('titanic')

# Columns handed to the imputer; work on a copy of just those columns.
columns_with_nulls = ['age', 'fare']
data = titanic_data[columns_with_nulls].copy()

# KNNImputer configured with 5 neighbours, fitted and applied in one step.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed_data = knn_imputer.fit_transform(data)

# Keep the originals and add the imputed values as '<column>_knn'.
knn_columns = [f'{column}_knn' for column in columns_with_nulls]
titanic_data[knn_columns] = knn_imputed_data

# Visual separator before the report.
print("==============================================================\n")

# Null counts per column, including the newly added imputed columns.
print("Missing values in the imputed_data:\n", titanic_data.isnull().sum())



Missing values in the imputed_data:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
age_knn          0
fare_knn         0
dtype: int64


## **MissingIndicator**

In [14]:
import pandas as pd
import seaborn as sns
from sklearn.impute import MissingIndicator

# Reload the Titanic table so this cell starts from the untouched dataset.
titanic_data = sns.load_dataset('titanic')

# Columns to flag; work on a copy of just those columns.
columns_with_nulls = ['age', 'fare']
data = titanic_data[columns_with_nulls].copy()

# features="all" requests one indicator column per input column, which is
# what lets the result be assigned to exactly two new columns below.
indicator = MissingIndicator(features="all")
missing_indicators = indicator.fit_transform(data)

# Attach the flags as '<column>_missing'; nothing is imputed in this cell.
flag_columns = [f'{column}_missing' for column in columns_with_nulls]
titanic_data[flag_columns] = missing_indicators

# Visual separator before the report.
print("==============================================================\n")

# Null counts per column, including the two new indicator columns.
print("Missing values in the imputed_data:\n", titanic_data.isnull().sum())

# Peek at the first rows to see the indicator columns next to the originals.
print(titanic_data.head())



Missing values in the imputed_data:
 survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
who               0
adult_male        0
deck            688
embark_town       2
alive             0
alone             0
age_missing       0
fare_missing      0
dtype: int64
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  age_missing  fare_missing  
0    man        True  NaN  Southampton    no  False        False       

## All together

In [16]:
import pandas as pd
import seaborn as sns
# IterativeImputer is experimental in scikit-learn: this enabling import must
# run before IterativeImputer can be imported from sklearn.impute. Without it
# the cell only works if an earlier cell already ran the same import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.impute import MissingIndicator

# Load the Titanic dataset
titanic_data = sns.load_dataset('titanic')

# Columns handed to every imputer below
columns_with_nulls = ['age', 'fare']

# Work on a copy of just those columns
data = titanic_data[columns_with_nulls].copy()

# Method 1: SimpleImputer with mean strategy
mean_imputer = SimpleImputer(strategy='mean')
mean_imputed_data = mean_imputer.fit_transform(data)

# Method 2: SimpleImputer with median strategy
median_imputer = SimpleImputer(strategy='median')
median_imputed_data = median_imputer.fit_transform(data)

# Method 3: SimpleImputer with most frequent strategy
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
most_frequent_imputed_data = most_frequent_imputer.fit_transform(data)

# Method 4: KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed_data = knn_imputer.fit_transform(data)

# Method 5: IterativeImputer (default settings)
iterative_imputer = IterativeImputer()
iterative_imputed_data = iterative_imputer.fit_transform(data)

# Missing value indicators: features="all" yields one flag column per input
# column, so the result always has two columns to assign below.
indicator = MissingIndicator(features="all")
missing_indicators = indicator.fit_transform(data)

# Add each method's result as new '<column>_<method>' columns. The original
# 'age' and 'fare' columns are left untouched so the methods can be compared.
titanic_data[['age_mean', 'fare_mean']] = mean_imputed_data
titanic_data[['age_median', 'fare_median']] = median_imputed_data
titanic_data[['age_most_frequent', 'fare_most_frequent']] = most_frequent_imputed_data
titanic_data[['age_knn', 'fare_knn']] = knn_imputed_data
titanic_data[['age_iterative', 'fare_iterative']] = iterative_imputed_data

# Add missing value indicators to the DataFrame
titanic_data[['age_missing', 'fare_missing']] = missing_indicators

# Print the first rows of the extended DataFrame
print(titanic_data.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  ...  age_median fare_median age_most_frequent fare_most_frequent  \
0    man  ...        22.0      7.2500              22.0             7.2500   
1  woman  ...        38.0     71.2833              38.0            71.2833   
2  woman  ...        26.0      7.9250              26.0             7.9250   
3  woman  ...        35.0     53.1000              35.0            53.1000   
4    man  ...        35.0      8.0500              35.0             8.0500   

   age_knn  fare_knn  age_iterative  fare_iterative  age_missing  fare_mis

In [17]:
# Summary statistics for the numeric columns of the frame built in the
# previous cell, so the age_* columns from each imputation method can be
# compared side by side with the original 'age'.
titanic_data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,age_mean,fare_mean,age_median,fare_median,age_most_frequent,fare_most_frequent,age_knn,fare_knn,age_iterative,fare_iterative
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,29.699118,32.204208,29.361582,32.204208,28.56697,32.204208,29.840074,32.204208,29.643268,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,13.002015,49.693429,13.019697,49.693429,13.199572,49.693429,13.348477,49.693429,13.006385,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0,0.42,0.0,0.42,0.0,0.42,0.0,0.42,0.0,0.42,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,22.0,7.9104,22.0,7.9104,22.0,7.9104,22.0,7.9104,22.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542,29.699118,14.4542,28.0,14.4542,24.0,14.4542,29.0,14.4542,29.09661,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0,35.0,31.0,35.0,31.0,35.0,31.0,36.4,31.0,35.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,80.0,512.3292,80.0,512.3292,80.0,512.3292,80.0,512.3292,80.0,512.3292
