### Import the required libraries and classes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
# Load the toy Titanic dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('titanic_toy.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [None]:
# Percentage of missing values per column.
df.isna().mean() * 100

### Input and Target Features

In [None]:
# Separate the label column from the input features.
target_col = 'Survived'
y = df[target_col]
X = df.drop(columns=[target_col])

### Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### Adding two columns each for Age and Fare, imputed with arbitrary values (99 and -1 for Age, 999 and -1 for Fare)

In [None]:
# Show the observed range of each column so the arbitrary fill values can be
# compared against it.
for col in ("Age", "Fare"):
    print(f"Minimum {col} Value:- ", X_train[col].min())
    print(f"Maximum {col} Value:- ", X_train[col].max())

In [None]:
# Add the arbitrary-value-imputed variants next to the original columns.
# `assign` returns a new frame instead of writing into the object handed back by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps this
# cell safe to re-run.
X_train = X_train.assign(
    Age_99=X_train['Age'].fillna(99),
    Age_minus1=X_train['Age'].fillna(-1),
    Fare_999=X_train['Fare'].fillna(999),
    Fare_minus1=X_train['Fare'].fillna(-1),
)

### With this technique, the variance of the variable is certain to change significantly.
#### The main purpose of this technique is to help the model differentiate between rows where the value is known and rows where it is missing.

In [None]:
# Compare the variance of each original column with its imputed variants.
variance_report = [
    ('Original Age variable variance: ', 'Age'),
    ('Age Variance after 99 imputation: ', 'Age_99'),
    ('Age Variance after -1 imputation: ', 'Age_minus1'),
    ('Original Fare variable variance: ', 'Fare'),
    ('Fare Variance after 999 imputation: ', 'Fare_999'),
    ('Fare Variance after -1 imputation: ', 'Fare_minus1'),
]
for label, column in variance_report:
    print(label, X_train[column].var())

### Kernel Density Estimation plots

In [None]:
# Overlay the density of the original Age column with both imputed variants.
fig, ax = plt.subplots(figsize=(12, 6))

# original variable distribution
X_train['Age'].plot(kind='kde', ax=ax)

# variable imputed with the arbitrary value 99
X_train['Age_99'].plot(kind='kde', ax=ax, color='red')

# variable imputed with the arbitrary value -1
X_train['Age_minus1'].plot(kind='kde', ax=ax, color='green')

# label the figure so it can be read on its own; the legend uses the column names
ax.set_title('Age: original vs. arbitrary-value imputation')
ax.set_xlabel('Age')
ax.legend(loc='best');


# For Age_99 and Age_minus1, the distribution changes significantly, with additional peaks at 99 and -1 respectively.
# This is because around 20% of the total values are replaced with 99 and -1.

In [None]:
# Overlay the density of the original Fare column with both imputed variants.
fig, ax = plt.subplots(figsize=(12, 6))

# original variable distribution
X_train['Fare'].plot(kind='kde', ax=ax)

# variable imputed with the arbitrary value 999
X_train['Fare_999'].plot(kind='kde', ax=ax, color='red')

# variable imputed with the arbitrary value -1
X_train['Fare_minus1'].plot(kind='kde', ax=ax, color='green')

# label the figure so it can be read on its own; the legend uses the column names
ax.set_title('Fare: original vs. arbitrary-value imputation')
ax.set_xlabel('Fare')
ax.legend(loc='best');

In [None]:
# Pairwise covariance of the X_train columns, to compare the original and imputed variables.
X_train.cov()

In [None]:
# Pairwise correlation of the X_train columns, to compare the original and imputed variables.
X_train.corr()

## Using Sklearn

In [None]:
# Re-create the split from X so X_train holds only the raw feature columns again
# (the manual section added imputed helper columns to it).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### Define two SimpleImputer instances with the same imputation strategy but different fill_value.

In [None]:
# Constant-fill imputers: 99 is used for Age and 999 for Fare (wired up in the ColumnTransformer).
imputer1 = SimpleImputer(
    strategy='constant',
    fill_value=99,
)
imputer2 = SimpleImputer(
    strategy='constant',
    fill_value=999,
)

### Define the transformer that applies the specified imputer to each of the two fields.

In [None]:
# Route each column to its own imputer; columns not listed are passed through unchanged.
imputation_steps = [
    ('imputer1', imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare']),
]
trf = ColumnTransformer(transformers=imputation_steps, remainder='passthrough')

In [None]:
# Fit the imputers on the training split only.
trf.fit(X_train)

In [None]:
# Fill value learned for Age; with strategy='constant' this is expected to be the configured fill_value (99).
trf.named_transformers_['imputer1'].statistics_

In [None]:
# Fill value learned for Fare; with strategy='constant' this is expected to be the configured fill_value (999).
trf.named_transformers_['imputer2'].statistics_

In [None]:
# Apply the fitted transformer to both splits. The names are rebound to the
# transformer output, so this cell is meant to be run once after the split cell.
X_train, X_test = trf.transform(X_train), trf.transform(X_test)

In [None]:
# Inspect the transformed training data (X_train now holds the ColumnTransformer output, not the original frame).
X_train

In [None]:
# ColumnTransformer emits the columns of its listed transformers first (Age, Fare)
# and the passthrough remainder afterwards, so build the labels in that order
# rather than slicing df.columns by position, which only matches when the CSV
# happens to start with exactly these columns.
imputed_cols = ['Age', 'Fare']
passthrough_cols = [col for col in X.columns if col not in imputed_cols]
transformed_df = pd.DataFrame(X_train, columns=imputed_cols + passthrough_cols)

In [None]:
# Preview the imputed training data with its column labels restored.
transformed_df.head()

In [None]:
# Percentage of missing values per column after imputation.
transformed_df.isna().mean() * 100