In [195]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [196]:
# Read only the columns this experiment needs: the target ('Survived')
# and the two numeric features ('Age', 'Fare').
df = pd.read_csv(
    'titanic.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [197]:
# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [198]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [199]:
# Learn the imputation statistics from the training split only, then apply
# the same fitted imputer to both splits so the test rows never influence
# the fill values.
imputer = SimpleImputer()
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [200]:
# Baseline: logistic regression trained on the imputed features alone
# (no missing-value indicator column).
model = LogisticRegression().fit(X_train_imputed, y_train)

# Score the baseline on the held-out split.
pred = model.predict(X_test_imputed)
score = accuracy_score(y_true=y_test, y_pred=pred)
print(f"Accuracy Score: {score}")

Accuracy Score: 0.6480446927374302


#### Now using `MissingIndicator` and computing the score

In [202]:
# Display the raw training features to see where values are missing
# before building the indicator-based pipeline.
X_train

Unnamed: 0,Age,Fare
331,45.5,28.5000
733,23.0,13.0000
382,32.0,7.9250
704,26.0,7.8542
813,6.0,31.2750
...,...,...
106,21.0,7.6500
270,,31.0000
860,41.0,14.1083
435,14.0,120.0000


In [203]:
# Unfitted building blocks for the preprocessing step. They are handed to the
# ColumnTransformer below instead of being left unused next to a second pair
# of freshly constructed instances.
mi = MissingIndicator()
si = SimpleImputer()

# 'Age' is routed through two transformers:
#   * 'missingIndicator' emits a flag marking which rows had a missing Age
#   * 'simpleImputer'    fills the missing Age values
# The column is selected by name rather than by position 0, so the step keeps
# targeting Age even if the column order of the input frame changes.
# All other columns are passed through unchanged.
trans1 = ColumnTransformer(transformers=[
    ('missingIndicator', mi, ['Age']),
    ('simpleImputer', si, ['Age'])
], remainder='passthrough')

# Final step of the pipeline: the classifier (an estimator, despite the
# `trans2` name, which is kept so existing references still work).
trans2 = LogisticRegression()

# Preprocessing followed by the classifier, fitted and applied as one unit.
pipeline = Pipeline([
    ('trans1', trans1),
    ('trans2', trans2)
])

In [246]:
# Fit the column transformer by itself so its fitted sub-transformers
# can be inspected.
trans1.fit(X_train)

# Look up the fitted MissingIndicator by its step name and run it over the
# Age column of the full dataset to obtain the flags it produces.
missing_indicator_values = (
    trans1
    .named_transformers_['missingIndicator']
    .transform(df[['Age']])
)




In [204]:
# Fit the preprocessing step and the classifier together on the training split.
# As the last expression in the cell, the fitted pipeline is also displayed.
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the indicator-augmented pipeline on the held-out split.
# The raw X_test is passed in; the pipeline applies its own preprocessing.
preds = pipeline.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=preds)
print(f"Accuracy Score: {score}")

As we can see here, there has been a slight degradation in the model's performance, which is the opposite of what we expected.

#### We can also enable the missing indicator directly in `SimpleImputer` with `add_indicator=True`

In [None]:
# Same idea as the pipeline above, but the imputer is asked to append the
# missing-value indicator itself via add_indicator=True.
si = SimpleImputer(add_indicator=True)
lr = LogisticRegression()

# Fit the imputer on the training split only, then transform both splits
# with the fitted imputer.
si.fit(X_train)
X_train_transf = si.transform(X_train)
X_test_transf = si.transform(X_test)

# Train and score the classifier on the transformed features.
lr.fit(X_train_transf, y_train)
preds2 = lr.predict(X_test_transf)
score1 = accuracy_score(y_true=y_test, y_pred=preds2)
print(f"Accuracy Score: {score1}")

As we can see, the accuracy score is the same in both cases: the explicit `MissingIndicator` pipeline and `SimpleImputer(add_indicator=True)`.