# Missing Indicator
* With a missing indicator, we add a column to the dataset that flags which observations have a missing value for a particular feature.
* The missing values in the original feature are replaced with the mean, median, or mode, and a new column is created for that original feature: it is flagged True where the value was missing before imputation and False otherwise.

In [1]:
# Read only the columns this walkthrough uses from train.csv
import numpy as np
import pandas as pd

wanted_columns = ['Age', 'Fare', 'Survived']
df = pd.read_csv('train.csv', usecols=wanted_columns)

# Preview the first five rows
df.head(5)

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [2]:
# Separate the features from the target, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split

X = df.drop('Survived', axis=1)
Y = df.loc[:, 'Survived']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,  # fixed seed so the split is reproducible
)

## Accuracy of logistic regression before missing indicator

In [3]:
# Baseline: fill missing values with the per-column mean (SimpleImputer's default strategy)
from sklearn.impute import SimpleImputer

si = SimpleImputer(strategy='mean')
si.fit(X_train)  # imputation statistics are learned from the training split only

X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [4]:
# Fit logistic regression on the imputed features and score it on the held-out split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression().fit(X_train_trf, Y_train)
Y_pred = clf.predict(X_test_trf)

accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.6145251396648045

## Accuracy of logistic regression after missing indicator

In [6]:
# Build boolean "was missing" flags with MissingIndicator
from sklearn.impute import MissingIndicator

mi = MissingIndicator()
mi.fit(X_train)  # fitted on the training split only

X_train_missing = mi.transform(X_train)
X_test_missing = mi.transform(X_test)

In [7]:
# Attach the missing-value flag as a new boolean column named Age_NA.
# MissingIndicator returns a 2-D array (one column per flagged feature), so
# column 0 is taken explicitly to get a 1-D vector for the new column.
# NOTE(review): assumes Age is the only feature MissingIndicator flagged
# (check mi.features_) - confirm before reusing this cell on other data.
# .assign() builds new frames instead of writing into the objects returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.assign(Age_NA=X_train_missing[:, 0])
X_test = X_test.assign(Age_NA=X_test_missing[:, 0])

In [8]:
# Mean-impute again, this time on the frames that also carry the Age_NA flag
from sklearn.impute import SimpleImputer

si = SimpleImputer(strategy='mean')
si.fit(X_train)  # imputation statistics are learned from the training split only

X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [9]:
# Refit logistic regression with the Age_NA flag included and score it.
# Accuracy is higher than in the earlier run without the flag.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression().fit(X_train_trf2, Y_train)
Y_pred = clf.predict(X_test_trf2)

accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.6312849162011173

## SimpleImputer also comes with an `add_indicator=True` parameter

In [10]:
# Impute and append missing-value flags in one step with add_indicator=True.
# By this point X_train / X_test already carry the hand-made Age_NA column from
# the MissingIndicator section; feeding that in would give a second, duplicate
# Age flag. Select only the raw features so this cell shows what add_indicator
# produces by itself.
from sklearn.impute import SimpleImputer

raw_features = ['Age', 'Fare']
si = SimpleImputer(add_indicator=True)

X_train_trf3 = si.fit_transform(X_train[raw_features])
X_test_trf3 = si.transform(X_test[raw_features])

In [11]:
# Fit logistic regression on the add_indicator output and score it on the held-out split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression().fit(X_train_trf3, Y_train)
Y_pred = clf.predict(X_test_trf3)

accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.6312849162011173