# **Missing Indicator**

In [143]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings('ignore')

In [144]:
df = pd.read_csv('train.csv',usecols=['Age','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


## **Type 1 - Using `MissingIndicator()`**

In [145]:
x = df.drop(columns=['Survived'])
y = df[['Survived']]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [146]:
si = SimpleImputer()


x_train_trf = pd.DataFrame(si.fit_transform(x_train), columns=x.columns)
x_test_trf = pd.DataFrame(si.transform(x_test), columns=x.columns)


In [147]:
reg = LogisticRegression()

reg.fit(x_train_trf, y_train)
y_test_pred = reg.predict(x_test_trf)

print(f"{np.round(accuracy_score(y_test, y_test_pred),2)*100} %")


61.0 %


In [148]:
# printing the missing data

mi = MissingIndicator()
x_train_missing = pd.DataFrame(mi.fit_transform(x_train), columns=['missing x_train'])
x_test_missing = pd.DataFrame(mi.transform(x_test), columns=['missing x_test'])
print(x_train_missing.sum())   # will print the total numbers of missing data
print(x_test_missing.sum())   # will print the total numbers of missing data
x_train_missing.sample(5)       # will print all the missing dat


missing x_train    148
dtype: int64
missing x_test    29
dtype: int64


Unnamed: 0,missing x_train
208,False
452,False
64,False
410,False
653,False


In [149]:
x_train['Age_NaN'] = x_train_missing
x_test['Age_NaN'] = x_test_missing          # REMEMBER, while naming the 'Age_NaN' it must be same in both x_train and x_test, or else it will throw error while making prediction
x_train.sample(5)     # the cells where values are not missing will be written as 'False', and 'True' for missing values


Unnamed: 0,Age,Fare,Age_NaN
708,22.0,151.55,True
285,33.0,8.6625,False
168,,25.925,False
705,39.0,26.0,True
76,,7.8958,True


What we did in above is very imp, coz as you put a new feature which represents where the value is missing and where the value is not missing, your accuracy for the prediction increases by a lot. 

This is a way where the machine gets to know where the value is missing and wehere it is not.

In [150]:
x_test.sample(5)

Unnamed: 0,Age,Fare,Age_NaN
836,21.0,8.6625,
870,26.0,7.8958,
280,65.0,7.75,
728,25.0,26.0,
731,11.0,18.7875,


In [151]:
si = SimpleImputer()

x_train_trf2 = si.fit_transform(x_train)
x_test_trf2 = si.transform(x_test)

reg = LogisticRegression()
reg.fit(x_train_trf2, y_train)
y_test_pred2 = reg.predict(x_test_trf2)

print(f"{np.round(accuracy_score(y_test, y_test_pred2),2)*100} %")
# as you can see our output has increased by 2 %


62.0 %


As you can see, it is quite hectic to do all these always, hence to avoid these there is a inbuilt parameter in SimpleImputer `add_indicator = True`.

By doing these you dont need to make another column specifically. 
Example code is down below...

## **Type 2 - using `SimpleImputer(add_indicator=True)`**

In [163]:
x = df.drop(columns=['Survived'])
y = df[['Survived']]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

si = SimpleImputer(add_indicator=True)

cols = [['Age', 'Fare', 'Age_imputed']]

x_train_trf = pd.DataFrame(si.fit_transform(x_train), columns=cols)
x_test_trf = pd.DataFrame(si.transform(x_test), columns=cols)


reg.fit(x_train_trf, y_train)
y_test_pred = reg.predict(x_test_trf)

print(f"ACCURACY : {np.round(accuracy_score(y_test, y_test_pred),2)*100} %")

# as you can see the accuracy is 63 % now



ACCURACY : 63.0 %


In [189]:
x_train_trf.sample(6)       # 1 = data missing, 0 = data not missing

Unnamed: 0,Age,Fare,Age_imputed
396,21.0,7.65,0.0
643,60.0,75.25,0.0
325,29.785904,8.4583,1.0
509,21.0,7.925,0.0
635,29.785904,14.5,1.0
331,14.0,7.8542,0.0
