In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# for the Q-Q plots
import scipy.stats as stats
%matplotlib inline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [6]:
# Build a small sample dataset (9 records) in which every column contains
# missing values, represented as np.nan.

raw_data = dict(
    first_name=['John', np.nan, 'Tommy', 'Jan', 'Zai', np.nan, np.nan, 'Howard', np.nan],
    last_name=['Melvin', 'Albertson', np.nan, 'Lawson', np.nan, 'Collins', 'Philbert', np.nan, np.nan],
    age=[42, np.nan, 36, 24, 73, 37, np.nan, 16, 90],
    sex=['m', np.nan, 'f', 'm', 'f', np.nan, 'f', np.nan, 'm'],
    hashealthinsurance=['y', np.nan, 'n', 'y', 'y', np.nan, 'n', 'n', np.nan],
    preTestScore=[4, np.nan, np.nan, 2, 3, np.nan, 1, 3, 2],
    postTestScore=[25, np.nan, np.nan, 62, 70, 24, np.nan, np.nan, 56],
)
raw_data

{'first_name': ['John', nan, 'Tommy', 'Jan', 'Zai', nan, nan, 'Howard', nan],
 'last_name': ['Melvin',
  'Albertson',
  nan,
  'Lawson',
  nan,
  'Collins',
  'Philbert',
  nan,
  nan],
 'age': [42, nan, 36, 24, 73, 37, nan, 16, 90],
 'sex': ['m', nan, 'f', 'm', 'f', nan, 'f', nan, 'm'],
 'hashealthinsurance': ['y', nan, 'n', 'y', 'y', nan, 'n', 'n', nan],
 'preTestScore': [4, nan, nan, 2, 3, nan, 1, 3, 2],
 'postTestScore': [25, nan, nan, 62, 70, 24, nan, nan, 56]}

In [7]:
# Load the sample records into a DataFrame, stating the column order explicitly.
df = pd.DataFrame(
    data=raw_data,
    columns=[
        'first_name',
        'last_name',
        'age',
        'sex',
        'hashealthinsurance',
        'preTestScore',
        'postTestScore',
    ],
)
df.head()

Unnamed: 0,first_name,last_name,age,sex,hashealthinsurance,preTestScore,postTestScore
0,John,Melvin,42.0,m,y,4.0,25.0
1,,Albertson,,,,,
2,Tommy,,36.0,f,n,,
3,Jan,Lawson,24.0,m,y,2.0,62.0
4,Zai,,73.0,f,y,3.0,70.0


## Median Imputation Pipeline 

In [8]:
# All imputation below is done with feature_engine because it keeps the data as a
# pandas DataFrame. scikit-learn transformers return NumPy arrays, which would
# have to be converted back to a DataFrame after every step.

# Numeric columns whose NaN values are replaced with the column median.
median_columns = ['age', 'preTestScore', 'postTestScore']

# The imputers live in `feature_engine.missing_data_imputers` in pre-1.0 releases
# and in `feature_engine.imputation` from 1.0 onwards. The original path is tried
# first so behaviour is unchanged wherever this cell already worked.
try:
    from feature_engine.missing_data_imputers import MeanMedianImputer
except ImportError:
    from feature_engine.imputation import MeanMedianImputer

median_imputer = MeanMedianImputer(imputation_method='median',
                                   variables=median_columns)

# Learn the per-column medians from `df`, then return a DataFrame with the
# missing entries of `median_columns` filled in.
df = median_imputer.fit_transform(df)

In [9]:
# 'age', 'preTestScore' and 'postTestScore' no longer contain NaN values.
df

Unnamed: 0,first_name,last_name,age,sex,hashealthinsurance,preTestScore,postTestScore
0,John,Melvin,42.0,m,y,4.0,25.0
1,,Albertson,37.0,,,2.5,56.0
2,Tommy,,36.0,f,n,2.5,56.0
3,Jan,Lawson,24.0,m,y,2.0,62.0
4,Zai,,73.0,f,y,3.0,70.0
5,,Collins,37.0,,,2.5,24.0
6,,Philbert,37.0,f,n,1.0,56.0
7,Howard,,16.0,,n,3.0,56.0
8,,,90.0,m,,2.0,56.0


## Mode or Frequent Category Imputation

In [10]:
# NOTE(review): this import path matches pre-1.0 feature_engine; later releases
# reorganised the package, so confirm it against the installed version.
from feature_engine.missing_data_imputers import FrequentCategoryImputer

# Categorical columns whose NaN values are replaced with the most frequent
# category (the mode). In this sample both columns are tied 3-3 ('m' vs 'f',
# 'y' vs 'n'); the result displayed below shows the gaps filled with 'f' and 'n'.
mode_freq_columns = ['sex', 'hashealthinsurance']

mode_imputer = FrequentCategoryImputer(variables=mode_freq_columns)

# Learn the modes from `df` and apply them in a single step.
df = mode_imputer.fit_transform(df)

In [11]:
df  # 'sex' and 'hashealthinsurance' no longer contain NaN values

Unnamed: 0,first_name,last_name,age,sex,hashealthinsurance,preTestScore,postTestScore
0,John,Melvin,42.0,m,y,4.0,25.0
1,,Albertson,37.0,f,n,2.5,56.0
2,Tommy,,36.0,f,n,2.5,56.0
3,Jan,Lawson,24.0,m,y,2.0,62.0
4,Zai,,73.0,f,y,3.0,70.0
5,,Collins,37.0,f,n,2.5,24.0
6,,Philbert,37.0,f,n,1.0,56.0
7,Howard,,16.0,f,n,3.0,56.0
8,,,90.0,m,n,2.0,56.0


## Arbitrary Value Imputation

In [14]:
# Replace NaN values in the name columns with the literal label 'Missing'.

# The class is `CategoricalVariableImputer` in `feature_engine.missing_data_imputers`
# (pre-1.0 releases) and `CategoricalImputer` in `feature_engine.imputation` from
# 1.0 onwards. The original path is tried first, and the fallback is bound to the
# original name so the rest of the cell reads the same either way.
try:
    from feature_engine.missing_data_imputers import CategoricalVariableImputer
except ImportError:
    from feature_engine.imputation import CategoricalImputer as CategoricalVariableImputer

cat_columns = ['first_name', 'last_name']

# With its default settings the imputer fills NaN with the string 'Missing'.
categorical_imputer = CategoricalVariableImputer(variables=cat_columns)

df = categorical_imputer.fit_transform(df)

In [15]:
# 'first_name' and 'last_name' now show 'Missing' wherever the value was NaN.
df

Unnamed: 0,first_name,last_name,age,sex,hashealthinsurance,preTestScore,postTestScore
0,John,Melvin,42.0,m,y,4.0,25.0
1,Missing,Albertson,37.0,f,n,2.5,56.0
2,Tommy,Missing,36.0,f,n,2.5,56.0
3,Jan,Lawson,24.0,m,y,2.0,62.0
4,Zai,Missing,73.0,f,y,3.0,70.0
5,Missing,Collins,37.0,f,n,2.5,24.0
6,Missing,Philbert,37.0,f,n,1.0,56.0
7,Howard,Missing,16.0,f,n,3.0,56.0
8,Missing,Missing,90.0,m,n,2.0,56.0


In [23]:
# `preprocessor` and `dfarbcat` are not defined by any earlier cell, so both are
# defined here to let the notebook run on a fresh kernel (Restart & Run All).
# Their configuration mirrors the fitted-transformer repr shown in this cell's
# output and the array shown in the next one.

# Un-imputed copy of the sample data (`df` has already had its NaN values filled).
dfarbcat = pd.DataFrame(raw_data)

# Fill missing last names with the arbitrary constant 'Smith'; all remaining
# columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'arbitrary imputer',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='Smith')),
            ]),
            ['last_name'],
        ),
    ],
    remainder='passthrough',
)

preprocessor.fit(dfarbcat)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('arbitrary imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value='Smith',
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0))],
                                          verbose=False),
                                 ['last_name'])],
                  verbose=False)

In [24]:
# Apply the fitted transformer. As the output shows, the result is a NumPy object
# array rather than a DataFrame: the imputed 'last_name' column comes first,
# followed by the passthrough columns, which still contain their NaN values.
preprocessor.transform(dfarbcat)

array([['Melvin', 'John', 42.0, 'm', 'y', 4.0, 25.0],
       ['Albertson', nan, nan, nan, nan, nan, nan],
       ['Smith', 'Tommy', 36.0, 'f', 'n', nan, nan],
       ['Lawson', 'Jan', 24.0, 'm', 'y', 2.0, 62.0],
       ['Smith', 'Zai', 73.0, 'f', 'y', 3.0, 70.0],
       ['Collins', nan, 37.0, nan, nan, nan, 24.0],
       ['Philbert', nan, nan, 'f', 'n', 1.0, nan],
       ['Smith', 'Howard', 16.0, nan, 'n', 3.0, nan],
       ['Smith', nan, 90.0, 'm', nan, 2.0, 56.0]], dtype=object)