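This notebook shows how to impute missing values, starting with scikit-learn's SimpleImputer and then moving on to more advanced imputation methods.
With SimpleImputer, continuous features are imputed with the sample mean, while categorical features are imputed with the sample mode (the most frequent value).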

Import the packages

In [52]:
# !pip install numpy
# !pip install scikit-learn
import numpy as np
from sklearn.impute import SimpleImputer

Impute continuous features

In [9]:
# Toy numeric datasets; np.nan marks the entries that need to be filled in.
train_data = [
    [7, 2, 3],
    [4, np.nan, 6],
    [10, 5, 9],
]
test_data = [
    [np.nan, 2, 3],
    [4, np.nan, 6],
    [10, np.nan, 9],
]

# Mean imputer, fitted on the training rows only.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(train_data)

# Print the imputed training rows, a separator line, then the imputed test rows.
# The test rows are filled using the imputer that was fitted on train_data.
print(
    imp_mean.transform(train_data),
    '===============================================',
    imp_mean.transform(test_data),
    sep='\n',
)

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   5.   9. ]]
[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]


Categorical imputation using the most frequent value (mode)

In [10]:
# Toy categorical datasets; np.nan marks the entries that need to be filled in.
train_data = [
    ['yes', 'one', np.nan],
    [np.nan, np.nan, 'ok'],
    ['yes', 'one', 'not ok'],
    ['no', 'two', 'not ok'],
]
test_data = [[np.nan, np.nan, np.nan]]

# Most-frequent-value (mode) imputer, fitted on the training rows only.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan).fit(train_data)

# Print the imputed training rows, a separator line, then the imputed test row.
# The all-missing test row is filled using the imputer fitted on train_data.
print(
    imp_mode.transform(train_data),
    '===============================================',
    imp_mode.transform(test_data),
    sep='\n',
)

[['yes' 'one' 'not ok']
 ['yes' 'one' 'ok']
 ['yes' 'one' 'not ok']
 ['no' 'two' 'not ok']]
[['yes' 'one' 'not ok']]


Missing Indicator Method


In [31]:
from sklearn.impute import MissingIndicator

# Numeric toy datasets for the remaining sections; np.nan marks missing entries.
train_data = [
    [7, 50, 3],
    [4, np.nan, 6],
    [10, 5, 9],
]
test_data = [
    [np.nan, 2, 3],
    [4, np.nan, 6],
    [10, np.nan, 9],
]

# features='all' asks for a flag for every input column, not only the columns
# that contained missing values in the training data.
indicator = MissingIndicator(features='all').fit(train_data)

# Boolean mask for the training rows (True marks a missing entry).
indicator.transform(train_data)

array([[False, False, False],
       [False,  True, False],
       [False, False, False]])

In [32]:
# Apply the already-fitted indicator to the test rows (True marks a missing entry).
indicator.transform(test_data)

array([[ True, False, False],
       [False,  True, False],
       [False,  True, False]])

Preprocess the data for the more complex methods

In [44]:
from sklearn.preprocessing import StandardScaler


# Standardize the training data with a scaler fitted on that same data.
# Note: train_data is rebound to the scaled array, so every later cell that
# reads train_data works on the standardized values, not the raw ones.
scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)

KNN Imputation


In [45]:
from sklearn.impute import KNNImputer

# KNN imputer restricted to the single nearest neighbour, fitted on the
# (standardized) training data.
imputer = KNNImputer(n_neighbors=1).fit(train_data)

# Display the imputed training data; the result is not stored, so train_data
# itself still contains its missing entries afterwards.
imputer.transform(train_data)

array([[ 0.        , -1.        , -1.22474487],
       [-1.22474487, -1.        ,  0.        ],
       [ 1.22474487,  1.        ,  1.22474487]])

MissForest

In [46]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from impute
from sklearn.impute import IterativeImputer
# Import RandomForest
from sklearn.ensemble import RandomForestRegressor

In [47]:
# MissForest-style imputation: IterativeImputer with a random forest as the
# per-feature regressor.
#
# The forest is seeded explicitly. IterativeImputer's own random_state is not
# guaranteed to be forwarded to the wrapped estimator in every scikit-learn
# version, and an unseeded forest would make the imputed values change from
# run to run.
#
# NOTE: this rebinds `imp_mean`, which earlier in the notebook referred to the
# SimpleImputer(strategy='mean') instance.
imp_mean = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=50, random_state=0),
    random_state=0,
)
imp_mean.fit(train_data)

# Display the imputed training data (the result is not stored).
imp_mean.transform(train_data)

array([[ 0.        , -1.        , -1.22474487],
       [-1.22474487, -0.48      ,  0.        ],
       [ 1.22474487,  1.        ,  1.22474487]])

DAE

In [51]:
from new_dae import DAE
import pandas as pd

# Wrap the training data in a DataFrame; its column labels are handed to the
# imputer as the feature names.
train_data = pd.DataFrame(train_data)

# DAE comes from the project-local `new_dae` module. An empty `parameters`
# dict presumably selects its default settings, and `vmaps` is passed empty;
# TODO confirm both against new_dae.
dae_imputor = DAE(
    parameters={},
    names=list(train_data.columns),
    vmaps={},
)
dae_imputor.fit(train_data)

# transform() returns three values; only the imputed data is displayed.
# `new_names` and `vmaps` are (re)bound at notebook level as a side effect.
imputed_data, new_names, vmaps = dae_imputor.transform(train_data)
imputed_data

[[0.5, 0.0, 0.0], [0.0, 0.32137125730514526, 0.5], [1.0, 1.0, 1.0]]

GAIN

GAIN needs data normalized to the range 0-1.

In [53]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the training data with a scaler fitted on that same data.
# This rebinds both `scaler` (previously the StandardScaler) and `train_data`
# (now a DataFrame holding the rescaled values).
scaler = MinMaxScaler().fit(train_data)
train_data = pd.DataFrame(scaler.transform(train_data))

In [55]:
from gain import Gain


# Gain comes from the project-local `gain` module. It is configured here with
# 100 iterations, the DataFrame's column labels as feature names, and an empty
# `vmaps`; TODO confirm the meaning of `vmaps` against gain.
gain_imputor = Gain(
    parameters={'iterations': 100},
    names=list(train_data.columns),
    vmaps={},
)
gain_imputor.fit(train_data)

# transform() returns three values; only the imputed data is displayed.
# `new_names` and `vmaps` are (re)bound at notebook level as a side effect.
imputed_data, new_names, vmaps = gain_imputor.transform(train_data)
imputed_data

 20%|██        | 20/100 [00:00<00:00, 192.22it/s]

Iter: 0	Train_loss: 0.4779	Test_loss: 0.5


100%|██████████| 100/100 [00:00<00:00, 191.56it/s]


[[0.5, 0.0, 0.0], [0.0, 0.4761406091827655, 0.5], [1.0, 1.0, 1.0]]