# Missing Value Imputation using ML Methods

In [324]:
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score

In [237]:
# The horse colic dataset describes medical characteristics of horses with colic and whether they lived or died.
# There are 300 rows and 26 input variables with one output variable. It is a binary classification prediction task that involves predicting 1 if the horse lived and 2 if the horse died.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
ds = pd.read_csv(url, header=None, na_values='?')
ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.0,1,533886,,120.0,70.0,4.0,,4.0,2.0,...,55.0,65.0,,,3.0,2,3205,0,0,2
296,2.0,1,527702,37.2,72.0,24.0,3.0,2.0,4.0,2.0,...,44.0,,3.0,3.3,3.0,1,2208,0,0,1
297,1.0,1,529386,37.5,72.0,30.0,4.0,3.0,4.0,1.0,...,60.0,6.8,,,2.0,1,3205,0,0,2
298,1.0,1,530612,36.5,100.0,24.0,3.0,3.0,3.0,1.0,...,50.0,6.0,3.0,3.4,1.0,1,2208,0,0,1


In [238]:
ds.isna().mean()

0     0.003333
1     0.000000
2     0.000000
3     0.200000
4     0.080000
5     0.193333
6     0.186667
7     0.230000
8     0.156667
9     0.106667
10    0.183333
11    0.146667
12    0.186667
13    0.346667
14    0.353333
15    0.823333
16    0.340000
17    0.393333
18    0.096667
19    0.110000
20    0.550000
21    0.660000
22    0.003333
23    0.000000
24    0.000000
25    0.000000
26    0.000000
27    0.000000
dtype: float64

In [239]:
X, y = ds.iloc[:, :-1], ds.iloc[:, -1]

In [353]:
X.isna().mean().mean()

0.19814814814814813

In [165]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

## Baseline Performance

In [315]:
classifier = make_pipeline(
    StandardScaler(),
    MLPClassifier(max_iter=1000, verbose=True, random_state=42)
)

In [325]:
clf = deepcopy(classifier)
clf.fit(Xtrain.fillna(Xtrain.mean()), ytrain)
pred = clf.predict(Xtest.fillna(Xtrain.mean()))

Iteration 1, loss = 0.72612800
Iteration 2, loss = 0.70327473
Iteration 3, loss = 0.68370743
Iteration 4, loss = 0.66613241
Iteration 5, loss = 0.65132236
Iteration 6, loss = 0.63683722
Iteration 7, loss = 0.62396852
Iteration 8, loss = 0.61261327
Iteration 9, loss = 0.60116629
Iteration 10, loss = 0.59093314
Iteration 11, loss = 0.58129814
Iteration 12, loss = 0.57180421
Iteration 13, loss = 0.56369980
Iteration 14, loss = 0.55575213
Iteration 15, loss = 0.54747788
Iteration 16, loss = 0.54028939
Iteration 17, loss = 0.53280124
Iteration 18, loss = 0.52590428
Iteration 19, loss = 0.51935600
Iteration 20, loss = 0.51289011
Iteration 21, loss = 0.50671837
Iteration 22, loss = 0.50087510
Iteration 23, loss = 0.49516291
Iteration 24, loss = 0.48949828
Iteration 25, loss = 0.48454879
Iteration 26, loss = 0.47907500
Iteration 27, loss = 0.47406306
Iteration 28, loss = 0.46916781
Iteration 29, loss = 0.46430583
Iteration 30, loss = 0.45970468
Iteration 31, loss = 0.45499709
Iteration 32, los

In [326]:
scores = {}
scores['mean', 'f1'] = f1_score(ytest, pred, average='macro')
scores['mean', 'accuracy'] = accuracy_score(ytest, pred)

## Iterative Imputation

In [289]:
def iterative_impute(estimator, ds_train, ds_test, iters=1):
    missing = ds_train.isna()
    df = ds_train.fillna(ds_train.mean())
    missing2 = ds_test.isna()
    df2 = ds_test.fillna(ds_train.mean())
    estimators = {}
    for _ in range(iters):
        for col in df.columns:
            if missing[col].any():
                print("Imputing Feature", col)
                mask = missing[col]
                Xtrain, ytrain = df.drop(col, axis=1)[~mask], df[col][~mask]
                est = estimators.setdefault(col, deepcopy(estimator))
                est.fit(Xtrain, ytrain)
                current = df[col]
                df[col] = current.mask(mask, est.predict(df.drop(col, axis=1)))
                print("Delta:", mean_absolute_error(df[col], current))
                df2[col] = df2[col].mask(missing2[col], est.predict(df2.drop(col, axis=1)))
                estimators[col] = est
    return df, df2

In [334]:
df_train, df_test = iterative_impute(
    RidgeCV(),
    pd.concat([Xtrain, ytrain], axis=1), 
    pd.concat([Xtest, ytest], axis=1), 
    iters=5)
Xtrain_imputed = df_train.iloc[:, :-1]
Xtest_imputed = df_test.iloc[:, :-1]

Imputing Feature 0
Delta: 0.001484889979948109
Imputing Feature 3
Delta: 0.04408703529346587
Imputing Feature 4
Delta: 1.544621381010714
Imputing Feature 5
Delta: 1.3237398381107994
Imputing Feature 6
Delta: 0.04379634812998547
Imputing Feature 7
Delta: 0.10539708272242677
Imputing Feature 8
Delta: 0.10261144402272886
Imputing Feature 9
Delta: 0.02653876986269139
Imputing Feature 10
Delta: 0.08806155117076514
Imputing Feature 11
Delta: 0.04841870628172907
Imputing Feature 12


Delta: 0.09067005871709324
Imputing Feature 13
Delta: 0.07099149796454624
Imputing Feature 14
Delta: 0.11467329448009538
Imputing Feature 15
Delta: 1.1327831850079007
Imputing Feature 16
Delta: 0.15238100159864904
Imputing Feature 17
Delta: 0.3133616418586664
Imputing Feature 18
Delta: 0.5185258295702163
Imputing Feature 19
Delta: 0.3516190089996564
Imputing Feature 20
Delta: 0.25238925665014267
Imputing Feature 21
Delta: 0.5012787625470454
Imputing Feature 22
Delta: 0.00016516334993846028
Imputing Feature 0
Delta: 0.00042206346365390816
Imputing Feature 3
Delta: 0.029526837041208894
Imputing Feature 4
Delta: 0.34213502183796946
Imputing Feature 5
Delta: 0.971068322003929
Imputing Feature 6
Delta: 0.03031176398570382
Imputing Feature 7
Delta: 0.024966403050252252
Imputing Feature 8
Delta: 0.027270149607636178
Imputing Feature 9
Delta: 0.006946371930798241
Imputing Feature 10
Delta: 0.03332590400742705
Imputing Feature 11
Delta: 0.013459634036646849
Imputing Feature 12
Delta: 0.02858805

In [335]:
clf = deepcopy(classifier)
clf.fit(Xtrain_imputed, ytrain)
pred = clf.predict(Xtest_imputed)

Iteration 1, loss = 0.70194376
Iteration 2, loss = 0.67564659
Iteration 3, loss = 0.65287017
Iteration 4, loss = 0.63321914
Iteration 5, loss = 0.61627325
Iteration 6, loss = 0.60020313
Iteration 7, loss = 0.58634264
Iteration 8, loss = 0.57410551
Iteration 9, loss = 0.56238799
Iteration 10, loss = 0.55179907
Iteration 11, loss = 0.54258701
Iteration 12, loss = 0.53334667
Iteration 13, loss = 0.52534941
Iteration 14, loss = 0.51792073
Iteration 15, loss = 0.51006766
Iteration 16, loss = 0.50338793
Iteration 17, loss = 0.49644137
Iteration 18, loss = 0.49006985
Iteration 19, loss = 0.48406903
Iteration 20, loss = 0.47824161
Iteration 21, loss = 0.47257019
Iteration 22, loss = 0.46743742
Iteration 23, loss = 0.46229016
Iteration 24, loss = 0.45739168
Iteration 25, loss = 0.45303772
Iteration 26, loss = 0.44820177
Iteration 27, loss = 0.44385361
Iteration 28, loss = 0.43948914
Iteration 29, loss = 0.43544749
Iteration 30, loss = 0.43148465
Iteration 31, loss = 0.42755231
Iteration 32, los

In [336]:
scores['ridge', 'f1'] = f1_score(ytest, pred, average='macro')
scores['ridge', 'accuracy'] = accuracy_score(ytest, pred)

In [320]:
# MissForest
df_train, df_test = iterative_impute(
    RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1), 
    pd.concat([Xtrain, ytrain], axis=1), 
    pd.concat([Xtest, ytest], axis=1), iters=5)
Xtrain_imputed = df_train.iloc[:, :-1]
Xtest_imputed = df_test.iloc[:, :-1]

Imputing Feature 0
Delta: 0.0016562064156206415
Imputing Feature 3
Delta: 0.048926058201057936
Imputing Feature 4
Delta: 1.6571900452488686
Imputing Feature 5
Delta: 1.175843537414966
Imputing Feature 6
Delta: 0.06208934707903778
Imputing Feature 7
Delta: 0.10666666666666667
Imputing Feature 8
Delta: 0.11047454844006568
Imputing Feature 9
Delta: 0.02499538341158059
Imputing Feature 10
Delta: 0.10625510204081631
Imputing Feature 11
Delta: 0.043202791461412156
Imputing Feature 12
Delta: 0.11118197278911565
Imputing Feature 13
Delta: 0.06391666666666668
Imputing Feature 14
Delta: 0.14119444444444446
Imputing Feature 15
Delta: 0.8785416666666666
Imputing Feature 16
Delta: 0.15135403726708072
Imputing Feature 17
Delta: 0.2900513698630137
Imputing Feature 18
Delta: 0.6244600614439323
Imputing Feature 19
Delta: 1.8654834507042253
Imputing Feature 20
Delta: 0.22075450450450446
Imputing Feature 21
Delta: 0.540204430379747
Imputing Feature 22
Delta: 0.0013556485355648536
Imputing Feature 0
Delta

In [329]:
clf = deepcopy(classifier)
clf.fit(Xtrain_imputed, ytrain)
pred = clf.predict(Xtest_imputed)

Iteration 1, loss = 0.69138816
Iteration 2, loss = 0.66703567
Iteration 3, loss = 0.64672006
Iteration 4, loss = 0.62874096
Iteration 5, loss = 0.61367800
Iteration 6, loss = 0.59941157
Iteration 7, loss = 0.58663820
Iteration 8, loss = 0.57619926
Iteration 9, loss = 0.56559702
Iteration 10, loss = 0.55646053
Iteration 11, loss = 0.54806710
Iteration 12, loss = 0.53967155
Iteration 13, loss = 0.53287456
Iteration 14, loss = 0.52623311
Iteration 15, loss = 0.51927722
Iteration 16, loss = 0.51351184
Iteration 17, loss = 0.50719777
Iteration 18, loss = 0.50186853
Iteration 19, loss = 0.49674751
Iteration 20, loss = 0.49180639
Iteration 21, loss = 0.48707798
Iteration 22, loss = 0.48285682
Iteration 23, loss = 0.47875888
Iteration 24, loss = 0.47459202
Iteration 25, loss = 0.47110403
Iteration 26, loss = 0.46702290
Iteration 27, loss = 0.46339817
Iteration 28, loss = 0.45951025
Iteration 29, loss = 0.45594722
Iteration 30, loss = 0.45240026
Iteration 31, loss = 0.44883697
Iteration 32, los

In [330]:
scores['missforest', 'f1'] = f1_score(ytest, pred, average='macro')
scores['missforest', 'accuracy'] = accuracy_score(ytest, pred)

In [339]:
df_train, df_test = iterative_impute(
    KNeighborsRegressor(n_neighbors=5, n_jobs=-1),
    pd.concat([Xtrain, ytrain], axis=1), 
    pd.concat([Xtest, ytest], axis=1), iters=5)
Xtrain_imputed = df_train.iloc[:, :-1]
Xtest_imputed = df_test.iloc[:, :-1]

Imputing Feature 0
Delta: 0.0016562064156206415
Imputing Feature 3


Delta: 0.05780952380952347
Imputing Feature 4
Delta: 1.7402300150829562
Imputing Feature 5
Delta: 1.3055102040816327
Imputing Feature 6
Delta: 0.08334192439862541
Imputing Feature 7
Delta: 0.0975
Imputing Feature 8
Delta: 0.12364121510673236
Imputing Feature 9
Delta: 0.020414710485133023
Imputing Feature 10
Delta: 0.12127551020408166
Imputing Feature 11
Delta: 0.06296387520525452
Imputing Feature 12
Delta: 0.09499999999999999
Imputing Feature 13
Delta: 0.08333333333333333
Imputing Feature 14
Delta: 0.1188888888888889
Imputing Feature 15
Delta: 1.0090833333333333
Imputing Feature 16
Delta: 0.14454968944099378
Imputing Feature 17
Delta: 0.32350456621004564
Imputing Feature 18
Delta: 0.4952956989247312
Imputing Feature 19
Delta: 2.5594256651017213
Imputing Feature 20
Delta: 0.269557057057057
Imputing Feature 21
Delta: 0.7171339662447258
Imputing Feature 22
Delta: 0.0011889818688981869
Imputing Feature 0
Delta: 0.0
Imputing Feature 3
Delta: 2.960594732333751e-17
Imputing Feature 4
Delta: 0

In [340]:
clf = deepcopy(classifier)
clf.fit(Xtrain_imputed, ytrain)
pred = clf.predict(Xtest_imputed)

Iteration 1, loss = 0.69138816
Iteration 2, loss = 0.66703567
Iteration 3, loss = 0.64672006
Iteration 4, loss = 0.62874096
Iteration 5, loss = 0.61367800
Iteration 6, loss = 0.59941157
Iteration 7, loss = 0.58663820
Iteration 8, loss = 0.57619926
Iteration 9, loss = 0.56559702
Iteration 10, loss = 0.55646053
Iteration 11, loss = 0.54806710
Iteration 12, loss = 0.53967155
Iteration 13, loss = 0.53287456


Iteration 14, loss = 0.52623311
Iteration 15, loss = 0.51927722
Iteration 16, loss = 0.51351184
Iteration 17, loss = 0.50719777
Iteration 18, loss = 0.50186853
Iteration 19, loss = 0.49674751
Iteration 20, loss = 0.49180639
Iteration 21, loss = 0.48707798
Iteration 22, loss = 0.48285682
Iteration 23, loss = 0.47875888
Iteration 24, loss = 0.47459202
Iteration 25, loss = 0.47110403
Iteration 26, loss = 0.46702290
Iteration 27, loss = 0.46339817
Iteration 28, loss = 0.45951025
Iteration 29, loss = 0.45594722
Iteration 30, loss = 0.45240026
Iteration 31, loss = 0.44883697
Iteration 32, loss = 0.44535679
Iteration 33, loss = 0.44200799
Iteration 34, loss = 0.43929356
Iteration 35, loss = 0.43595809
Iteration 36, loss = 0.43293434
Iteration 37, loss = 0.42992843
Iteration 38, loss = 0.42739822
Iteration 39, loss = 0.42451058
Iteration 40, loss = 0.42172950
Iteration 41, loss = 0.41909246
Iteration 42, loss = 0.41608898
Iteration 43, loss = 0.41318907
Iteration 44, loss = 0.41048452
Iteratio

In [341]:
scores['knn', 'f1'] = f1_score(ytest, pred, average='macro')
scores['knn', 'accuracy'] = accuracy_score(ytest, pred)

In [342]:
pd.Series(scores).unstack()

Unnamed: 0,accuracy,f1
knn,0.716667,0.644723
mean,0.8,0.753762
missforest,0.716667,0.644723
ridge,0.85,0.811912
