In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
# Load scikit-learn's diabetes dataset, then pull the feature matrix and
# the target vector out of the returned container.
diabetes = datasets.load_diabetes()
X_full, y = diabetes.data, diabetes.target

# Sanity check: show the dimensions of the feature matrix.
print(X_full.shape)

(442, 10)


AttributeError: info

In [3]:
# Inject missing values: every entry of the feature matrix is independently
# replaced by NaN with probability `miss_ratio` (roughly 10% of all entries).
miss_ratio = 0.1
rng = np.random.RandomState(0)  # fixed seed keeps the mask reproducible

# One uniform draw per entry; a draw below the ratio marks that entry as missing.
X_unif = rng.random_sample(size=X_full.shape)
X_ind = X_unif < miss_ratio

# Build the corrupted matrix as a new array so X_full stays intact.
X_miss = np.where(X_ind, np.nan, X_full)

# Count of entries that were turned into NaN.
print(X_miss[X_ind].shape)

(433,)


In [4]:
# Inspect the feature matrix after corruption; injected gaps show up as nan.
print(X_miss)

[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990749
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06833155
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286131
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952         nan
   0.01549073]
 [-0.04547248         nan  0.03906215 ...  0.02655962         nan
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338         nan
   0.00306441]]


In [5]:
# Split the dataset into training (70%) and testing (30%) parts.
# The complete features, the features with missing values and the targets
# all go through ONE call, so they are partitioned by the same shuffled row
# indices. This replaces two separate calls that only stayed aligned because
# they shared a random_state, and that assigned y_train / y_test twice.
(X_full_train, X_full_test,
 X_miss_train, X_miss_test,
 y_train, y_test) = train_test_split(X_full, X_miss, y, test_size=0.30, random_state=42)

In [6]:
# Baseline: ordinary least squares trained on the data without missing values.
ols_full = linear_model.LinearRegression(fit_intercept=True).fit(X_full_train, y_train)

# Root-mean-squared error of the baseline on the held-out test rows.
y_pred_full = ols_full.predict(X_full_test)
rmse_full = np.sqrt(mean_squared_error(y_test, y_pred_full))
print('RMSE - full = ', rmse_full)

RMSE - full =  53.1201560709427


In [7]:
def test_imputations(my_strategy, my_fill_value, X_miss_train, X_miss_test, y_train, y_test):
    """Impute NaNs with SimpleImputer, fit a linear regression, print the test RMSE.

    The imputer is fitted on the training features only and is then applied
    to both the training and the test features. A LinearRegression is trained
    on the imputed training features and evaluated on the imputed test
    features.

    Parameters
    ----------
    my_strategy
        Forwarded to SimpleImputer as ``strategy``; also echoed in the
        printed line.
    my_fill_value
        Forwarded to SimpleImputer as ``fill_value``.
    X_miss_train, X_miss_test
        Training / test features in which missing entries are ``np.nan``.
    y_train, y_test
        Training / test targets.

    Returns
    -------
    None
        The RMSE is printed, not returned.
    """
    simple_imputer = SimpleImputer(
        strategy=my_strategy,
        fill_value=my_fill_value,
        missing_values=np.nan,
    )

    # Imputation statistics come from the training split only; the test
    # split is filled in with those same statistics.
    train_filled = simple_imputer.fit_transform(X_miss_train)
    test_filled = simple_imputer.transform(X_miss_test)

    regressor = linear_model.LinearRegression(fit_intercept=True)
    regressor.fit(train_filled, y_train)

    rmse = np.sqrt(mean_squared_error(y_test, regressor.predict(test_filled)))
    print('RMSE - ', my_strategy, ' = ', rmse)

In [8]:
# Evaluate each SimpleImputer strategy on the same train/test split
# (the fill value 0 is passed along for every strategy).
for strategy in ('mean', 'median', 'most_frequent', 'constant'):
    test_imputations(strategy, 0, X_miss_train, X_miss_test, y_train, y_test)

RMSE -  mean  =  56.0786464183594
RMSE -  median  =  55.51057590795774
RMSE -  most_frequent  =  55.04347411862263
RMSE -  constant  =  55.92789379943154


In [9]:
!pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting cvxpy
  Downloading cvxpy-1.2.1-cp39-cp39-win_amd64.whl (833 kB)
     -------------------------------------- 833.2/833.2 kB 6.6 MB/s eta 0:00:00
Collecting cvxopt
  Downloading cvxopt-1.3.0-cp39-cp39-win_amd64.whl (12.7 MB)
     --------------------------------------- 12.7/12.7 MB 24.2 MB/s eta 0:00:00
Collecting pytest
  Downloading pytest-7.1.3-py3-none-any.whl (298 kB)
     ------------------------------------- 298.2/298.2 kB 18.0 MB/s eta 0:00:00
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
     -------------------------------------- 154.7/154.7 kB 9.6 MB/s eta 0:00:00
Collecting ecos>=2
  Downloading ecos-2

In [10]:
# MICE (Multivariate Imputation by Chained Equations)
# Video explanation of MICE: https://www.youtube.com/watch?v=zX-pacwVyvU
# fancyimpute is a third-party package and has to be installed before this cell runs.
import fancyimpute as fimp

# Fit the iterative imputer on the training features only.
imputer = fimp.IterativeImputer(random_state=0, sample_posterior=True).fit(X_miss_train)

# Fill in the training features first, then the test features.
X_fancy_train = imputer.transform(X_miss_train)
X_fancy_test = imputer.transform(X_miss_test)

# Train OLS on the imputed training data and report RMSE on the imputed test data.
ols_fancy = linear_model.LinearRegression(fit_intercept=True).fit(X_fancy_train, y_train)
rmse_fancy = np.sqrt(mean_squared_error(y_test, ols_fancy.predict(X_fancy_test)))
print('RMSE - fancy = ', rmse_fancy)

RMSE - fancy =  57.56247404233082
