In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import numpy as np
random.seed(0)

In [16]:
#Fetching the dataset
import pandas as pd
dataset = fetch_california_housing()
train, target = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)

In [17]:
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24


In [18]:
target.head(3)

Unnamed: 0,0
0,4.526
1,3.585
2,3.521


In [19]:
train.columns = ['0','1','2','3','4','5','6','7']
train.insert(loc=len(train.columns), column='target', value=target)

In [20]:
train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [21]:
#Randomly replace 40% of the first column with NaN values
column = train['0']
print(column.size)
missing_pct = int(column.size * 0.4)
print(missing_pct)
print(column.shape[0])
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
print(column.shape[0])

20640
8256
20640
20640


In [16]:
train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
#Impute the values using scikit-learn SimpleImpute Class
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='median') #for median imputation replace 'mean' with 'median'
imp_mean.fit(train)
imputed_train_df = imp_mean.transform(train)

In [11]:
#Creating pandas dataframe from numpy array
dataset_imp = pd.DataFrame({'0':imputed_train_df[:,0],'1':imputed_train_df[:,1],'2':imputed_train_df[:,2],'3':imputed_train_df[:,3],'4':imputed_train_df[:,4],'5':imputed_train_df[:,5],'6':imputed_train_df[:,6],'7':imputed_train_df[:,7],'target':imputed_train_df[:,8]})
print(dataset_imp.head(5))

        0     1         2         3       4         5      6       7  target
0  3.5497  41.0  6.984127  1.023810   322.0  2.555556  37.88 -122.23   4.526
1  8.3014  21.0  6.238137  0.971880  2401.0  2.109842  37.86 -122.22   3.585
2  3.5497  52.0  8.288136  1.073446   496.0  2.802260  37.85 -122.24   3.521
3  5.6431  52.0  5.817352  1.073059   558.0  2.547945  37.85 -122.25   3.413
4  3.5497  52.0  6.281853  1.081081   565.0  2.181467  37.85 -122.25   3.422


In [17]:
#Most Frequent
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(train)
imputed_train_df = imp_mean.transform(train)

In [18]:
#Creating pandas dataframe from numpy array
dataset_imp = pd.DataFrame({'0':imputed_train_df[:,0],'1':imputed_train_df[:,1],'2':imputed_train_df[:,2],'3':imputed_train_df[:,3],'4':imputed_train_df[:,4],'5':imputed_train_df[:,5],'6':imputed_train_df[:,6],'7':imputed_train_df[:,7],'target':imputed_train_df[:,8]})
print(dataset_imp.head(5))

         0     1         2         3       4         5      6       7  target
0   8.3252  41.0  6.984127  1.023810   322.0  2.555556  37.88 -122.23   4.526
1   8.3014  21.0  6.238137  0.971880  2401.0  2.109842  37.86 -122.22   3.585
2   7.2574  52.0  8.288136  1.073446   496.0  2.802260  37.85 -122.24   3.521
3  15.0001  52.0  5.817352  1.073059   558.0  2.547945  37.85 -122.25   3.413
4   3.8462  52.0  6.281853  1.081081   565.0  2.181467  37.85 -122.25   3.422


In [35]:
#KNN
import sys
from impyute.imputation.cs import fast_knn
sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

# start the KNN training
imputed_train_df=fast_knn(train.values, k=30)

In [36]:
#Creating pandas dataframe from numpy array
dataset_imp = pd.DataFrame({'0':imputed_train_df[:,0],'1':imputed_train_df[:,1],'2':imputed_train_df[:,2],'3':imputed_train_df[:,3],'4':imputed_train_df[:,4],'5':imputed_train_df[:,5],'6':imputed_train_df[:,6],'7':imputed_train_df[:,7],'target':imputed_train_df[:,8]})
print(dataset_imp.head(5))

          0     1         2         3       4         5      6       7  target
0  3.301179  41.0  6.984127  1.023810   322.0  2.555556  37.88 -122.23   4.526
1  8.301400  21.0  6.238137  0.971880  2401.0  2.109842  37.86 -122.22   3.585
2  4.313269  52.0  8.288136  1.073446   496.0  2.802260  37.85 -122.24   3.521
3  5.643100  52.0  5.817352  1.073059   558.0  2.547945  37.85 -122.25   3.413
4  3.552823  52.0  6.281853  1.081081   565.0  2.181467  37.85 -122.25   3.422


In [37]:
#MICE
from impyute.imputation.cs import mice

# start the MICE training
imputed_train_df=mice(train.values)

In [38]:
#Creating pandas dataframe from numpy array
dataset_imp = pd.DataFrame({'0':imputed_train_df[:,0],'1':imputed_train_df[:,1],'2':imputed_train_df[:,2],'3':imputed_train_df[:,3],'4':imputed_train_df[:,4],'5':imputed_train_df[:,5],'6':imputed_train_df[:,6],'7':imputed_train_df[:,7],'target':imputed_train_df[:,8]})
print(dataset_imp.head(5))

          0     1         2         3       4         5      6       7  target
0  7.042911  41.0  6.984127  1.023810   322.0  2.555556  37.88 -122.23   4.526
1  8.301400  21.0  6.238137  0.971880  2401.0  2.109842  37.86 -122.22   3.585
2  6.931901  52.0  8.288136  1.073446   496.0  2.802260  37.85 -122.24   3.521
3  5.643100  52.0  5.817352  1.073059   558.0  2.547945  37.85 -122.25   3.413
4  5.280041  52.0  6.281853  1.081081   565.0  2.181467  37.85 -122.25   3.422


In [44]:
#Datawig
import datawig

df_train, df_test = datawig.utils.random_split(train)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['1','2','3','4','5','6','7', 'target'], # column(s) containing information about the column we want to impute
    output_column= '0', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

