# Different Imputation Methods to Handle Missing Data

In [1]:
import numpy as np
import pandas as pd

### Dataframe example

In [2]:
dic = {'f1': [1,4,np.nan,np.nan,np.nan,8,np.nan,np.nan,9,np.nan],'f2':[0.1,0.6,0.2,0.7,0.7,0.3,0.2,0.8,0.4,0.3], 'f3': [0.62,0.22,0.63,0.36,0.38,0.07,0.6,0.17,0.24,0.05],'f4': [0.517,0.879,0.821,0.018,0.469,0.969,0.439,0.177,0.664,0.007], 'Class':[0,1,1,1,0,0,1,0,0,1]}
df_example = pd.DataFrame(dic)

In [3]:
df_example

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,,0.2,0.63,0.821,1
3,,0.7,0.36,0.018,1
4,,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,,0.2,0.6,0.439,1
7,,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,,0.3,0.05,0.007,1


## 1.What is imputation?

Imputation is the process of replacing missing values with substituted data.

## 2.Normal Imputation

In the example, the f1 feature has missing value. We can replace the missing values with different methods depending on the data type of feature f1.

- Mean
- Median
- Mode

If the data is numerical : Mean and Median

If the data is categorial: Mode (Frequently occurring value) 

### Create the same dataframe example to fill missing values with mean method

In [4]:
df_example_NormalImputation_mean = pd.DataFrame(dic)

In [5]:
df_example_NormalImputation_mean

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,,0.2,0.63,0.821,1
3,,0.7,0.36,0.018,1
4,,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,,0.2,0.6,0.439,1
7,,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,,0.3,0.05,0.007,1


In [6]:
from sklearn.impute import SimpleImputer
#Instantiate SimpleImputer
si=SimpleImputer(missing_values = np.nan, strategy='mean')
si.fit(df_example_NormalImputation_mean[['f1']])
  
#Filling missing data with mean
df_example_NormalImputation_mean[['f1']] = si.transform(df_example_NormalImputation_mean[['f1']])

df_example_NormalImputation_mean

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,5.5,0.2,0.63,0.821,1
3,5.5,0.7,0.36,0.018,1
4,5.5,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,5.5,0.2,0.6,0.439,1
7,5.5,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,5.5,0.3,0.05,0.007,1


As we could see, the missing values were filling with 5.5 that is the mean of the column 'f1'

### Create the same dataframe example to fill missing values with median method

In [7]:
df_example_NormalImputation_median = pd.DataFrame(dic)
df_example_NormalImputation_median

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,,0.2,0.63,0.821,1
3,,0.7,0.36,0.018,1
4,,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,,0.2,0.6,0.439,1
7,,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,,0.3,0.05,0.007,1


In [8]:
from sklearn.impute import SimpleImputer
#Instantiate SimpleImputer
si=SimpleImputer(missing_values = np.nan, strategy='median')
si.fit(df_example_NormalImputation_median[['f1']])
  
#Filling missing data with median
df_example_NormalImputation_median[['f1']] = si.transform(df_example_NormalImputation_median[['f1']])

df_example_NormalImputation_median

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,6.0,0.2,0.63,0.821,1
3,6.0,0.7,0.36,0.018,1
4,6.0,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,6.0,0.2,0.6,0.439,1
7,6.0,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,6.0,0.3,0.05,0.007,1


As we could see, the missing values were filling with 6.0 that is the median of the column 'f1'

## 3.Imputation based on class label
Here, we take based on class

It takes the average of all the values in the feature f1 that belongs to class 0 or 1 and replace the missing value. Same with median and mode.

In [9]:
#Calculate the mean of the f1 column where class label is equal to 1 and the mean of f1 column where class label is equal to 0
mean_cero = df_example.f1[df_example['Class']==0].mean()
mean_one  = df_example.f1[df_example['Class']==1].mean()

Now, create a Series where replace the missing values with each mean... based on the class label (0 or 1)

In [10]:
f1_0 = df_example.f1[df_example['Class']==0].replace(np.nan, mean_cero)

In [11]:
f1_1 = df_example.f1[df_example['Class']==1].replace(np.nan, mean_one)

In [12]:
#Join both series and sort the index 
f1_total = f1_0.append(f1_1).sort_index()

In [13]:
# Delete the f1 column with missing values
df_example = df_example.drop(['f1'], axis=1)

In [14]:
# Aggregate to dataframe, the new f1 column with values based on class label
df_example['f1'] = f1_total

In [15]:
# Sort the columns and show the new dataframe without missing values
df_example = df_example[['f1','f2','f3','f4','Class']]
df_example

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,4.0,0.2,0.63,0.821,1
3,4.0,0.7,0.36,0.018,1
4,6.0,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,4.0,0.2,0.6,0.439,1
7,6.0,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,4.0,0.3,0.05,0.007,1


## 4.Model-based imputation

The idea is take feature f1 as the class and all the remaining columns as feature. Then, train the data with any model and predict the missing values.

Here, we have train data and test data that has missing values in feature f1. We are going to use the KNN algorithm and take k=2 to train the model since it is simple and uses neighborhood concept.

### TRAIN DATAFRAME

In [16]:
dic = {'f1': [9,4,3,3,7,7,4,1,9,9],'f2':[0.1,0.6,0.2,0.7,0.7,0.3,0.2,0.8,0.4,0.3], 'f3': [0.62,0.22,0.63,0.36,0.38,0.07,0.6,0.17,0.24,0.05],'f4': [0.517,0.879,0.821,0.018,0.469,0.969,0.439,0.177,0.664,0.007], 'Class':[0,1,1,1,0,0,1,0,0,1]}
df_train = pd.DataFrame(dic)

In [17]:
df_train

Unnamed: 0,f1,f2,f3,f4,Class
0,9,0.1,0.62,0.517,0
1,4,0.6,0.22,0.879,1
2,3,0.2,0.63,0.821,1
3,3,0.7,0.36,0.018,1
4,7,0.7,0.38,0.469,0
5,7,0.3,0.07,0.969,0
6,4,0.2,0.6,0.439,1
7,1,0.8,0.17,0.177,0
8,9,0.4,0.24,0.664,0
9,9,0.3,0.05,0.007,1


### TEST DATAFRAME

In [18]:
dic = {'f1': [1,4,np.nan,np.nan,np.nan,8,np.nan,np.nan,9,np.nan],'f2':[0.1,0.6,0.2,0.7,0.7,0.3,0.2,0.8,0.4,0.3], 'f3': [0.62,0.22,0.63,0.36,0.38,0.07,0.6,0.17,0.24,0.05],'f4': [0.517,0.879,0.821,0.018,0.469,0.969,0.439,0.177,0.664,0.007], 'Class':[0,1,1,1,0,0,1,0,0,1]}
df_test = pd.DataFrame(dic)

In [19]:
df_test

Unnamed: 0,f1,f2,f3,f4,Class
0,1.0,0.1,0.62,0.517,0
1,4.0,0.6,0.22,0.879,1
2,,0.2,0.63,0.821,1
3,,0.7,0.36,0.018,1
4,,0.7,0.38,0.469,0
5,8.0,0.3,0.07,0.969,0
6,,0.2,0.6,0.439,1
7,,0.8,0.17,0.177,0
8,9.0,0.4,0.24,0.664,0
9,,0.3,0.05,0.007,1


In [20]:
#Define the train data and test data
train_data = df_train
test_data = df_test

In [21]:
#Split the data 
X_train = train_data.drop(['f1'], axis=1)
Y_train = train_data['f1']
X_test = test_data.drop(['f1'],axis=1)

In [22]:
#With KNN impute the missing values
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train, Y_train)
f1_ = knn.predict(X_test)
f1_

array([9, 3, 3, 3, 1, 7, 3, 1, 7, 3], dtype=int64)

In [23]:
#Implement the algorithm with different K value

neighbors = np.arange(1,9) #Creacion de array
train_accuaracy = np.empty(len(neighbors))
test_accuaracy = np.empty(len(neighbors))

for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors= k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    print("With",k,"Neightborhs, f1 predicted is:",Y_pred)
    print("mean:", Y_pred.mean())
    print("median:", np.median(Y_pred), "\n")

With 1 Neightborhs, f1 predicted is: [9 4 3 3 7 7 4 1 9 9]
mean: 5.6
median: 5.5 

With 2 Neightborhs, f1 predicted is: [9 3 3 3 1 7 3 1 7 3]
mean: 4.0
median: 3.0 

With 3 Neightborhs, f1 predicted is: [9 4 4 3 1 7 3 1 7 3]
mean: 4.2
median: 3.5 

With 4 Neightborhs, f1 predicted is: [7 3 3 4 9 7 3 9 7 4]
mean: 5.6
median: 5.5 

With 5 Neightborhs, f1 predicted is: [7 3 3 3 7 7 3 7 7 3]
mean: 5.0
median: 5.0 

With 6 Neightborhs, f1 predicted is: [7 3 3 3 7 7 3 7 7 3]
mean: 5.0
median: 5.0 

With 7 Neightborhs, f1 predicted is: [7 3 9 3 7 7 9 9 7 3]
mean: 6.4
median: 7.0 

With 8 Neightborhs, f1 predicted is: [4 3 9 3 4 4 9 9 4 3]
mean: 5.2
median: 4.0 



In [24]:
# Delete the f1 column with missing values
df_test = df_test.drop(['f1'],axis=1)
# Aggregate to dataframe, the new f1 column with values based on class label
df_test['f1'] = f1_
# Sort the columns and show the new dataframe without missing values
df_test = df_test[['f1','f2','f3','f4','Class']]
df_test

Unnamed: 0,f1,f2,f3,f4,Class
0,9,0.1,0.62,0.517,0
1,3,0.6,0.22,0.879,1
2,3,0.2,0.63,0.821,1
3,3,0.7,0.36,0.018,1
4,1,0.7,0.38,0.469,0
5,7,0.3,0.07,0.969,0
6,3,0.2,0.6,0.439,1
7,1,0.8,0.17,0.177,0
8,7,0.4,0.24,0.664,0
9,3,0.3,0.05,0.007,1


## 5.CREATING MISSING VALUE FEATURE

In addition to performing imputation on the features, we can create new corresponding features which will have binary values that say whether the data is missing in the features or not with 0 as not missing and 1 as missing. We do this for the record and also missing values can be a source of useful information.

In [25]:
#Create the dataframe example with missing values
dic = {'f1': [1,4,np.nan,np.nan,np.nan,8,np.nan,np.nan,9,np.nan],'f2':[0.1,0.6,0.2,0.7,0.7,0.3,0.2,0.8,0.4,0.3], 'f3': [0.62,0.22,0.63,0.36,0.38,0.07,0.6,0.17,0.24,0.05],'f4': [0.517,0.879,0.821,0.018,0.469,0.969,0.439,0.177,0.664,0.007], 'Class':[0,1,1,1,0,0,1,0,0,1]}
df = pd.DataFrame(dic)

In [26]:
#Find location where exists missing values
df_ = df.isna()

In [27]:
df_

Unnamed: 0,f1,f2,f3,f4,Class
0,False,False,False,False,False
1,False,False,False,False,False
2,True,False,False,False,False
3,True,False,False,False,False
4,True,False,False,False,False
5,False,False,False,False,False
6,True,False,False,False,False
7,True,False,False,False,False
8,False,False,False,False,False
9,True,False,False,False,False


In [28]:
#Replace False -> 0 and True -> 1
df_=df_.replace([False,True],[0,1])

In [29]:
df_

Unnamed: 0,f1,f2,f3,f4,Class
0,0,0,0,0,0
1,0,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
5,0,0,0,0,0
6,1,0,0,0,0
7,1,0,0,0,0
8,0,0,0,0,0
9,1,0,0,0,0


In [30]:
#Change the name of columns
df_.columns = ["f1'","f2'","f3'","f4'","Class'"]

In [31]:
#Join the dataframe example with new binary dataframe
df_df = df.merge(df_, left_index=True, right_index=True)

In [32]:
df_df

Unnamed: 0,f1,f2,f3,f4,Class,f1',f2',f3',f4',Class'
0,1.0,0.1,0.62,0.517,0,0,0,0,0,0
1,4.0,0.6,0.22,0.879,1,0,0,0,0,0
2,,0.2,0.63,0.821,1,1,0,0,0,0
3,,0.7,0.36,0.018,1,1,0,0,0,0
4,,0.7,0.38,0.469,0,1,0,0,0,0
5,8.0,0.3,0.07,0.969,0,0,0,0,0,0
6,,0.2,0.6,0.439,1,1,0,0,0,0
7,,0.8,0.17,0.177,0,1,0,0,0,0
8,9.0,0.4,0.24,0.664,0,0,0,0,0,0
9,,0.3,0.05,0.007,1,1,0,0,0,0




## In this notebook, you have different imputation methods for handling missing data.

# The project was made with the guide from this link:  https://towardsdatascience.com/different-imputation-methods-to-handle-missing-data-8dd5bce97583#7d84