# Imputation testing code

### Before running code you must have :
 
 * train.csv in same directory.
 * python 3 installed
 * libraries used in code installed.
 
**Importing the libraries that will be used**

In [25]:
import pandas as pd
import numpy as np
import sys as sys

# for fitting a decision tree
from sklearn.tree import DecisionTreeRegressor

# this import an renaming is needed to import missforest
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
# Importing the library from python to impute the missing values with MissForest tactic.
from missingpy import MissForest

# Import the KNNImputer
from sklearn.impute import KNNImputer

# Can create a confusion matrix with accuracy to see how good model is.
from sklearn import metrics

**Reading the train.csv in the code to use**

In [6]:
original_df = pd.read_csv("train.csv") 
original_df.head(6)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1
5,7,A,I,C,A,E,BI,C,AV,Y,...,0.52476,0.580085,0.301498,0.261055,0.193988,0.935688,0.685868,0.277694,0.428115,0


### Comments (Tactic to follow Q):
Should we use only one column for testing or the whole table?
- Here we use only 3 columns cont0, cont1 and cont2 just for testing illustrations for now.

In [7]:
# Keeping columns interested.
imputation_data = original_df[['cont0','cont1', 'cont2', 'target']].copy()
# Keep original columns to compare with the imputed values.
original_used_data = imputation_data.copy()
imputation_data.head(3)

Unnamed: 0,cont0,cont1,cont2,target
0,0.629858,0.855349,0.759439,0
1,0.370727,0.328929,0.386385,0
2,0.502272,0.322749,0.343255,0


In [8]:
# Divide predictors from respond.
imputation_predictors = imputation_data[['cont0', 'cont1', 'cont2']].copy()
imputation_respond = imputation_data[['target']].copy()
imputation_predictors.head(5)

Unnamed: 0,cont0,cont1,cont2
0,0.629858,0.855349,0.759439
1,0.370727,0.328929,0.386385
2,0.502272,0.322749,0.343255
3,0.934242,0.707663,0.831147
4,0.254427,0.274514,0.338818


In [9]:
imputation_respond.head(5)

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,1


In [10]:
### Fit a model to predict target using cont 2 with all the true values.

# Initializing random-state for repsoducible reason.
tree_reg = DecisionTreeRegressor(random_state=96)
# Fitting a Decision Tree
tree_reg.fit(imputation_predictors, imputation_respond)

DecisionTreeRegressor(random_state=96)

In [11]:
# Making predictions with same data used to train model.
predicted = tree_reg.predict(imputation_predictors)
print("True values: ", imputation_respond.values[0:10])
print("Predicted values: ", predicted[0:10])

True values:  [[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]]
Predicted values:  [0. 0. 0. 0. 1. 0. 0. 1. 0. 1.]


In [12]:
# Can create a confusion matrix with accuracy to see how good model is.
from sklearn import metrics
print(metrics.confusion_matrix(imputation_respond, predicted))

[[220539      0]
 [     0  79461]]


We see a PERFECT accuracy prediction. Most likely is overfitting because memorising the data.

## Randomly Removing the values and making them null

In [13]:
len(imputation_predictors)*20/100

60000.0

In [14]:
# funtion the will remove values from column cont1 at random
# replace the removed values with NaN which is null.
import random
def remove_random_values(df):
    # Set seed for reproducible reasons. Achieving same modified set every run.
    random.seed(5059)
    
    # Getting 20% of the data sample to remove values.
    put_missing_values = df.sample(
        int(len(df)*20/100))
    
    # Replacing all values with string nan because we cannot update data frame IF
    # values are null. Python will keep old values in the update.
    put_missing_values['cont0'] = 'nan'
    
    # Update data frame with new 'nan' values
    df.update(put_missing_values)

    # Replacing the 'nan' values to real nan values.
    df = df.replace('nan', np.nan)
    
    return df

In [15]:
# Calling function to remove randome values and replacing them with null.
median_imputation = remove_random_values(imputation_predictors.copy())
median_imputation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cont0   240000 non-null  float64
 1   cont1   300000 non-null  float64
 2   cont2   300000 non-null  float64
dtypes: float64(3)
memory usage: 6.9 MB


From the above results we see we have 60 000 values as null in column 'cont0'

## Imputing with median value.

In [16]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit(median_imputation)

RawOutput = imputer.transform(median_imputation) # the output is an array

median_imputation = pd.DataFrame(RawOutput, 
                                 columns = median_imputation.columns,
                                 index = median_imputation.index)

median_imputation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cont0   300000 non-null  float64
 1   cont1   300000 non-null  float64
 2   cont2   300000 non-null  float64
dtypes: float64(3)
memory usage: 6.9 MB


In [17]:
# Making predictions with same data used to train model.
predicted_median = tree_reg.predict(median_imputation)
print("True values: ", imputation_respond.values[0:10])
print("Predicted values: ", predicted_median[0:10])

True values:  [[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]]
Predicted values:  [0. 0. 0. 0. 1. 0. 0. 1. 0. 1.]


In [24]:
# Can create a confusion matrix with accuracy to see how good model is.
print(metrics.confusion_matrix(imputation_respond, predicted_median))

[[208982  11557]
 [  9192  70269]]


## Imputing with MissForest

imputing tactic found on youtube [press here to view](https://www.youtube.com/watch?v=WhUm9jCoYf4) and website ecplaining in text [press here](https://www.betterdatascience.com/python-missforest/)

- We can write about it towards the end. We have the material here.

In [35]:
# Getting the data frame with missing values to impute.
miss_forest_data = remove_random_values(imputation_predictors.copy())

# Create imputation tactic.
miss_forest_imputer = MissForest()

In [36]:
%%time
# Imputing the missing values.
miss_forest_data = imputer.fit_transform(miss_forest_data)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
CPU times: user 12min 27s, sys: 14 s, total: 12min 41s
Wall time: 3min 30s


In [21]:
# Making predictions with same data used to train model.
predicted_missforest = tree_reg.predict(miss_forest_imputer)
print("True values: ", imputation_respond.values[0:10])
print("Predicted values: ", predicted_missforest[0:10])

True values:  [[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]]
Predicted values:  [0. 0. 1. 0. 1. 0. 0. 1. 0. 1.]


In [23]:
print(metrics.confusion_matrix(imputation_respond, predicted_missforest))

[[210292  10247]
 [ 10065  69396]]


## Impute with KNNImputer

- We can research a bit more on the algorithm, for now we testing them to see how they work. I found this from : [press here](https://towardsdatascience.com/missing-value-imputation-with-python-and-k-nearest-neighbors-308e7abd273d)

In [37]:
# Getting the data frame with missing values to impute.
knn_imputer_data = remove_random_values(imputation_predictors.copy())

# We can adjust hyperparameter n_neighbors.
knn_imputer = KNNImputer(n_neighbors=3)

In [38]:
%%time
# Imputing values
knn_imputer_data = imputer.fit_transform(knn_imputer_data)

Iteration: 0
Iteration: 1
Iteration: 2
CPU times: user 6min 17s, sys: 5.9 s, total: 6min 23s
Wall time: 1min 41s


In [27]:
# Making predictions with same data used to train model.
predicted_knn = tree_reg.predict(knn_imputer_data)
print("True values: ", imputation_respond.values[0:10])
print("Predicted values: ", predicted_knn[0:10])

True values:  [[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]]
Predicted values:  [0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]


In [28]:
print(metrics.confusion_matrix(imputation_respond, predicted_knn))

[[210203  10336]
 [ 10019  69442]]
