##  Appendix D

In [1]:
# !pip install missingpy

In [2]:
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
import sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
df= pd.read_excel("C:/Users/akash/Downloads/Movie Reviews data/Missing value data set.xlsx")
df.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,Male,69.503512,Non-smoker,24.607049,,5672.764336
1,Male,70.139632,,23.5138,66.674258,5631.767518
2,,68.153641,Non-smoker,25.660212,35.675341,5712.257932
3,Male,70.917129,Non-smoker,,67.721565,5723.326423
4,Male,,Smoker,19.571829,58.310837,5483.943573


In [4]:
df.columns

Index(['Gender', 'Height', 'Smoker', 'Exercise', 'Age', 'Lung Capacity'], dtype='object')

In [5]:
df.shape

(100, 6)

In [6]:
df.isnull().sum()

Gender           11
Height            3
Smoker            7
Exercise          4
Age               5
Lung Capacity     0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Gender         89 non-null     object 
 1   Height         97 non-null     float64
 2   Smoker         93 non-null     object 
 3   Exercise       96 non-null     float64
 4   Age            95 non-null     float64
 5   Lung Capacity  100 non-null    float64
dtypes: float64(4), object(2)
memory usage: 4.8+ KB


In [8]:
# converting Categorical text coloumn into Categorical numerical
cateogry_columns = df.select_dtypes(include=['object']).columns.tolist() # to get categorical columns as list
encoder_columns = [] # to store particular encoder object

for cat_column in cateogry_columns:
    print('Categorical column is : ', cat_column)
    encoder = LabelEncoder()
    df[cat_column] = df[cat_column].fillna('missing') # Fill missing values with a special value (e.g., 'missing')
    df[cat_column] = encoder.fit_transform(df[cat_column])
    # Convert the special value back to NaN
    df[cat_column] = df[cat_column].replace(encoder.transform(['missing'])[0], np.nan)
    print(encoder.classes_)
    encoder_columns.append(encoder)
    
df.head()

Categorical column is :  Gender
['Female' 'Male' 'missing']
Categorical column is :  Smoker
['Non-smoker' 'Smoker' 'missing']


Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,1.0,69.503512,0.0,24.607049,,5672.764336
1,1.0,70.139632,,23.5138,66.674258,5631.767518
2,,68.153641,0.0,25.660212,35.675341,5712.257932
3,1.0,70.917129,0.0,,67.721565,5723.326423
4,1.0,,1.0,19.571829,58.310837,5483.943573


In [9]:
df.isnull().sum()

Gender           11
Height            3
Smoker            7
Exercise          4
Age               5
Lung Capacity     0
dtype: int64

## Miss Forest

In [10]:
imputer = MissForest(max_iter=10) #miss forest
#  to impute the missing values.
df_missforest = imputer.fit_transform(df, cat_vars = [0,2]) # cat_vars : an array containing column indices of categorical variable(s) present in dataset.
df_missforest = pd.DataFrame(df_missforest, columns = df.columns) 
df_missforest.head()

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3


Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,1.0,69.503512,0.0,24.607049,46.825169,5672.764336
1,1.0,70.139632,0.0,23.5138,66.674258,5631.767518
2,1.0,68.153641,0.0,25.660212,35.675341,5712.257932
3,1.0,70.917129,0.0,25.469206,67.721565,5723.326423
4,1.0,67.726899,1.0,19.571829,58.310837,5483.943573


In [11]:
cateogry_columns # list of categorical columns
encoder_columns # list of particular encoder object

for index, cat_column in enumerate(cateogry_columns):
    # inverting number to text
    df_missforest[cat_column] = encoder_columns[index].inverse_transform([int(x) for x in df_missforest[cat_column].tolist()]) # convert float to int
    
df_missforest.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,Male,69.503512,Non-smoker,24.607049,46.825169,5672.764336
1,Male,70.139632,Non-smoker,23.5138,66.674258,5631.767518
2,Male,68.153641,Non-smoker,25.660212,35.675341,5712.257932
3,Male,70.917129,Non-smoker,25.469206,67.721565,5723.326423
4,Male,67.726899,Smoker,19.571829,58.310837,5483.943573


In [12]:
# saving final df
df_missforest.to_csv('df_missforest.csv',index=False)
# re read final df
dfm = pd.read_csv("df_missforest.csv")
dfm.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,Male,69.503512,Non-smoker,24.607049,46.825169,5672.764336
1,Male,70.139632,Non-smoker,23.5138,66.674258,5631.767518
2,Male,68.153641,Non-smoker,25.660212,35.675341,5712.257932
3,Male,70.917129,Non-smoker,25.469206,67.721565,5723.326423
4,Male,67.726899,Smoker,19.571829,58.310837,5483.943573


## KNN Imputation

In [13]:
impute = KNNImputer(n_neighbors=5) #KNN imputation
df_KNN = impute.fit_transform(df)
df_KNN = pd.DataFrame(df_KNN, columns = df.columns)
df_KNN.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,1.0,69.503512,0.0,24.607049,40.636241,5672.764336
1,1.0,70.139632,0.0,23.5138,66.674258,5631.767518
2,1.0,68.153641,0.0,25.660212,35.675341,5712.257932
3,1.0,70.917129,0.0,28.191213,67.721565,5723.326423
4,1.0,68.186725,1.0,19.571829,58.310837,5483.943573


In [14]:
cateogry_columns # list of categorical columns
encoder_columns # list of particular encoder object

for index, cat_column in enumerate(cateogry_columns):
    # inverting number to text
    df_KNN[cat_column] = encoder_columns[index].inverse_transform([int(x) for x in df_KNN[cat_column].tolist()]) # convert float to int
    
df_KNN.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,Male,69.503512,Non-smoker,24.607049,40.636241,5672.764336
1,Male,70.139632,Non-smoker,23.5138,66.674258,5631.767518
2,Male,68.153641,Non-smoker,25.660212,35.675341,5712.257932
3,Male,70.917129,Non-smoker,28.191213,67.721565,5723.326423
4,Male,68.186725,Smoker,19.571829,58.310837,5483.943573


In [15]:
# saving final df
df_KNN.to_csv('df_KNN.csv',index=False)
# re read final df
dfk = pd.read_csv("df_KNN.csv")
dfk.head()

Unnamed: 0,Gender,Height,Smoker,Exercise,Age,Lung Capacity
0,Male,69.503512,Non-smoker,24.607049,40.636241,5672.764336
1,Male,70.139632,Non-smoker,23.5138,66.674258,5631.767518
2,Male,68.153641,Non-smoker,25.660212,35.675341,5712.257932
3,Male,70.917129,Non-smoker,28.191213,67.721565,5723.326423
4,Male,68.186725,Smoker,19.571829,58.310837,5483.943573
