# Data imputation

### Find rows and columns containing NaN

In [1]:
import pandas as pd

# Load the complete dataset from disk.
dataset = pd.read_csv("datasets/complete_dataset.csv")

# Keep only the rows in which at least one column is NaN.
has_nan = dataset.isna().any(axis=1)
nan_values = dataset.loc[has_nan]
nan_values

Unnamed: 0,ComponentName,nameProject,M_CBO,M_CYCLO,M_DIT,M_ELOC,M_FanIn,M_FanIn_1,M_LCOM,M_LOC,...,M_WLOCNAMM,M_WMC,M_WMCNAMM,M_TextualCohesion,M_TextualEntropy,ComplexClass,LargeClass,LazyClass,RefusedBequest,SpaghettiCode
156,org.apache.tools.ant.taskdefs.optional.rjunit....,ant-rel-1.6.0,4,0,1,55,8,0,0,55,...,,0,0,1.0,0.813623,0,0,0,0,0
182,.Task1,ant-rel-1.6.0,1,0,2,2,2,0,0,2,...,,0,0,1.0,1.000000,0,0,0,0,0
184,.A,ant-rel-1.6.0,0,0,2,3,14288,0,0,3,...,,0,0,1.0,0.864974,0,0,0,0,0
185,.B,ant-rel-1.6.0,0,0,2,2,9711,0,0,2,...,,0,0,1.0,1.000000,0,0,0,0,0
186,.C,ant-rel-1.6.0,0,0,1,2,15494,0,0,2,...,,0,0,1.0,0.000000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73846,org.apache.xml.serialize.Method,xerces-Xerces-J_1_4_2,0,0,1,27,104,0,0,27,...,,0,0,1.0,0.927032,0,0,0,0,0
73857,org.w3c.dom.CDATASection,xerces-Xerces-J_1_4_2,0,0,1,5,68,0,0,5,...,,0,0,1.0,0.893543,0,0,0,0,0
73859,org.w3c.dom.Comment,xerces-Xerces-J_1_4_2,0,0,1,5,143,0,0,5,...,,0,0,1.0,0.949611,0,0,0,0,0
73863,org.w3c.dom.DocumentFragment,xerces-Xerces-J_1_4_2,0,0,1,5,125,0,0,5,...,,0,0,1.0,0.851116,0,0,0,0,0


In [2]:
# Number of missing entries in each column.
nan_count = dataset.isna().sum()

# Boolean mask: True for every column that has at least one missing entry.
is_null = nan_count.gt(0)
nan_count.loc[is_null]

M_WLOCNAMM           1352
M_TextualCohesion       1
M_TextualEntropy        8
dtype: int64

### Replace NaN with KNN imputation

In [3]:
import sys
from impyute.imputation.cs import fast_knn

# Raise the Python interpreter's recursion limit (this is an interpreter
# setting, not an OS one).
# NOTE(review): presumably required because fast_knn recurses deeply on a
# dataset of this size - confirm. A limit that is too high can crash the
# interpreter instead of raising RecursionError.
sys.setrecursionlimit(100000)

# Feature columns: everything after the two leading identifier columns
# (ComponentName, nameProject) and before the five trailing target columns.
X = dataset.iloc[:, 2:-5]
features_name = X.columns.values.tolist()

# The five target columns.
y = dataset.iloc[:, -5:]

# Run the KNN imputation on the raw feature matrix (k=30 neighbours).
imputed_training = fast_knn(X.to_numpy(), k=30)

# Rebuild a labelled frame. Reusing X's index keeps every imputed row attached
# to its original row label, so index-aligned operations such as
# pd.concat(..., axis=1) with other slices of `dataset` cannot silently
# misalign rows when `dataset` does not carry a default RangeIndex.
imputed_dataset = pd.DataFrame(
    imputed_training,
    columns=features_name,
    index=X.index,
)
imputed_dataset

Unnamed: 0,M_CBO,M_CYCLO,M_DIT,M_ELOC,M_FanIn,M_FanIn_1,M_LCOM,M_LOC,M_LOCNAMM,M_NOA,...,M_NOM,M_NOMNAMM,M_NOPA,M_PMMM,M_PRB,M_WLOCNAMM,M_WMC,M_WMCNAMM,M_TextualCohesion,M_TextualEntropy
0,12.0,191.0,1.0,889.0,61.0,141.0,1922.0,889.0,889.0,9.0,...,64.0,64.0,0.0,0.125000,0.0,13.453125,191.0,0.0,0.135878,0.823630
1,12.0,5.0,1.0,14.0,5.0,0.0,0.0,14.0,14.0,0.0,...,1.0,1.0,0.0,0.000000,0.0,8.000000,5.0,0.0,1.000000,0.931994
2,31.0,156.0,2.0,1606.0,87.0,167.0,608.0,1606.0,1606.0,36.0,...,41.0,41.0,1.0,0.048780,0.0,20.365854,156.0,0.0,0.210269,0.814864
3,12.0,8.0,1.0,30.0,11.0,8.0,1.0,30.0,25.0,2.0,...,3.0,2.0,0.0,0.333333,0.0,8.500000,8.0,0.0,0.500000,0.890971
4,6.0,69.0,1.0,327.0,78.0,69.0,80.0,327.0,324.0,5.0,...,20.0,19.0,0.0,0.000000,0.0,15.947368,69.0,0.0,0.361105,0.832713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73905,0.0,29.0,1.0,70.0,6.0,0.0,325.0,70.0,70.0,0.0,...,26.0,26.0,0.0,0.000000,0.0,2.500000,29.0,0.0,0.163969,0.881089
73906,0.0,54.0,1.0,109.0,5.0,0.0,703.0,109.0,109.0,0.0,...,38.0,38.0,0.0,0.000000,0.0,2.736842,54.0,0.0,0.155533,0.836960
73907,0.0,5.0,1.0,14.0,5.0,0.0,3.0,14.0,14.0,0.0,...,3.0,3.0,0.0,0.000000,0.0,3.000000,5.0,0.0,0.216667,0.921425
73908,0.0,4.0,1.0,15.0,5.0,0.0,6.0,15.0,15.0,0.0,...,4.0,4.0,0.0,0.000000,0.0,2.500000,4.0,0.0,0.246667,0.939300


In [4]:
# Identifier columns: the first two columns of the original dataset.
df_left = dataset.iloc[:, :2]

# Reassemble the frame column-wise in one step:
# identifiers, then imputed features, then the target columns.
result = pd.concat([df_left, imputed_dataset, y], axis=1)

# Persist the imputed dataset without writing the index as a column.
result.to_csv('datasets/imputed_dataset.csv', index=False)
result

Unnamed: 0,ComponentName,nameProject,M_CBO,M_CYCLO,M_DIT,M_ELOC,M_FanIn,M_FanIn_1,M_LCOM,M_LOC,...,M_WLOCNAMM,M_WMC,M_WMCNAMM,M_TextualCohesion,M_TextualEntropy,ComplexClass,LargeClass,LazyClass,RefusedBequest,SpaghettiCode
0,org.apache.tools.ant.IntrospectionHelper,ant-rel-1.6.0,12.0,191.0,1.0,889.0,61.0,141.0,1922.0,889.0,...,13.453125,191.0,0.0,0.135878,0.823630,1,0,0,0,0
1,org.apache.tools.ant.ProjectComponentFactory,ant-rel-1.6.0,12.0,5.0,1.0,14.0,5.0,0.0,0.0,14.0,...,8.000000,5.0,0.0,1.000000,0.931994,0,0,0,0,0
2,org.apache.tools.ant.taskdefs.optional.net.FTP,ant-rel-1.6.0,31.0,156.0,2.0,1606.0,87.0,167.0,608.0,1606.0,...,20.365854,156.0,0.0,0.210269,0.814864,1,0,0,0,0
3,org.apache.tools.ant.ProjectComponentHelper,ant-rel-1.6.0,12.0,8.0,1.0,30.0,11.0,8.0,1.0,30.0,...,8.500000,8.0,0.0,0.500000,0.890971,0,0,0,0,0
4,org.apache.tools.ant.PropertyHelper,ant-rel-1.6.0,6.0,69.0,1.0,327.0,78.0,69.0,80.0,327.0,...,15.947368,69.0,0.0,0.361105,0.832713,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73905,org.w3c.dom.html.HTMLImageElement,xerces-Xerces-J_1_4_2,0.0,29.0,1.0,70.0,6.0,0.0,325.0,70.0,...,2.500000,29.0,0.0,0.163969,0.881089,0,0,0,0,0
73906,org.w3c.dom.html.HTMLInputElement,xerces-Xerces-J_1_4_2,0.0,54.0,1.0,109.0,5.0,0.0,703.0,109.0,...,2.736842,54.0,0.0,0.155533,0.836960,0,0,0,0,0
73907,org.w3c.dom.html.HTMLIsIndexElement,xerces-Xerces-J_1_4_2,0.0,5.0,1.0,14.0,5.0,0.0,3.0,14.0,...,3.000000,5.0,0.0,0.216667,0.921425,0,0,0,0,0
73908,org.w3c.dom.html.HTMLLIElement,xerces-Xerces-J_1_4_2,0.0,4.0,1.0,15.0,5.0,0.0,6.0,15.0,...,2.500000,4.0,0.0,0.246667,0.939300,0,0,0,0,0
