In [80]:
import pandas as pd

# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="preprocessed_dataset.csv",
    index_col=0,
)

In [81]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [82]:
# Count the missing (NaN) entries in every column.
df.isnull().sum()

age       11
bp        14
sg        49
al        48
su        51
rbc      154
pc        67
pcc        6
ba         6
bgr       46
bu        21
sc        19
sod       89
pot       90
hemo      54
pcv       74
wbcc     108
rbcc     133
htn        4
dm         5
cad        4
appet      3
pe         3
ane        3
class      2
dtype: int64

In [83]:
# Column groups, taken from the documentation of the dataset.
# The order of the names is preserved because later cells pass these lists
# straight to pandas (e.g. as `columns=` / `prefix=`).
NumericalColumns = [
    "age", "bp", "bgr", "bu", "sc", "sod",
    "pot", "pcv", "hemo", "wbcc", "rbcc",
]
CategoricalColumns = [
    "al", "su", "rbc", "sg", "pc", "pcc", "ba",
    "htn", "dm", "cad", "appet", "pe", "ane",
]

In [88]:
# Treat every categorical column as dtype "object" so that describe() below
# summarises it as a category (count / unique / top / freq).
df = df.astype(dict.fromkeys(CategoricalColumns, "object"))

# Summary of the object-typed columns only.
df.describe(include=["object"])

Unnamed: 0,sg,al,su,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
count,353.0,354.0,351.0,248,335,396,396,398,397,398,399,399,399,400
unique,5.0,6.0,6.0,2,2,2,2,2,2,2,3,3,2,2
top,1.02,0.0,0.0,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
freq,106.0,199.0,290.0,201,259,354,374,251,260,364,316,322,339,250


In [89]:
# Missing-value count restricted to the categorical columns.
df[CategoricalColumns].isnull().sum()

al        48
su        51
rbc      154
sg        49
pc        67
pcc        6
ba         6
htn        4
dm         5
cad        4
appet      3
pe         3
ane        3
dtype: int64

In [60]:
# We could use several techniques to fill the NaNs:
# * the most frequent value for categorical columns and the mean for numerical columns (the most reasonable option in our case) [rbc can be problematic because it has a lot of NaNs]
# * delete the rows with missing values (if we did that, we would lose a lot of data)
# * create a generative model to fill in the missing values (there is not enough data to train such a model)

In [61]:
# Impute every categorical column with its most frequent value (the first
# mode returned by pandas).
#
# The result is assigned back to the frame instead of calling
# `df[columnName].fillna(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which raises a FutureWarning on pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
for columnName in CategoricalColumns:
    df[columnName] = df[columnName].fillna(df[columnName].mode()[0])

In [62]:
# Make sure every numerical column really has a numeric dtype before
# computing means (pd.to_numeric raises on values it cannot parse).
df[NumericalColumns] = df[NumericalColumns].apply(pd.to_numeric)

# Impute every numerical column with its own mean.
#
# The result is assigned back to the frame instead of calling
# `df[columnName].fillna(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which raises a FutureWarning on pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
for columnName in NumericalColumns:
    df[columnName] = df[columnName].fillna(df[columnName].mean())

In [63]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly;
# these are the pandas defaults).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
count,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0
mean,51.483376,76.469072,1.017724,0.895522,0.393035,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437,38.871951,8406.122449,4.707435
std,16.932582,13.442649,0.005423,1.311385,1.037814,74.595911,49.162826,5.603464,9.181291,2.812742,2.709389,8.128123,2516.919791,0.838216
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,1.015,0.0,0.0,101.25,27.25,0.9,135.0,4.0,10.9,34.0,7000.0,4.5
50%,54.0,76.469072,1.02,0.0,0.0,127.0,44.5,1.4,137.528754,4.627244,12.526437,38.871951,8406.122449,4.707435
75%,64.0,80.0,1.02,2.0,0.0,150.0,60.75,3.072454,141.0,4.8,14.6,44.0,9375.0,5.1
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [None]:
# What we can see is that the numerical columns are on very different scales; we will need to fix that with scaling

In [50]:
# Preview the first five rows again to inspect the current state of the frame.
df.iloc[:5]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,4.707435,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.707435,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [51]:
# One-hot encode the categorical columns, prefixing each indicator column with
# the name of the column it came from. `drop_first=True` removes the first
# level of every encoded column, so k categories yield k-1 indicators.
df = pd.get_dummies(
    data=df,
    columns=CategoricalColumns,
    prefix=CategoricalColumns,
    drop_first=True,
)

# (rows, columns) after encoding.
df.shape

(402, 38)

In [52]:
# So we now have 38 columns instead of 25 (this is a manageable number).
# If there were more categorical features, we could try different encoding techniques

In [53]:
# Encode the target label as a number: "ckd" -> 1, "notckd" -> 0.
#
# The result is assigned back to the frame instead of calling
# `df['class'].replace(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which raises a FutureWarning on pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
#
# `.map` is used rather than `.replace` so the result gets a numeric dtype
# without relying on the deprecated silent downcasting of `replace`.
# Missing labels stay NaN; any label other than the two listed would also
# become NaN (the describe() output above reports exactly two unique labels).
df['class'] = df['class'].map({"ckd": 1, "notckd": 0})

In [54]:
# Class balance of the encoded target (NaN labels are excluded, as by default).
df["class"].value_counts(dropna=True)

1.0    250
0.0    150
Name: class, dtype: int64

Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,...,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_no,appet_poor,pe_no,pe_yes,ane_yes
count,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,...,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0
mean,51.483376,76.469072,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437,38.871951,8406.122449,...,0.104478,0.054726,0.365672,0.340796,0.084577,0.002488,0.20398,0.808458,0.189055,0.149254
std,16.932582,13.442649,74.595911,49.162826,5.603464,9.181291,2.812742,2.709389,8.128123,2516.919791,...,0.30626,0.227729,0.482218,0.474568,0.278598,0.049875,0.403456,0.394005,0.39204,0.356782
min,2.0,50.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.0,101.25,27.25,0.9,135.0,4.0,10.9,34.0,7000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,54.0,76.469072,127.0,44.5,1.4,137.528754,4.627244,12.526437,38.871951,8406.122449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,64.0,80.0,150.0,60.75,3.072454,141.0,4.8,14.6,44.0,9375.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,90.0,180.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
