In [3]:
import pandas as pd

df = pd.read_csv("..\dataset\preprocessed_dataset_full.csv", index_col=0)

In [4]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [5]:
# check for missing values
df.isna().sum() 

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       72
wbcc     106
rbcc     131
htn        2
dm         3
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [6]:
# From documentation of the dataset
NumericalColumns = ['age','bp','bgr','bu','sc','sod','pot','pcv','hemo','wbcc','rbcc',]
CategoricalColumns = ['al','su','rbc','sg','pc','pcc','ba','htn','dm','cad','appet','pe','ane']

In [7]:
df[CategoricalColumns] = df[CategoricalColumns].astype("object")

df.describe(include="object")

Unnamed: 0,sg,al,su,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
count,353.0,354.0,351.0,248,335,396,396,398,397,398,399,399,399,400
unique,5.0,6.0,6.0,2,2,2,2,2,2,2,3,3,2,2
top,1.02,0.0,0.0,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
freq,106.0,199.0,290.0,201,259,354,374,251,260,364,316,322,339,250


In [8]:
df[CategoricalColumns].isna().sum() 

al        46
su        49
rbc      152
sg        47
pc        65
pcc        4
ba         4
htn        2
dm         3
cad        2
appet      1
pe         1
ane        1
dtype: int64

In [9]:
# We could use different techniques to fill NaNs
# * most frequent values for categorical and mean for numerical (most reasonable in our case) [rbc can be problematic, a lot of NaNs]
# * delete rows with missing values (if we do that we will lose a lot of data)
# * we could create generative model to fill missing values (not enough data to train that model with)

In [10]:
for columnName in CategoricalColumns:
    df[columnName].fillna(df[columnName].mode()[0], inplace=True)

In [11]:
df[NumericalColumns] = df[NumericalColumns].apply(pd.to_numeric)

for columnName in NumericalColumns:
    df[columnName].fillna(df[columnName].mean(), inplace=True)

In [12]:
df.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,51.483376,76.469072,1.017712,0.9,0.395,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437,38.871951,8406.122449,4.707435
std,16.974966,13.476298,0.005434,1.31313,1.040038,74.782634,49.285887,5.61749,9.204273,2.819783,2.716171,8.148469,2523.219976,0.840314
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875,34.0,6975.0,4.5
50%,54.0,78.234536,1.02,0.0,0.0,126.0,44.0,1.4,137.528754,4.627244,12.526437,38.871951,8406.122449,4.707435
75%,64.0,80.0,1.02,2.0,0.0,150.0,61.75,3.072454,141.0,4.8,14.625,44.0,9400.0,5.1
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [13]:
# What we can see is different scale of all the numerical columns, we will need to fix that with scaling

In [14]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,4.707435,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.707435,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [15]:
df = pd.get_dummies(df , columns=CategoricalColumns , prefix=CategoricalColumns , drop_first=True)
df.shape

(400, 38)

In [16]:
# So we have 38 instead of 25 columns (it is menagable amount). 
# If there would be more categorical features, we could try different encoding techniques

In [17]:
import numpy as np
df[df['class'] == np.nan]

Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,...,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_no,appet_poor,pe_no,pe_yes,ane_yes


In [18]:
df['class'].replace(["ckd","notckd"],[1,0], inplace=True)

In [19]:
df['class'].value_counts()

1    250
0    150
Name: class, dtype: int64

In [20]:
# Simple modeling

X = df.loc[:, df.columns != 'class']
y = df['class']

In [21]:
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()
min_max.fit(X)
X = min_max.transform(X)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score , train_test_split
from sklearn.metrics import classification_report

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1990)

In [91]:
rfc = RandomForestClassifier(n_estimators=5, criterion='gini', max_depth=3, 
                             class_weight={0: len(df[df['class'] == 0]), 1: len(df[df['class'] == 1])})
cv_v = cross_val_score(rfc, X_train, y_train, cv=4)
print(cv_v)
print(cv_v.std())
print(cv_v.mean())

[0.96       0.97333333 0.94666667 0.97333333]
0.011055415967851362
0.9633333333333334


In [92]:
rfc.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 150, 1: 250}, max_depth=3,
                       n_estimators=5)

In [93]:
y_pred = rfc.predict(X_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.94      0.97      0.96        33
           1       0.98      0.97      0.98        67

    accuracy                           0.97       100
   macro avg       0.96      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



In [99]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 4, stop = 10, num=5)]

# Maximum number of levels in tree
max_depth = [2,3,4,5]

# Criterion of tree creation
criterion = ['gini', 'entropy']

# Weight of classes
class_weight = ["balanced", {0: len(df[df['class'] == 0]), 1: len(df[df['class'] == 1])}]

In [100]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'criterion': criterion,
               'class_weight': class_weight
             }
print(param_grid)

{'n_estimators': [4, 5, 7, 8, 10], 'max_depth': [2, 3, 4, 5], 'criterion': ['gini', 'entropy'], 'class_weight': ['balanced', {0: 150, 1: 250}]}


In [101]:
from sklearn.model_selection import RandomizedSearchCV

rf_Model = RandomForestClassifier()
rf_RandomGrid = RandomizedSearchCV(estimator = rf_Model, param_distributions = param_grid, cv = 4, verbose=2, n_jobs = 4)
rf_RandomGrid.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(), n_jobs=4,
                   param_distributions={'class_weight': ['balanced',
                                                         {0: 150, 1: 250}],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [2, 3, 4, 5],
                                        'n_estimators': [4, 5, 7, 8, 10]},
                   verbose=2)

In [102]:
rf_RandomGrid.best_params_


{'n_estimators': 8,
 'max_depth': 5,
 'criterion': 'entropy',
 'class_weight': {0: 150, 1: 250}}

In [103]:
print (f'Train Accuracy - : {rf_RandomGrid.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_RandomGrid.score(X_test,y_test):.3f}')

Train Accuracy - : 0.993
Test Accuracy - : 1.000


In [105]:
y_pred = rf_RandomGrid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        67

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

