Importing the libraries

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warning pandas or scikit-learn would raise
# in later cells — consider a narrower filter.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

Data Collection And Processing

In [None]:
# Load the kidney-disease CSV into a pandas DataFrame.
# NOTE(review): the path is absolute and hard-coded ('/content/...' looks like a
# Colab working directory) — the notebook will not run elsewhere without editing it.
kidney_data = pd.read_csv('/content/kidney_disease.csv')

In [None]:
# Display the first 5 rows
kidney_data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# Summarise column names, non-null counts and dtypes of the raw data
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

Deleting the redundant attributes


In [None]:
# Remove the identifier and the attributes this analysis does not use.
# A single drop call validates every label before the frame is modified,
# so a missing column cannot leave the frame half-edited.
DROPPED_COLUMNS = ['id', 'sg', 'su', 'ba', 'bgr', 'bu',
                   'sod', 'pot', 'htn', 'dm', 'cad', 'pe']
kidney_data.drop(columns=DROPPED_COLUMNS, inplace=True)

In [None]:
# Re-check the schema after the column removal
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   al              354 non-null    float64
 3   rbc             248 non-null    object 
 4   pc              335 non-null    object 
 5   pcc             396 non-null    object 
 6   sc              383 non-null    float64
 7   hemo            348 non-null    float64
 8   pcv             330 non-null    object 
 9   wc              295 non-null    object 
 10  rc              270 non-null    object 
 11  appet           399 non-null    object 
 12  ane             399 non-null    object 
 13  classification  400 non-null    object 
dtypes: float64(5), object(9)
memory usage: 43.9+ KB


Checking the null values

In [None]:
# Missing-value count for each remaining column
kidney_data.isna().sum()

age                 9
bp                 12
al                 46
rbc               152
pc                 65
pcc                 4
sc                 17
hemo               52
pcv                70
wc                105
rc                130
appet               1
ane                 1
classification      0
dtype: int64

Imputing the Null values

In [None]:
from sklearn.impute import SimpleImputer

# Fill every missing value with the most frequent value of its column.
# NOTE(review): the imputer is fitted on the full dataset, before the train/test
# split, so test rows influence the fill values; the mode is also a coarse choice
# for what look like continuous measurements ('sc', 'hemo') — confirm intent.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# fit_transform returns a plain array, so rebuild the frame with the original
# column names and cast each column back to its original dtype; without the cast
# the float columns (age, bp, al, sc, hemo) would be left as generic 'object'.
kidney_data_imputed = pd.DataFrame(
    imp_mode.fit_transform(kidney_data),
    columns=kidney_data.columns,
).astype(kidney_data.dtypes.to_dict())
kidney_data_imputed



Unnamed: 0,age,bp,al,rbc,pc,pcc,sc,hemo,pcv,wc,rc,appet,ane,classification
0,48.0,80.0,1.0,normal,normal,notpresent,1.2,15.4,44,7800,5.2,good,no,ckd
1,7.0,50.0,4.0,normal,normal,notpresent,0.8,11.3,38,6000,5.2,good,no,ckd
2,62.0,80.0,2.0,normal,normal,notpresent,1.8,9.6,31,7500,5.2,poor,yes,ckd
3,48.0,70.0,4.0,normal,abnormal,present,3.8,11.2,32,6700,3.9,poor,yes,ckd
4,51.0,80.0,2.0,normal,normal,notpresent,1.4,11.6,35,7300,4.6,good,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,0.0,normal,normal,notpresent,0.5,15.7,47,6700,4.9,good,no,notckd
396,42.0,70.0,0.0,normal,normal,notpresent,1.2,16.5,54,7800,6.2,good,no,notckd
397,12.0,80.0,0.0,normal,normal,notpresent,0.6,15.8,49,6600,5.4,good,no,notckd
398,17.0,60.0,0.0,normal,normal,notpresent,1.0,14.2,51,7200,5.9,good,no,notckd


In [None]:
# Confirm that imputation left no missing values in any column
kidney_data_imputed.isna().sum()

age               0
bp                0
al                0
rbc               0
pc                0
pcc               0
sc                0
hemo              0
pcv               0
wc                0
rc                0
appet             0
ane               0
classification    0
dtype: int64

In [None]:
# Remove the rows labelled 37 and 230.
# NOTE(review): the reason these two rows are excluded is not recorded in the
# notebook — document it (or derive the rows from a condition) before reuse.
# `index=` already selects row labels, so no `axis` argument is needed
# (an `axis=1` alongside it would be contradictory).
kidney_data_imputed.drop(index=[37, 230], inplace=True)

Label Encoding of Categorical values to Numeric Values

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is shared by all encoding cells; it is re-fitted per column.
le = LabelEncoder()

# Encode 'rbc' as integer codes. pop() removes the text column and the
# assignment re-adds it as the last column of the frame.
label = le.fit_transform(kidney_data_imputed.pop('rbc'))
kidney_data_imputed["rbc"] = label



In [None]:
# Encode 'pc' as integer codes; the column is removed and re-added at the end.
label = le.fit_transform(kidney_data_imputed.pop('pc'))
kidney_data_imputed["pc"] = label

In [None]:
# Encode 'pcc' as integer codes; the column is removed and re-added at the end.
label = le.fit_transform(kidney_data_imputed.pop('pcc'))
kidney_data_imputed["pcc"] = label

In [None]:
# Encode 'pcv' as integer codes; the column is removed and re-added at the end.
# NOTE(review): 'pcv' is an object column whose values look numeric (e.g. 44, 38);
# LabelEncoder replaces them with category codes, not the measured values —
# confirm this is intended rather than a numeric conversion.
label = le.fit_transform(kidney_data_imputed.pop('pcv'))
kidney_data_imputed["pcv"] = label

In [None]:
# Encode 'wc' as integer codes; the column is removed and re-added at the end.
# NOTE(review): 'wc' is an object column whose values look numeric (e.g. 7800, 6000);
# LabelEncoder replaces them with category codes, not the measured values —
# confirm this is intended rather than a numeric conversion.
label = le.fit_transform(kidney_data_imputed.pop('wc'))
kidney_data_imputed["wc"] = label

In [None]:
# Encode 'rc' as integer codes; the column is removed and re-added at the end.
# NOTE(review): 'rc' is an object column whose values look numeric (e.g. 5.2, 3.9);
# LabelEncoder replaces them with category codes, not the measured values —
# confirm this is intended rather than a numeric conversion.
label = le.fit_transform(kidney_data_imputed.pop('rc'))
kidney_data_imputed["rc"] = label

In [None]:
# Encode 'appet' as integer codes; the column is removed and re-added at the end.
label = le.fit_transform(kidney_data_imputed.pop('appet'))
kidney_data_imputed["appet"] = label

In [None]:
# Encode 'ane' as integer codes; the column is removed and re-added at the end.
label = le.fit_transform(kidney_data_imputed.pop('ane'))
kidney_data_imputed["ane"] = label

In [None]:
# Encode the target as integer codes and place it last in the frame.
# In this notebook's outputs the 'ckd' rows come out as 0 and 'notckd' rows as 1.
label = le.fit_transform(kidney_data_imputed.pop('classification'))
kidney_data_imputed["classification"] = label

In [None]:
# Class distribution of the encoded target. value_counts must be called;
# the bare attribute only displays the bound method, not the counts.
kidney_data_imputed['classification'].value_counts()

<bound method IndexOpsMixin.value_counts of 0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification, Length: 398, dtype: int64>

0 represents a defective kidney (ckd); 1 represents a healthy kidney (notckd).

Splitting the features and target

In [None]:
# Target: the encoded class label. Features: every other column.
Y = kidney_data_imputed['classification']
X = kidney_data_imputed.drop(columns=['classification'])

In [None]:
# Show the feature matrix as plain text
print(X)

      age    bp   al   sc  hemo  rbc  pc  pcc  pcv  wc  rc  appet  ane
0    48.0  80.0  1.0  1.2  15.4    1   1    0   32  72  34      0    0
1     7.0  50.0  4.0  0.8  11.3    1   1    0   26  56  34      0    0
2    62.0  80.0  2.0  1.8   9.6    1   1    0   19  70  34      1    1
3    48.0  70.0  4.0  3.8  11.2    1   0    1   20  62  19      1    1
4    51.0  80.0  2.0  1.4  11.6    1   1    0   23  68  27      0    0
..    ...   ...  ...  ...   ...  ...  ..  ...  ...  ..  ..    ...  ...
395  55.0  80.0  0.0  0.5  15.7    1   1    0   35  62  30      0    0
396  42.0  70.0  0.0  1.2  16.5    1   1    0   42  72  44      0    0
397  12.0  80.0  0.0  0.6  15.8    1   1    0   37  61  36      0    0
398  17.0  60.0  0.0  1.0  14.2    1   1    0   39  67  41      0    0
399  58.0  80.0  0.0  1.1  15.8    1   1    0   41  63  43      0    0

[398 rows x 13 columns]


In [None]:
# Write the feature matrix to CSV, leaving out the row index
X.to_csv('Features_Kidney.csv',index=False)

In [None]:
# Show the encoded target series as plain text
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification, Length: 398, dtype: int64


In [None]:
# Write the encoded target to CSV, leaving out the row index
Y.to_csv('Target_Kidney.csv',index=False)

Splitting into training and test data

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class ratio, and fix the seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full, training and test feature matrices
print(X.shape,X_train.shape,X_test.shape)

(398, 13) (318, 13) (80, 13)


In [None]:
# Shapes of the full, training and test target vectors
print(Y.shape,Y_train.shape,Y_test.shape)

(398,) (318,) (80,)


In [None]:
# Class balance in the training split
Y_train.value_counts()

0    198
1    120
Name: classification, dtype: int64

In [None]:
# Class balance in the test split
Y_test.value_counts()

0    50
1    30
Name: classification, dtype: int64

Model Training

K-Nearest-Neighbors

In [None]:
# K-nearest-neighbours classifier with scikit-learn's default settings
knn=KNeighborsClassifier()

In [None]:
# Train the KNN model on the training split.
# NOTE(review): the features are passed unscaled (MinMaxScaler is imported but not
# applied in the cells shown); distance-based models are sensitive to feature scale.
knn.fit(X_train, Y_train)

Model Evaluation (KNN)

Accuracy Score

In [None]:
# Accuracy on training data.
# accuracy_score expects (y_true, y_pred); the true labels go first so the call
# follows its documented signature, like the classification_report calls do.
X_train_pred_knn = knn.predict(X_train)
train_accuracy_knn = accuracy_score(Y_train, X_train_pred_knn)


In [None]:
# Report KNN accuracy on the training split
print('Accuracy on Training data :', train_accuracy_knn)

Accuracy on Training data : 0.9245283018867925


In [None]:
# Accuracy on test data.
# accuracy_score expects (y_true, y_pred); the true labels go first so the call
# follows its documented signature, like the classification_report calls do.
X_test_pred_knn = knn.predict(X_test)
test_accuracy_knn = accuracy_score(Y_test, X_test_pred_knn)


In [None]:
# Report KNN accuracy on the test split
print('Accuracy on Test data :', test_accuracy_knn)

Accuracy on Test data : 0.8875


Metrics for KNN

In [None]:
# Per-class precision, recall and F1 for KNN on the test split
print(classification_report(Y_test, X_test_pred_knn))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91        50
           1       0.84      0.87      0.85        30

    accuracy                           0.89        80
   macro avg       0.88      0.88      0.88        80
weighted avg       0.89      0.89      0.89        80



Predictive system

In [None]:
# One record in the column order of X, already encoded like the training
# features (categorical columns as integer codes). These values match row 397
# of X as printed earlier in the notebook.
input_data=(12.0,80.0,0.0,0.6,15.8,1,1,0,37,61,36,0,0)

# Convert to an array and shape it as a single-row batch for predict()
input_data_as_numpy_array=np.array(input_data)
input_data_reshaped=input_data_as_numpy_array[np.newaxis, :]
prediction=knn.predict(input_data_reshaped)
print(prediction)

# Class 0 is the disease class; any other value is reported as healthy
if prediction[0] != 0:
  print('The person does not have chronic kidney disease')
else:
  print('The person has chronic kidney disease')


[1]
The person does not have chronic kidney disease


Saving the trained model (KNN)

In [None]:
# Serialise the trained KNN model to disk. The context manager closes the
# file handle (flushing the data) even if pickling fails.
filename = 'kidney_disease_model_knn.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(knn, model_file)

In [None]:
# Load the saved KNN model back from disk; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open('kidney_disease_model_knn.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# List the feature names, one per line, in the column order of X
for feature_name in X.columns:
  print(feature_name)

age
bp
al
sc
hemo
rbc
pc
pcc
pcv
wc
rc
appet
ane


Logistic Regression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings
lr = LogisticRegression()

In [None]:
# Train the logistic-regression model on the training split.
# NOTE(review): all warnings are suppressed at the top of the notebook, so a
# convergence warning raised here would go unnoticed — verify via lr.n_iter_.
lr.fit(X_train, Y_train)

Model Evaluation (Logistic Regression)

Accuracy Score

In [None]:
# Accuracy on training data.
# accuracy_score expects (y_true, y_pred); the true labels go first so the call
# follows its documented signature, like the classification_report calls do.
X_train_pred_lr = lr.predict(X_train)
train_accuracy_lr = accuracy_score(Y_train, X_train_pred_lr)

In [None]:
# Report logistic-regression accuracy on the training split
print('Accuracy on training data with LR :', train_accuracy_lr)

Accuracy on training data with LR : 0.9748427672955975


In [None]:
# Accuracy on test data.
# accuracy_score expects (y_true, y_pred); the true labels go first so the call
# follows its documented signature, like the classification_report calls do.
X_test_pred_lr = lr.predict(X_test)
test_accuracy_lr = accuracy_score(Y_test, X_test_pred_lr)

In [None]:
# Report logistic-regression accuracy on the test split
print('Accuracy on test data :', test_accuracy_lr)

Accuracy on test data : 0.975


Metrics for Logistic Regression

In [None]:
# Per-class precision, recall and F1 for logistic regression on the test split
print(classification_report(Y_test, X_test_pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        50
           1       0.94      1.00      0.97        30

    accuracy                           0.97        80
   macro avg       0.97      0.98      0.97        80
weighted avg       0.98      0.97      0.98        80



Saving trained model (LR)

In [None]:
# Serialise the trained logistic-regression model to disk. The context manager
# closes the file handle (flushing the data) even if pickling fails.
filename1 = 'kidney_disease_model_lr.sav'
with open(filename1, 'wb') as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Load the saved logistic-regression model; the context manager closes the file.
# This rebinds `loaded_model`, which previously held the reloaded KNN model.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open('kidney_disease_model_lr.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# List the feature names, one per line, in the column order of X
for feature_name in X.columns:
  print(feature_name)

age
bp
al
sc
hemo
rbc
pc
pcc
pcv
wc
rc
appet
ane


Hybrid model (Combining Logistic Regression and KNN using voting classifier)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble of the KNN and logistic-regression estimators:
# the predicted class is the argmax of the summed class probabilities.
estimators = [('knn', knn), ('lr', lr)]
ensemble = VotingClassifier(estimators=estimators, voting='soft')
ensemble.fit(X=X_train, y=Y_train)

# Score the ensemble on the held-out split
pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=pred)
print("Accuracy:", accuracy)

Accuracy: 0.9375


Metrics for Hybrid model

In [None]:
# Per-class precision, recall and F1 for the voting ensemble on the test split
print(classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95        50
           1       0.93      0.90      0.92        30

    accuracy                           0.94        80
   macro avg       0.94      0.93      0.93        80
weighted avg       0.94      0.94      0.94        80



Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Try 200 random seeds and keep the forest that scores best on the test split.
# NOTE(review): choosing the seed by test accuracy tunes the model on the test
# set, so the reported score is optimistic — prefer a fixed seed, or select on
# a validation split / cross-validation of the training data.
max_accuracy = 0
best_x = None       # seed of the best forest so far
best_rf = None      # the fitted forest for that seed
best_pred = None    # its predictions on X_test

for x in range(200):
    candidate = RandomForestClassifier(random_state=x)
    candidate.fit(X_train, Y_train)
    candidate_pred = candidate.predict(X_test)
    # accuracy_score takes (y_true, y_pred); expressed as a percentage, 2 decimals
    current_accuracy = round(accuracy_score(Y_test, candidate_pred) * 100, 2)
    # Strict '>' keeps the lowest seed among ties; the 'is None' guard records a
    # winner even if every accuracy is 0, so best_x is always bound after the loop.
    if best_x is None or current_accuracy > max_accuracy:
        max_accuracy = current_accuracy
        best_x = x
        best_rf = candidate
        best_pred = candidate_pred

print(max_accuracy)
print(best_x)

# Reuse the winning forest and its predictions instead of training it again;
# with the same seed and data a refit would reproduce the same model.
rf = best_rf
X_test_pred_rf = best_pred

95.0
0


Model Evaluation (Random Forest)

Accuracy Score

In [None]:
# Accuracy on test data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
test_accuracy_rf = accuracy_score(Y_test, X_test_pred_rf)
print('Accuracy on test data :', test_accuracy_rf)

Accuracy on test data : 0.95


Metrics for Random Forest

In [None]:
# Per-class precision, recall and F1 for the random forest on the test split
print(classification_report(Y_test, X_test_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        50
           1       0.93      0.93      0.93        30

    accuracy                           0.95        80
   macro avg       0.95      0.95      0.95        80
weighted avg       0.95      0.95      0.95        80

