Importing the libraries

In [36]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt 
import seaborn as sns

Data Collection And Processing

In [37]:
# loading a csv data to a pandas dataframe
kidney_data = pd.read_csv('/content/kidney_disease.csv')

In [38]:
# print first first 5 rows
kidney_data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [39]:
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

Deleting the redundant attributes


In [40]:
kidney_data.drop('id', inplace=True, axis=1)
kidney_data.drop('sg', inplace=True, axis=1)
kidney_data.drop('su', inplace=True, axis=1)
kidney_data.drop('ba', inplace=True, axis=1)
kidney_data.drop('bgr', inplace=True, axis=1)
kidney_data.drop('bu', inplace=True, axis=1)
kidney_data.drop('sod', inplace=True, axis=1)
kidney_data.drop('pot', inplace=True, axis=1)
kidney_data.drop('htn', inplace=True, axis=1)
kidney_data.drop('dm', inplace=True, axis=1)
kidney_data.drop('cad', inplace=True, axis=1)
kidney_data.drop('pe', inplace=True, axis=1)

In [41]:
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   al              354 non-null    float64
 3   rbc             248 non-null    object 
 4   pc              335 non-null    object 
 5   pcc             396 non-null    object 
 6   sc              383 non-null    float64
 7   hemo            348 non-null    float64
 8   pcv             330 non-null    object 
 9   wc              295 non-null    object 
 10  rc              270 non-null    object 
 11  appet           399 non-null    object 
 12  ane             399 non-null    object 
 13  classification  400 non-null    object 
dtypes: float64(5), object(9)
memory usage: 43.9+ KB


Checking the null values

In [42]:
kidney_data.isnull().sum()

age                 9
bp                 12
al                 46
rbc               152
pc                 65
pcc                 4
sc                 17
hemo               52
pcv                70
wc                105
rc                130
appet               1
ane                 1
classification      0
dtype: int64

Imputing the Null values

In [43]:
from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan,strategy ='most_frequent')

kidney_data_imputed=pd.DataFrame(imp_mode.fit_transform(kidney_data))
kidney_data_imputed.columns=kidney_data.columns
kidney_data_imputed



Unnamed: 0,age,bp,al,rbc,pc,pcc,sc,hemo,pcv,wc,rc,appet,ane,classification
0,48.0,80.0,1.0,normal,normal,notpresent,1.2,15.4,44,7800,5.2,good,no,ckd
1,7.0,50.0,4.0,normal,normal,notpresent,0.8,11.3,38,6000,5.2,good,no,ckd
2,62.0,80.0,2.0,normal,normal,notpresent,1.8,9.6,31,7500,5.2,poor,yes,ckd
3,48.0,70.0,4.0,normal,abnormal,present,3.8,11.2,32,6700,3.9,poor,yes,ckd
4,51.0,80.0,2.0,normal,normal,notpresent,1.4,11.6,35,7300,4.6,good,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,0.0,normal,normal,notpresent,0.5,15.7,47,6700,4.9,good,no,notckd
396,42.0,70.0,0.0,normal,normal,notpresent,1.2,16.5,54,7800,6.2,good,no,notckd
397,12.0,80.0,0.0,normal,normal,notpresent,0.6,15.8,49,6600,5.4,good,no,notckd
398,17.0,60.0,0.0,normal,normal,notpresent,1.0,14.2,51,7200,5.9,good,no,notckd


In [44]:
kidney_data_imputed.isnull().sum()

age               0
bp                0
al                0
rbc               0
pc                0
pcc               0
sc                0
hemo              0
pcv               0
wc                0
rc                0
appet             0
ane               0
classification    0
dtype: int64

In [45]:
kidney_data_imputed.drop(index=[37,230],axis=1,inplace=True)

Label Encoding of Categorical values to Numeric Values

In [46]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = le.fit_transform(kidney_data_imputed['rbc'])
kidney_data_imputed.drop("rbc", axis=1, inplace=True)
kidney_data_imputed["rbc"] = label



In [47]:
label = le.fit_transform(kidney_data_imputed['pc'])
kidney_data_imputed.drop("pc", axis=1, inplace=True)
kidney_data_imputed["pc"] = label

In [48]:
label = le.fit_transform(kidney_data_imputed['pcc'])
kidney_data_imputed.drop("pcc", axis=1, inplace=True)
kidney_data_imputed["pcc"] = label

In [49]:
label = le.fit_transform(kidney_data_imputed['pcv'])
kidney_data_imputed.drop("pcv", axis=1, inplace=True)
kidney_data_imputed["pcv"] = label

In [50]:
label = le.fit_transform(kidney_data_imputed['wc'])
kidney_data_imputed.drop("wc", axis=1, inplace=True)
kidney_data_imputed["wc"] = label

In [51]:
label = le.fit_transform(kidney_data_imputed['rc'])
kidney_data_imputed.drop("rc", axis=1, inplace=True)
kidney_data_imputed["rc"] = label

In [52]:
label = le.fit_transform(kidney_data_imputed['appet'])
kidney_data_imputed.drop("appet", axis=1, inplace=True)
kidney_data_imputed["appet"] = label

In [53]:
label = le.fit_transform(kidney_data_imputed['ane'])
kidney_data_imputed.drop("ane", axis=1, inplace=True)
kidney_data_imputed["ane"] = label

In [54]:
label = le.fit_transform(kidney_data_imputed['classification'])
kidney_data_imputed.drop("classification", axis=1, inplace=True)
kidney_data_imputed["classification"] = label

In [55]:
kidney_data_imputed['classification'].value_counts

<bound method IndexOpsMixin.value_counts of 0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification, Length: 398, dtype: int64>

0 represents defective kidney 1 represents healthy kidney

Splitting the features and target

In [56]:
X = kidney_data_imputed.drop(columns='classification',axis=1)
Y = kidney_data_imputed['classification']

In [57]:
print(X)

      age    bp   al   sc  hemo  rbc  pc  pcc  pcv  wc  rc  appet  ane
0    48.0  80.0  1.0  1.2  15.4    1   1    0   32  72  34      0    0
1     7.0  50.0  4.0  0.8  11.3    1   1    0   26  56  34      0    0
2    62.0  80.0  2.0  1.8   9.6    1   1    0   19  70  34      1    1
3    48.0  70.0  4.0  3.8  11.2    1   0    1   20  62  19      1    1
4    51.0  80.0  2.0  1.4  11.6    1   1    0   23  68  27      0    0
..    ...   ...  ...  ...   ...  ...  ..  ...  ...  ..  ..    ...  ...
395  55.0  80.0  0.0  0.5  15.7    1   1    0   35  62  30      0    0
396  42.0  70.0  0.0  1.2  16.5    1   1    0   42  72  44      0    0
397  12.0  80.0  0.0  0.6  15.8    1   1    0   37  61  36      0    0
398  17.0  60.0  0.0  1.0  14.2    1   1    0   39  67  41      0    0
399  58.0  80.0  0.0  1.1  15.8    1   1    0   41  63  43      0    0

[398 rows x 13 columns]


In [58]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification, Length: 398, dtype: int64


Splitting into training and test data

In [59]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [60]:
print(X.shape,X_train.shape,X_test.shape)

(398, 13) (318, 13) (80, 13)


In [61]:
print(Y.shape,Y_train.shape,Y_test.shape)

(398,) (318,) (80,)


In [62]:
Y_train.value_counts()

0    198
1    120
Name: classification, dtype: int64

In [63]:
Y_test.value_counts()

0    50
1    30
Name: classification, dtype: int64

Model Training

KNN

In [64]:
model=KNeighborsClassifier()

In [65]:
# training the KNN model with Training Data
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [66]:
# Accuracy on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)


In [67]:
print('Accuracy on Training data :', training_data_accuracy)

Accuracy on Training data : 0.9245283018867925


In [68]:
# Accuracy on test data
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)


In [69]:
print('Accuracy on Test data :', test_data_accuracy)

Accuracy on Test data : 0.8875


Predictive system

In [70]:
input_data=(12.0,80.0,0.0,0.6,15.8,1,1,0,37,61,36,0,0)
#input data into numpy array
input_data_as_numpy_array=np.asarray(input_data)
#reshape the numpy array as we are predicting one instance
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)
prediction=model.predict(input_data_reshaped)
print(prediction)

if(prediction[0]==0):
  print('The person has chronic kidney disease')
else:
  print('The person does not have chronic kidney disease')  


[1]
The person does not have chronic kidney disease


Saving the trained model

In [71]:
import pickle

In [72]:
filename = 'kidney_disease_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# loading the saved model
loaded_model = pickle.load(open('kidney_disease_model.sav', 'rb'))

In [73]:
for column in X.columns:
  print(column)

age
bp
al
sc
hemo
rbc
pc
pcc
pcv
wc
rc
appet
ane
