Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [None]:
# Load the kidney-disease CSV into a pandas DataFrame.
# NOTE(review): '/content/...' is an absolute path (looks like a Google Colab
# working directory) — adjust it when running the notebook elsewhere.
kidney_data = pd.read_csv('/content/kidney.csv')

In [None]:
# preview the first 5 rows of the raw dataset (head() defaults to 5 rows)
kidney_data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# preview the last 5 rows of the raw dataset (tail() defaults to 5 rows)
kidney_data.tail()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
395,395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd
399,399,58.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,53,6800,6.1,no,no,no,good,no,no,notckd


In [None]:
# (rows, columns) of the dataset as loaded, i.e. still including the `id` column
kidney_data.shape

(400, 26)

In [None]:
# Drop the `id` column: it only numbers the rows and carries no predictive signal.
# Re-assigning the result (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run once `id` is already gone;
# the in-place version raises KeyError on a second execution.
kidney_data = kidney_data.drop(columns='id', errors='ignore')

In [None]:
# (rows, columns) again, to confirm the `id` column was removed
kidney_data.shape

(400, 25)

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
kidney_data.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [None]:
# per-column overview: name, non-null count and dtype
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [None]:
# count the missing (NaN) entries in every column
kidney_data.isna().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [None]:
# `pcv`, `wc` and `rc` are loaded with object dtype; parse them as numbers.
# Entries that cannot be parsed become NaN (errors='coerce').
for numeric_col in ('pcv', 'wc', 'rc'):
    kidney_data[numeric_col] = pd.to_numeric(kidney_data[numeric_col], errors='coerce')

In [None]:
# re-check the dtypes: `pcv`, `wc` and `rc` should now be numeric instead of object
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             329 non-null    float64
 16  wc              294 non-null    float64
 17  rc              269 non-null    flo

In [None]:
# Partition the column names by dtype in a single pass: object columns are
# treated as categorical, everything else as numerical.
cat_cols, num_cols = [], []
for col in kidney_data.columns:
    is_categorical = kidney_data[col].dtype == 'object'
    (cat_cols if is_categorical else num_cols).append(col)

In [None]:
# list the distinct values (NaN included) found in each categorical column,
# which exposes inconsistent spellings such as stray tabs or spaces
for name in cat_cols:
    distinct_values = kidney_data[name].unique()
    print(f"{name} has {distinct_values} values\n")

rbc has [nan 'normal' 'abnormal'] values

pc has ['normal' 'abnormal' nan] values

pcc has ['notpresent' 'present' nan] values

ba has ['notpresent' 'present' nan] values

htn has ['yes' 'no' nan] values

dm has ['yes' 'no' ' yes' '\tno' '\tyes' nan] values

cad has ['no' 'yes' '\tno' nan] values

appet has ['good' 'poor' nan] values

pe has ['no' 'yes' nan] values

ane has ['no' 'yes' nan] values

classification has ['ckd' 'ckd\t' 'notckd'] values



In [None]:
# Normalise category labels that carry stray whitespace / tab characters.
# Every result is assigned back to its column: calling
# `replace(..., inplace=True)` on `kidney_data['dm']` acts on an intermediate
# Series and does not update the DataFrame under pandas Copy-on-Write.

kidney_data['dm'] = kidney_data['dm'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'})

kidney_data['cad'] = kidney_data['cad'].replace(to_replace = '\tno', value='no')

# besides stripping the trailing tab, 'notckd' is renamed to 'not ckd'
kidney_data['classification'] = kidney_data['classification'].replace(to_replace = {'ckd\t': 'ckd', 'notckd': 'not ckd'})

In [None]:
# Encode the target as integers: 0 = 'ckd', 1 = 'not ckd'.
# The numeric-dtype guard makes the cell safe to re-run (mapping the already
# encoded 0/1 values a second time would turn every label into NaN), and an
# unexpected label raises instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(kidney_data['classification']):
    encoded_target = kidney_data['classification'].map({'ckd': 0, 'not ckd': 1})
    if encoded_target.isna().any():
        unknown_labels = kidney_data.loc[encoded_target.isna(), 'classification'].unique()
        raise ValueError(f"Unexpected classification labels: {unknown_labels}")
    kidney_data['classification'] = encoded_target.astype('int64')

In [None]:
# confirm that the cleaned columns now hold only the expected values
cols = ['dm', 'cad', 'classification']

for name in cols:
    remaining_values = kidney_data[name].unique()
    print(f"{name} has {remaining_values} values\n")

dm has ['yes' 'no' nan] values

cad has ['no' 'yes' nan] values

classification has [0 1] values



In [None]:
# class balance of the encoded target: number of rows per class code
kidney_data['classification'].value_counts()

0    250
1    150
Name: classification, dtype: int64

0 --> ckd
1 --> not ckd

In [None]:
# number of distinct non-null categories in each categorical column
for name in cat_cols:
    category_count = kidney_data[name].nunique()
    print(f"{name} has {category_count} categories\n")

rbc has 2 categories

pc has 2 categories

pcc has 2 categories

ba has 2 categories

htn has 2 categories

dm has 2 categories

cad has 2 categories

appet has 2 categories

pe has 2 categories

ane has 2 categories

classification has 2 categories



In [None]:
# Integer-encode the two-valued categorical columns.
# A fresh encoder is fitted per column and kept in `label_encoders`, so the
# codes can be mapped back to their labels later. Each encoder is fitted on the
# observed values only: fitting on the raw column turns NaN into an extra
# category code. Missing entries stay NaN, like the NaNs already present in
# the numeric feature columns.
label_encoders = {}
for col in ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']:
    column_values = kidney_data[col]
    observed = column_values.notna()

    label_encoder = LabelEncoder()
    encoded = pd.Series(np.nan, index=column_values.index)
    encoded[observed] = label_encoder.fit_transform(column_values[observed])

    kidney_data[col] = encoded
    label_encoders[col] = label_encoder

In [None]:
# preview the first rows after the categorical columns were label-encoded
kidney_data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,2,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,2,1,0,0,,...,38.0,6000.0,,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0


Splitting the Features and Target

In [None]:
# target vector: the encoded `classification` column
Y = kidney_data['classification']
# feature matrix: every remaining column
X = kidney_data.drop(columns=['classification'])

In [None]:
# show the feature matrix (pandas abbreviates the printed frame)
print(X)

      age    bp     sg   al   su  rbc  pc  pcc  ba    bgr  ...  hemo   pcv  \
0    48.0  80.0  1.020  1.0  0.0    2   1    0   0  121.0  ...  15.4  44.0   
1     7.0  50.0  1.020  4.0  0.0    2   1    0   0    NaN  ...  11.3  38.0   
2    62.0  80.0  1.010  2.0  3.0    1   1    0   0  423.0  ...   9.6  31.0   
3    48.0  70.0  1.005  4.0  0.0    1   0    1   0  117.0  ...  11.2  32.0   
4    51.0  80.0  1.010  2.0  0.0    1   1    0   0  106.0  ...  11.6  35.0   
..    ...   ...    ...  ...  ...  ...  ..  ...  ..    ...  ...   ...   ...   
395  55.0  80.0  1.020  0.0  0.0    1   1    0   0  140.0  ...  15.7  47.0   
396  42.0  70.0  1.025  0.0  0.0    1   1    0   0   75.0  ...  16.5  54.0   
397  12.0  80.0  1.020  0.0  0.0    1   1    0   0  100.0  ...  15.8  49.0   
398  17.0  60.0  1.025  0.0  0.0    1   1    0   0  114.0  ...  14.2  51.0   
399  58.0  80.0  1.025  0.0  0.0    1   1    0   0  131.0  ...  15.8  53.0   

         wc   rc  htn  dm  cad  appet  pe  ane  
0    7800.0  5

In [None]:
# show the encoded target column
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification, Length: 400, dtype: int64


Splitting the Data into Training data & Test Data

In [None]:
# 80/20 train/test split, stratified on the target so both subsets keep the
# same class proportions; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# sanity check: shapes of the full feature matrix and of the train / test subsets
print(X.shape, X_train.shape, X_test.shape)

(400, 24) (320, 24) (80, 24)


Model Training

xgboost

In [None]:
# instantiate the XGBoost classifier with the library's default hyperparameters
model = XGBClassifier()

In [None]:
# fit the XGBoost classifier on the training split (features X_train, labels Y_train)
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [None]:
# accuracy and confusion matrix on the training data
# scikit-learn metrics take (y_true, y_pred): with the ground truth first, the
# confusion-matrix rows are the actual classes and the columns the predicted ones
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print("*Confusion Matrix for XGB: ")
print(confusion_matrix(Y_train, X_train_prediction))

*Confusion Matrix for XGB: 
[[200   0]
 [  0 120]]


In [None]:
# per-class precision / recall / F1 on the training data
# ground truth goes first (y_true, y_pred); with the arguments swapped,
# precision and recall trade places and support counts predicted classes
print("*classification report for XGB: ")
print(classification_report(Y_train, X_train_prediction))

*classification report for XGB: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       120

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [None]:
# report the training-set accuracy computed earlier in the notebook
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  1.0


In [None]:
# score the model on the held-out test split
# (ground truth first, matching scikit-learn's (y_true, y_pred) order;
# accuracy is symmetric, so the value is unchanged)
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# report the accuracy on the held-out test split
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9625


Building a Predictive System

In [None]:
# One hand-entered sample with 24 values, one per feature column of `X`, in
# the same order.
# NOTE(review): these values look like placeholders rather than a realistic
# kidney record (e.g. the 4th value, 140, lands on `al`, whose observed
# maximum is 5) — confirm them against X.columns before trusting the output.
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2,0,0,140,268,0,0,160,0,3.6,0,2,)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array to a single row, as we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

# the target was encoded as 0 = ckd, 1 = not ckd
if (prediction[0]== 0):
  print('ckd')
else:
  print('no ckd')

[0]
no ckd


In [None]:
import pickle

In [None]:
# persist the trained model to disk; the context manager closes the file
# handle even if pickling fails
filename = 'kidney_disease_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# loading the saved model (the context manager closes the file handle)
# NOTE: unpickling can execute arbitrary code — only load files you created or trust
with open('kidney_disease_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# list the feature names in the order the model expects them
# (iterating a DataFrame yields its column labels)
for feature_name in X:
  print(feature_name)

age
bp
sg
al
su
rbc
pc
pcc
ba
bgr
bu
sc
sod
pot
hemo
pcv
wc
rc
htn
dm
cad
appet
pe
ane
