### Imports

In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from fancyimpute import KNN
import joblib

### Data Preprocessing

In [51]:
df = pd.read_csv("kidney_disease.csv")
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [52]:
print(df.iloc[0])

id                         0
age                     48.0
bp                      80.0
sg                      1.02
al                       1.0
su                       0.0
rbc                      NaN
pc                    normal
pcc               notpresent
ba                notpresent
bgr                    121.0
bu                      36.0
sc                       1.2
sod                      NaN
pot                      NaN
hemo                    15.4
pcv                       44
wc                      7800
rc                       5.2
htn                      yes
dm                       yes
cad                       no
appet                   good
pe                        no
ane                       no
classification           ckd
Name: 0, dtype: object


In [53]:
# Replace categorical values with numeric representations
df['rbc'] = df['rbc'].replace({'normal': 0, 'abnormal': 1})
df['pc'] = df['pc'].replace({'normal': 0, 'abnormal': 1})
df['pcc'] = df['pcc'].replace({'present': 0, 'notpresent': 1})
df['ba'] = df['ba'].replace({'present': 0, 'notpresent': 1})
df['htn'] = df['htn'].replace({'no': 0, 'yes': 1})
df['dm'] = df['dm'].replace({'no': 0, 'yes': 1})
df['cad'] = df['cad'].replace({'no': 0, 'yes': 1})
df['appet'] = df['appet'].replace({'poor': 0, 'good': 1})
df['pe'] = df['pe'].replace({'no': 0, 'yes': 1})
df['ane'] = df['ane'].replace({'no': 0, 'yes': 1})
df['classification'] = df['classification'].replace({'notckd': 0, 'ckd': 1})

# Convert columns to numeric type
df['rbc'] = pd.to_numeric(df['rbc'], errors='coerce')
df['pc'] = pd.to_numeric(df['pc'], errors='coerce')
df['pcc'] = pd.to_numeric(df['pcc'], errors='coerce')
df['ba'] = pd.to_numeric(df['ba'], errors='coerce')
df['htn'] = pd.to_numeric(df['htn'], errors='coerce')
df['dm'] = pd.to_numeric(df['dm'], errors='coerce')
df['cad'] = pd.to_numeric(df['cad'], errors='coerce')
df['appet'] = pd.to_numeric(df['appet'], errors='coerce')
df['pe'] = pd.to_numeric(df['pe'], errors='coerce')
df['ane'] = pd.to_numeric(df['ane'], errors='coerce')
df['classification'] = pd.to_numeric(df['classification'], errors='coerce')

print(df.head())

   id   age    bp     sg   al   su  rbc   pc  pcc   ba  ...  pcv    wc   rc  \
0   0  48.0  80.0  1.020  1.0  0.0  NaN  0.0  1.0  1.0  ...   44  7800  5.2   
1   1   7.0  50.0  1.020  4.0  0.0  NaN  0.0  1.0  1.0  ...   38  6000  NaN   
2   2  62.0  80.0  1.010  2.0  3.0  0.0  0.0  1.0  1.0  ...   31  7500  NaN   
3   3  48.0  70.0  1.005  4.0  0.0  0.0  1.0  0.0  1.0  ...   32  6700  3.9   
4   4  51.0  80.0  1.010  2.0  0.0  0.0  0.0  1.0  1.0  ...   35  7300  4.6   

   htn   dm  cad appet   pe  ane  classification  
0  1.0  1.0  0.0   1.0  0.0  0.0             1.0  
1  0.0  0.0  0.0   1.0  0.0  0.0             1.0  
2  0.0  1.0  0.0   0.0  0.0  1.0             1.0  
3  1.0  0.0  0.0   0.0  1.0  1.0             1.0  
4  0.0  0.0  0.0   1.0  0.0  0.0             1.0  

[5 rows x 26 columns]


  df['rbc'] = df['rbc'].replace({'normal': 0, 'abnormal': 1})
  df['pc'] = df['pc'].replace({'normal': 0, 'abnormal': 1})
  df['pcc'] = df['pcc'].replace({'present': 0, 'notpresent': 1})
  df['ba'] = df['ba'].replace({'present': 0, 'notpresent': 1})
  df['htn'] = df['htn'].replace({'no': 0, 'yes': 1})
  df['appet'] = df['appet'].replace({'poor': 0, 'good': 1})
  df['pe'] = df['pe'].replace({'no': 0, 'yes': 1})
  df['ane'] = df['ane'].replace({'no': 0, 'yes': 1})


In [54]:
df.drop('id', axis=1, inplace=True)

In [55]:
print(df.iloc[0])


age                48.0
bp                 80.0
sg                 1.02
al                  1.0
su                  0.0
rbc                 NaN
pc                  0.0
pcc                 1.0
ba                  1.0
bgr               121.0
bu                 36.0
sc                  1.2
sod                 NaN
pot                 NaN
hemo               15.4
pcv                  44
wc                 7800
rc                  5.2
htn                 1.0
dm                  1.0
cad                 0.0
appet               1.0
pe                  0.0
ane                 0.0
classification      1.0
Name: 0, dtype: object


In [16]:
df.to_csv('your_modified_dataset.csv', index=False)

In [35]:
df['age']=df['age'].fillna(df['age'].mean()) 
df['bp']=df['bp'].fillna(df['bp'].mean()) 
df['sg']=df['sg'].fillna(df['sg'].mean()) 
df['al']=df['al'].fillna(df['al'].mean()) 
df['su']=df['su'].fillna(df['su'].mean()) 
df['rbc']=df['rbc'].fillna(df['rbc'].mean()) 
df['pc']=df['pc'].fillna(df['pc'].mean()) 
df['pcc']=df['pcc'].fillna(df['pcc'].mean()) 
df['ba']=df['ba'].fillna(df['ba'].mean()) 
df['bgr']=df['bgr'].fillna(df['bgr'].mean()) 

In [56]:
df.isnull().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  8
cad                 4
appet               1
pe                  1
ane                 1
classification      2
dtype: int64

In [57]:
df.size

10000

In [58]:
lb=LabelEncoder()
df['rbc']=lb.fit_transform(df['rbc'])
df['pc']=lb.fit_transform(df['pc'])
df['pcc']=lb.fit_transform(df['pcc'])
df['ba']=lb.fit_transform(df['ba'])
df['htn']=lb.fit_transform(df['htn'])
df['dm']=lb.fit_transform(df['dm'])
df['cad']=lb.fit_transform(df['cad'])
df['appet']=lb.fit_transform(df['appet'])
df['pe']=lb.fit_transform(df['pe'])
df['ane']=lb.fit_transform(df['ane'])
df['classification']=lb.fit_transform(df['classification'])

In [59]:
df['age']=df['age'].fillna(df['age'].mean()) 

In [60]:
df.replace('\t?', float('nan'), inplace=True)  # Replace '\t?' with NaN

columns_to_convert = [ 'bp',     'sg',   'al' ,  'su',  'rbc',  'pc',  'pcc' , 'ba'  ,'bgr', 'bu', 'sc', 'sod', 'pot' ,'hemo' ,'pcv' , 'wc' , 'rc' ,'htn',  'dm'  ,'cad',  'appet' , 'pe' , 'ane' , 'classification']  # Replace with the actual column names
for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce')

df.dropna(inplace=True)

In [61]:
knn_imputer = KNN()
df = knn_imputer.fit_transform(df)

Imputing row 1/205 with 0 missing, elapsed time: 0.006
Imputing row 101/205 with 0 missing, elapsed time: 0.007
Imputing row 201/205 with 0 missing, elapsed time: 0.007




In [62]:
df=pd.DataFrame(df,columns=[ 'age',   'bp',     'sg',   'al' ,  'su',  'rbc',  'pc',  'pcc' , 'ba'  ,'bgr', 'bu', 'sc', 'sod', 'pot' ,'hemo' ,'pcv' , 'wc' , 'rc' ,'htn',  'dm'  ,'cad',  'appet' , 'pe' , 'ane' , 'classification'])

In [63]:
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,70.0,1.005,4.0,0.0,0.0,1.0,0.0,1.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
1,60.0,90.0,1.015,3.0,0.0,2.0,2.0,1.0,1.0,74.0,...,39.0,7800.0,4.4,1.0,1.0,0.0,1.0,1.0,0.0,1.0
2,53.0,90.0,1.020,2.0,0.0,1.0,1.0,0.0,1.0,70.0,...,29.0,12100.0,3.7,1.0,1.0,0.0,0.0,0.0,1.0,1.0
3,63.0,70.0,1.010,3.0,0.0,1.0,1.0,0.0,1.0,380.0,...,32.0,4500.0,3.8,1.0,1.0,0.0,0.0,1.0,0.0,1.0
4,68.0,70.0,1.015,3.0,1.0,2.0,0.0,0.0,1.0,208.0,...,28.0,12200.0,3.4,1.0,1.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,55.0,80.0,1.020,0.0,0.0,0.0,0.0,1.0,1.0,140.0,...,47.0,6700.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0
201,42.0,70.0,1.025,0.0,0.0,0.0,0.0,1.0,1.0,75.0,...,54.0,7800.0,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
202,12.0,80.0,1.020,0.0,0.0,0.0,0.0,1.0,1.0,100.0,...,49.0,6600.0,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
203,17.0,60.0,1.025,0.0,0.0,0.0,0.0,1.0,1.0,114.0,...,51.0,7200.0,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [80]:
df_shuffled = df.sample(frac=1, random_state=42)
df_shuffled

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
15,82.0,80.0,1.010,2.0,2.0,0.0,2.0,1.0,1.0,140.0,...,40.0,9800.0,4.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
9,48.0,80.0,1.025,4.0,0.0,0.0,1.0,1.0,1.0,95.0,...,32.0,6900.0,3.4,1.0,0.0,0.0,1.0,0.0,1.0,1.0
100,23.0,80.0,1.025,0.0,0.0,0.0,0.0,1.0,1.0,111.0,...,41.0,7200.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
132,55.0,80.0,1.020,0.0,0.0,0.0,0.0,1.0,1.0,104.0,...,52.0,8200.0,4.8,0.0,0.0,0.0,1.0,0.0,0.0,0.0
68,55.0,80.0,1.010,3.0,1.0,0.0,1.0,0.0,0.0,214.0,...,34.0,7400.0,3.7,1.0,1.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,24.0,70.0,1.025,0.0,0.0,0.0,0.0,1.0,1.0,140.0,...,48.0,5800.0,5.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14,61.0,90.0,1.010,1.0,1.0,2.0,0.0,1.0,1.0,159.0,...,34.0,9600.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
92,49.0,80.0,1.020,0.0,0.0,0.0,0.0,1.0,1.0,122.0,...,41.0,5600.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0
179,61.0,70.0,1.025,0.0,0.0,0.0,0.0,1.0,1.0,133.0,...,47.0,9200.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [90]:
print(df_shuffled.iloc[0])
df_shuffled.head(1)

age                 82.00
bp                  80.00
sg                   1.01
al                   2.00
su                   2.00
rbc                  0.00
pc                   2.00
pcc                  1.00
ba                   1.00
bgr                140.00
bu                  70.00
sc                   3.40
sod                136.00
pot                  4.20
hemo                13.00
pcv                 40.00
wc                9800.00
rc                   4.20
htn                  1.00
dm                   1.00
cad                  0.00
appet                1.00
pe                   0.00
ane                  0.00
classification       1.00
Name: 15, dtype: float64


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
15,82.0,80.0,1.01,2.0,2.0,0.0,2.0,1.0,1.0,140.0,...,40.0,9800.0,4.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0


### Creating X input  and Y labels for training

In [81]:
X = df_shuffled.drop(['classification'], axis=1)
y=df_shuffled['classification']

In [82]:
print(X.iloc[0])

age        82.00
bp         80.00
sg          1.01
al          2.00
su          2.00
rbc         0.00
pc          2.00
pcc         1.00
ba          1.00
bgr       140.00
bu         70.00
sc          3.40
sod       136.00
pot         4.20
hemo       13.00
pcv        40.00
wc       9800.00
rc          4.20
htn         1.00
dm          1.00
cad         0.00
appet       1.00
pe          0.00
ane         0.00
Name: 15, dtype: float64


In [83]:
X.shape,y.shape

((205, 24), (205,))

### Splitting the data into train and test

In [84]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=101)

### Training The Random Forest Classifier

In [85]:
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99        35
         1.0       1.00      0.96      0.98        27

    accuracy                           0.98        62
   macro avg       0.99      0.98      0.98        62
weighted avg       0.98      0.98      0.98        62



In [86]:
confusion_matrix(y_test,y_pred)

array([[35,  0],
       [ 1, 26]], dtype=int64)

### Saving The Model (Using Joblib) to be used in Django Framework

In [91]:
joblib.dump(rf, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [73]:
from joblib import load
loaded_rf = load('random_forest_model.joblib')

In [77]:
default_answers = [48.0, 80.0, 1.02, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 121.0, 36.0, 1.2, 111.0, 2.5, 15.4, 44.0, 6000.0, 5.2, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]
default_answers_2d = np.array(default_answers).reshape(1, -1)
prediction = loaded_rf.predict(default_answers_2d)
prediction[0]



0.0