In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [3]:
df=pd.read_csv("dataset/kidney/kidney_disease.csv")
df.tail()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
395,395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd
399,399,58.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,53,6800,6.1,no,no,no,good,no,no,notckd


In [4]:
df.drop(["id"],axis=1,inplace=True)

In [5]:
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [6]:
df["pcv"].unique()

array(['44', '38', '31', '32', '35', '39', '36', '33', '29', '28', nan,
       '16', '24', '37', '30', '34', '40', '45', '27', '48', '\t?', '52',
       '14', '22', '18', '42', '17', '46', '23', '19', '25', '41', '26',
       '15', '21', '43', '20', '\t43', '47', '9', '49', '50', '53', '51',
       '54'], dtype=object)

In [7]:
df["rbc"]=df["rbc"].map({"normal":0,"abnormal":1})
df["pc"]=df["pc"].map({"normal":0,"abnormal":1})
df["pcc"]=df["pcc"].map({"notpresent":0,"present":1})
df["ba"]=df["ba"].map({"notpresent":0,"present":1})
df["htn"]=df["htn"].map({"no":0,"yes":1})
df["dm"]=df["dm"].str.split("\t").str[0].apply(lambda x: 0 if x=="no" else 1)
df["cad"]=df["cad"].str.split("\t").str[0].apply(lambda x: 0 if x=="no" else 1)
df["appet"]=df["appet"].map({"good":0,"poor":1})
df["pe"]=df["pe"].map({"no":0,"yes":1})
df["ane"]=df["ane"].map({"no":0,"yes":1})
df["classification"]=df["classification"].str.split("\t").str[0].apply(lambda x: 0 if x=="ckd" else 1)
df["pcv"]=df["pcv"].str.split("\t").str[0].apply(lambda x: np.nan if x=="" else x)
df["wc"]=df["wc"].str.split("\t").str[0].apply(lambda x: np.nan if x=="" else x)
df["rc"]=df["rc"].str.split("\t").str[0].apply(lambda x: np.nan if x=="" else x)

In [8]:
col=df.columns[df.isnull().sum()>0]
for c in col:
    df[c]=df[c].fillna(df[c].mode()[0])

In [9]:
df.isnull().sum()

age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [None]:
k=df.columns
i=11
sns.distplot(df[k[i]]),df[k[i]].unique()

In [None]:
sns.boxplot(df[k[i]])

In [None]:
p25=df[k[i]].quantile(0.25)
p75=df[k[i]].quantile(0.75)
iqr=p75-p25
uprlmt=p75+1.5*iqr
lwrlmt=p25-1.5*iqr
df[k[i]]=np.where(df[k[i]]>uprlmt,uprlmt,np.where(df[k[i]]<lwrlmt,lwrlmt,df[k[i]]))

In [10]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1,0,0.0,0.0,0.0,0
1,7.0,50.0,1.02,4.0,0.0,0.0,0.0,0.0,0.0,99.0,...,38,6000,5.2,0.0,0,0,0.0,0.0,0.0,0
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31,7500,5.2,0.0,1,0,1.0,0.0,1.0,0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0,0,1.0,1.0,1.0,0
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0,0,0.0,0.0,0.0,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    float64
 6   pc              400 non-null    float64
 7   pcc             400 non-null    float64
 8   ba              400 non-null    float64
 9   bgr             400 non-null    float64
 10  bu              400 non-null    float64
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    object 
 16  wc              400 non-null    object 
 17  rc              400 non-null    obj

In [12]:
df["pcv"]=df["pcv"].astype(dtype="int")
df["wc"]=df["wc"].astype(dtype="int")
df["rc"]=df["rc"].astype(dtype="float16")

In [13]:
df.describe()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,...,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,51.675,76.575,1.017712,0.9,0.395,0.1175,0.19,0.105,0.055,142.6425,...,39.255,8788.0,4.867188,0.3675,0.355,0.095,0.205,0.19,0.15,0.375
std,17.022008,13.489785,0.005434,1.31313,1.040038,0.322418,0.392792,0.306937,0.228266,76.344226,...,8.189484,2595.022419,0.871582,0.482728,0.479113,0.293582,0.404207,0.392792,0.357519,0.484729
min,2.0,50.0,1.005,0.0,0.0,0.0,0.0,0.0,0.0,22.0,...,9.0,2200.0,2.099609,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.0,1.015,0.0,0.0,0.0,0.0,0.0,0.0,99.0,...,34.0,7000.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,55.0,80.0,1.02,0.0,0.0,0.0,0.0,0.0,0.0,114.5,...,41.0,9500.0,5.199219,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.0,80.0,1.02,2.0,0.0,0.0,0.0,0.0,0.0,150.0,...,44.0,9800.0,5.199219,1.0,1.0,0.0,0.0,0.0,0.0,1.0
max,90.0,180.0,1.025,5.0,5.0,1.0,1.0,1.0,1.0,490.0,...,54.0,26400.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
df["cad"].unique(),df["hemo"].max()

(array([0, 1], dtype=int64), 17.8)

In [56]:
df["age"]=df["age"]/100
df["bp"]=df["bp"].apply(lambda x: (x-50)/(180-50))
df["bp"]=df["bp"].apply(lambda x: (x-50)/(180-50))
df["bgr"]=df["bgr"].apply(lambda x: (x-20)/(500-20))
df["bu"]=df["bu"]/400
df["sc"]=df["sc"]/80
df["sod"]=df["sod"]/170
df["pot"]=df["pot"]/50
df["hemo"]=df["hemo"]/20
df["pcv"]=df["pcv"]/60
df["wc"]=df["wc"].apply(lambda x: (x-2200)/(26400-2200))
df["rc"]=df["rc"]/10

In [57]:
x=df.drop(["classification"],axis=1)
y=df["classification"]

In [58]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,shuffle=True)

In [59]:
y_test.value_counts()

classification
0    62
1    38
Name: count, dtype: int64

In [60]:
import tensorflow as tf

In [129]:
model=tf.keras.Sequential([
    # tf.keras.layers.Dense(64,activation="relu"),
    tf.keras.layers.Dense(64,activation="relu"),
    tf.keras.layers.Dense(48,activation="relu"),
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid")
])
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),loss=tf.losses.binary_crossentropy,metrics=["accuracy"])

In [130]:
model.fit(x_train,y_train,epochs=10,shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ca334b6b00>

In [131]:
model.evaluate(x_test,y_test)



[0.11130031943321228, 0.9599999785423279]

In [112]:
model.save("kidneyModel.h5") #0.98

In [110]:
ypred=model.predict(x_test)
ypred=ypred.round().ravel()



In [132]:
confusion_matrix(y_test,ypred)

array([[61,  1],
       [ 1, 37]], dtype=int64)