# This program detects chronic kidney disease, based on the data in Chronic_Kidney_disease.csv

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt

In [6]:
# Read the chronic-kidney-disease CSV into a DataFrame and preview the first seven rows
df = pd.read_csv(filepath_or_buffer="Chronic_Kidney_disease.csv")
df.head(n=7)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,...,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,...,36,,,no,no,no,good,no,no,ckd


In [7]:
# Create a list of column names to keep.
# The earlier list also named 'wbcc' and 'rbcc'; the CSV header has no such
# columns (it shows 'wc' and 'rc'), so those two entries were silently ignored.
# NOTE(review): add 'wc' / 'rc' here if they were meant to be model features.

columns_to_retain = ['sg','al','sc','hemo','pcv','htn','classification']

# Keep only the listed columns. Selecting by name raises KeyError when a
# listed column is missing, instead of dropping it without notice.
df = df[columns_to_retain]

# Drop the rows with NA or missing values
df = df.dropna(axis=0)

In [8]:
# Transform the non-numeric data in columns

for column in df.columns:
    # Numeric columns are left untouched. The previous check,
    # `dtype == np.number`, depended on NumPy implicitly converting the abstract
    # np.number type to float64. That conversion is deprecated, and on current
    # NumPy the comparison is no longer True for float columns, which would
    # send every column (including the float features) through LabelEncoder.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    # Replace each distinct value with an integer code
    df[column] = LabelEncoder().fit_transform(df[column])

In [9]:
# Inspect the last eight rows after encoding
df.tail(n=8)

Unnamed: 0,sg,al,sc,hemo,pcv,htn,classification
392,1.02,0.0,1.2,14.8,30,0,1
393,1.025,0.0,0.7,13.0,38,0,1
394,1.02,0.0,0.8,14.1,29,0,1
395,1.02,0.0,0.5,15.7,31,0,1
396,1.025,0.0,1.2,16.5,38,0,1
397,1.02,0.0,0.6,15.8,33,0,1
398,1.025,0.0,1.0,14.2,35,0,1
399,1.025,0.0,1.1,15.8,37,0,1


In [14]:
# Separate the dependent variable (Y, the target) from the independent variables (X, the features)

Y = df['classification']
X = df.drop(columns=['classification'])

In [15]:
# List the feature columns that remain after removing the target
X.columns

Index(['sg', 'al', 'sc', 'hemo', 'pcv', 'htn'], dtype='object')

In [16]:
# Feature Scaling
# Min-max scaling maps every input feature onto the range 0 to 1.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# the test rows' min/max values influence the scaling applied to the training data.

column_names = X.columns
x_scaler = MinMaxScaler().fit(X)
X[column_names] = x_scaler.transform(X)
X[column_names]

Unnamed: 0,sg,al,sc,hemo,pcv,htn
0,0.75,0.2,0.033898,0.836735,0.717949,1.0
1,0.75,0.8,0.016949,0.557823,0.564103,0.0
2,0.25,0.4,0.059322,0.442177,0.384615,0.0
3,0.00,0.8,0.144068,0.551020,0.410256,1.0
4,0.25,0.4,0.042373,0.578231,0.487179,0.0
...,...,...,...,...,...,...
395,0.75,0.0,0.004237,0.857143,0.794872,0.0
396,1.00,0.0,0.033898,0.911565,0.974359,0.0
397,0.75,0.0,0.008475,0.863946,0.846154,0.0
398,1.00,0.0,0.025424,0.755102,0.897436,0.0


In [17]:
# Split the data into 80% training and 20% testing, with shuffling.
# The target Series is named `Y` (capital); the lowercase `y` passed previously
# is not defined in any visible cell and raises NameError on a fresh kernel.
# random_state pins the shuffle so the split is the same on every run.

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,shuffle=True,random_state=0)

# Creating Models (Logistic Regression, Decision Tree, Random Forest)

In [18]:
# Create a function for the models


def models(X_train,Y_train):
    """Fit three classifiers on the training data and report their training accuracy.

    Parameters
    ----------
    X_train : feature matrix passed to each estimator's ``fit`` and ``score``.
    Y_train : target labels passed to each estimator's ``fit`` and ``score``.

    Returns
    -------
    tuple
        ``(log, tree, forest)``: the fitted logistic regression, decision tree
        and random forest, in that order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    # Fit each estimator; fit() returns the estimator itself, so it can be chained
    log = LogisticRegression(random_state=0).fit(X_train, Y_train)
    tree = DecisionTreeClassifier(criterion="entropy", random_state=0).fit(X_train, Y_train)
    forest = RandomForestClassifier(
        n_estimators=10, criterion="entropy", random_state=0
    ).fit(X_train, Y_train)

    fitted = (log, tree, forest)
    headings = (
        '[0] Logistic Regression Training Accuracy : ',
        '[1] Decision Tree Classifier Training Accuracy : ',
        '[2] Random Forest Classifier Training Accuracy : ',
    )

    # Accuracy on the same data the models were trained on
    for heading, estimator in zip(headings, fitted):
        print(heading, estimator.score(X_train, Y_train))

    return fitted
    
    

In [19]:
# Getting all the models
# `model` holds the tuple returned by models(): index 0 is the logistic
# regression, index 1 the decision tree, index 2 the random forest

model = models(X_train,Y_train)

[0] Logistic Regression Training Accuracy :  0.9781659388646288
[1] Decision Tree Classifier Training Accuracy :  1.0
[2] Random Forest Classifier Training Accuracy :  1.0


# Testing Model Accuracy by Confusion Matrix

In [20]:
# Evaluate each fitted model on the test data through its confusion matrix

from sklearn.metrics import confusion_matrix

for i, estimator in enumerate(model):
    print("Model ", i)

    cm = confusion_matrix(Y_test, estimator.predict(X_test))
    # Layout: [[true_negative, false_positive], [false_negative, true_positive]]
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    print(cm)
    # Accuracy = correctly classified samples / all samples
    print("Testing Accuracy =  ", (TP + TN) / (TP + TN + FN + FP))
    print()



Model  0
[[31  1]
 [ 0 26]]
Testing Accuracy =   0.9827586206896551

Model  1
[[32  0]
 [ 0 26]]
Testing Accuracy =   1.0

Model  2
[[32  0]
 [ 0 26]]
Testing Accuracy =   1.0



# Testing accuracy by another method

In [21]:
# Show another way to get metrics of the models

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


for i in range(len(model)):
    print("Model ", i)
    # Predict once per model and reuse the result for both metrics
    # (predict() previously ran twice on the same test data)
    y_pred = model[i].predict(X_test)
    print(classification_report(Y_test,y_pred))
    print(accuracy_score(Y_test,y_pred))
    print()

Model  0
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        32
           1       0.96      1.00      0.98        26

    accuracy                           0.98        58
   macro avg       0.98      0.98      0.98        58
weighted avg       0.98      0.98      0.98        58

0.9827586206896551

Model  1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        26

    accuracy                           1.00        58
   macro avg       1.00      1.00      1.00        58
weighted avg       1.00      1.00      1.00        58

1.0

Model  2
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        26

    accuracy                           1.00        58
   macro avg       1.00      1.00      1.00        58
weighted avg       1.0

In [22]:
# Print the prediction of Random Forest Classifier Model
# models() returns (log, tree, forest), so the random forest is index 2;
# index 1, used here previously, is the decision tree

pred = model[2].predict(X_test)
print(pred)
print()
print(Y_test)

[0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0
 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0]

198    0
115    0
274    1
374    1
117    0
29     0
2      0
325    1
25     0
335    1
259    1
248    0
58     0
196    0
172    0
314    1
184    0
159    0
341    1
187    0
177    0
265    1
49     0
8      0
291    1
217    0
73     0
31     0
302    1
292    1
387    1
298    1
131    0
234    0
270    1
169    0
149    0
206    0
392    1
332    1
395    1
133    0
333    1
364    1
4      0
15     0
345    1
62     0
277    1
398    1
241    0
308    1
70     0
258    1
353    1
396    1
158    0
56     0
Name: classification, dtype: int32


# Saving Models ( Random Forest )

In [23]:
def SaveModel(estimator=None, path="CKD_Model"):
    """Pickle a fitted model to disk.

    Parameters
    ----------
    estimator : object, optional
        The object to serialize. When omitted, the random forest from the
        module-level ``model`` tuple is used. That is index 2, because
        ``models()`` returns ``(log, tree, forest)``; index 1, saved
        previously, is the decision tree.
    path : str, optional
        Destination file name; defaults to ``"CKD_Model"`` as before.
    """
    import pickle

    if estimator is None:
        estimator = model[2]

    with open(path, "wb") as f:
        pickle.dump(estimator, f)

In [24]:
# Write the selected model to the file "CKD_Model" in the working directory
SaveModel()

In [25]:
# Reload the pickled model from disk and check that it still predicts on the test set.
# pickle.load executes code embedded in the file, so only load files this notebook wrote.
import pickle

with open("CKD_Model", "rb") as f:
    # NOTE(review): the object loaded is whatever SaveModel() wrote; confirm it is the random forest
    randomForest = pickle.load(f)

pred = randomForest.predict(X_test)

# Predictions, a blank line, then the true labels
print(pred, Y_test, sep="\n\n")

[0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0
 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0]

198    0
115    0
274    1
374    1
117    0
29     0
2      0
325    1
25     0
335    1
259    1
248    0
58     0
196    0
172    0
314    1
184    0
159    0
341    1
187    0
177    0
265    1
49     0
8      0
291    1
217    0
73     0
31     0
302    1
292    1
387    1
298    1
131    0
234    0
270    1
169    0
149    0
206    0
392    1
332    1
395    1
133    0
333    1
364    1
4      0
15     0
345    1
62     0
277    1
398    1
241    0
308    1
70     0
258    1
353    1
396    1
158    0
56     0
Name: classification, dtype: int32
