In [83]:
# Import the required libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [84]:
# Load the diabetes data set (path is relative to the working directory) and
# preview the first five rows.
# NOTE(review): the rendered preview lists an "Unnamed: 0" column. If that is a
# real column in the CSV (a saved row index) rather than the frame's own index,
# it is passed to the models as a feature further down -- confirm and drop it if so.

diabetesDF = pd.read_csv('diabetes.csv')
diabetesDF.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [85]:
# Remove the two columns that are not used as features in this notebook.
# The name is rebound to a new frame (no inplace=True) and errors='ignore' is
# passed so that re-running this cell is a no-op instead of raising KeyError
# once the columns are already gone.
diabetesDF = diabetesDF.drop(columns=['BMI', 'DiabetesPedigreeFunction'], errors='ignore')

In [86]:
# Preview again to confirm that BMI and DiabetesPedigreeFunction were removed.
diabetesDF.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,Age,Outcome
0,6,148,72,35,0,50,1
1,1,85,66,29,0,31,0
2,8,183,64,0,0,32,1
3,1,89,66,23,94,21,0
4,0,137,40,35,168,33,1


In [87]:
# The file is expected to hold 768 patient records. They are partitioned purely
# by row position, in file order (no shuffling):
#   rows   0-649 -> training   (650 rows)
#   rows 650-749 -> testing    (100 rows)
#   rows 750-... -> validation (the remaining 18 rows)

dfTrain = diabetesDF.iloc[:650]
dfTest = diabetesDF.iloc[650:750]
dfCheck = diabetesDF.iloc[750:]

In [88]:
# Split the training and test frames into a label vector (the Outcome column)
# and a feature matrix (every other column), converted to NumPy arrays for the
# scikit-learn estimators used below.
# `columns=` replaces the positional axis argument (`drop('Outcome', 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
trainLabel = np.asarray(dfTrain['Outcome'])
trainData = np.asarray(dfTrain.drop(columns='Outcome'))
testLabel = np.asarray(dfTest['Outcome'])
testData = np.asarray(dfTest.drop(columns='Outcome'))

In [89]:
# Standardise each feature column: subtract the column mean and divide by the
# column standard deviation. Both statistics are computed on the training rows
# only and then applied unchanged to the test rows.
means = trainData.mean(axis=0)
stds = trainData.std(axis=0)

trainData, testData = ((split - means) / stds for split in (trainData, testData))

In [90]:
# Logistic regression: models the target t as sigmoid(w0 + w1*x1 + ... + wd*xd).
# Fit on the training split, then report accuracy on the test split.
diabetesCheck = LogisticRegression().fit(trainData, trainLabel)
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

accuracy =  80.0 %


In [91]:
# Accuracy of the logistic-regression model on the training split itself.
predict_train = diabetesCheck.predict(trainData)
print(f"Accuracy: {metrics.accuracy_score(trainLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.7508



In [92]:
# Accuracy of the logistic-regression model on the test split.
# NOTE: the predictions are stored under the name `predict_train` even though
# they are test-set predictions; the name is kept because the confusion-matrix
# and classification-report cells that follow read it.
predict_train = diabetesCheck.predict(testData)
print(f"Accuracy: {metrics.accuracy_score(testLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.8000



In [93]:
# Confusion matrix for the test split. `predict_train` holds the test-set
# predictions at this point; rows are the true class, columns the predicted class.
print("Confusion Matrix")
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predict_train))
print()

Confusion Matrix
[[57  6]
 [14 23]]



In [94]:
# Per-class precision / recall / F1 for the logistic-regression test predictions.
print("Classification Report")
print(metrics.classification_report(y_true=testLabel, y_pred=predict_train))

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.90      0.85        63
           1       0.79      0.62      0.70        37

    accuracy                           0.80       100
   macro avg       0.80      0.76      0.77       100
weighted avg       0.80      0.80      0.79       100



In [95]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# `diabetesCheck` is rebound here, replacing the previously fitted model.
# Fit on the training split, then report accuracy on the test split.
diabetesCheck = KNeighborsClassifier().fit(trainData, trainLabel)
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

accuracy =  71.0 %


In [96]:
# Accuracy of the k-nearest-neighbours model on the training split itself.
predict_train = diabetesCheck.predict(trainData)
print(f"Accuracy: {metrics.accuracy_score(trainLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.8108



In [97]:
# Accuracy of the k-nearest-neighbours model on the test split.
# NOTE: the predictions are stored under the name `predict_train` even though
# they are test-set predictions; the name is kept because the confusion-matrix
# and classification-report cells that follow read it.
predict_train = diabetesCheck.predict(testData)
print(f"Accuracy: {metrics.accuracy_score(testLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.7100



In [98]:
# Confusion matrix for the k-nearest-neighbours test predictions
# (rows are the true class, columns the predicted class).
print("Confusion Matrix")
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predict_train))
print()

Confusion Matrix
[[51 12]
 [17 20]]



In [99]:
# Per-class precision / recall / F1 for the k-nearest-neighbours test predictions.
print("Classification Report")
print(metrics.classification_report(y_true=testLabel, y_pred=predict_train))

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.81      0.78        63
           1       0.62      0.54      0.58        37

    accuracy                           0.71       100
   macro avg       0.69      0.68      0.68       100
weighted avg       0.70      0.71      0.71       100



In [100]:
# Support-vector classifier with scikit-learn's default settings.
# `diabetesCheck` is rebound here, replacing the previously fitted model.
# Fit on the training split, then report accuracy on the test split.
diabetesCheck = SVC().fit(trainData, trainLabel)
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

accuracy =  75.0 %


In [101]:
# Accuracy of the support-vector model on the training split itself.
predict_train = diabetesCheck.predict(trainData)
print(f"Accuracy: {metrics.accuracy_score(trainLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.7877



In [102]:
# Accuracy of the support-vector model on the test split.
# NOTE: the predictions are stored under the name `predict_train` even though
# they are test-set predictions; the name is kept because the confusion-matrix
# and classification-report cells that follow read it.
predict_train = diabetesCheck.predict(testData)
print(f"Accuracy: {metrics.accuracy_score(testLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.7500



In [103]:
# Confusion matrix for the support-vector test predictions
# (rows are the true class, columns the predicted class).
print("Confusion Matrix")
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predict_train))
print()

Confusion Matrix
[[55  8]
 [17 20]]



In [104]:
# Per-class precision / recall / F1 for the support-vector test predictions.
print("Classification Report")
print(metrics.classification_report(y_true=testLabel, y_pred=predict_train))

Classification Report
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        63
           1       0.71      0.54      0.62        37

    accuracy                           0.75       100
   macro avg       0.74      0.71      0.72       100
weighted avg       0.75      0.75      0.74       100



In [105]:
# Random-forest classifier (an ensemble of decision trees) with scikit-learn's
# default settings. `diabetesCheck` is rebound here, replacing the previous model.
# random_state is fixed so that Restart & Run All reproduces the same forest and
# the same scores; without it every run trains a different model. The recorded
# outputs below came from an unseeded run and may differ slightly after a re-run.
diabetesCheck = RandomForestClassifier(random_state=42)
diabetesCheck.fit(trainData, trainLabel)
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

accuracy =  73.0 %


In [106]:
# Accuracy of the random-forest model on the training split itself.
predict_train = diabetesCheck.predict(trainData)
print(f"Accuracy: {metrics.accuracy_score(trainLabel, predict_train):.4f}", end="\n\n")

Accuracy: 1.0000



In [107]:
# Accuracy of the random-forest model on the test split.
# NOTE: the predictions are stored under the name `predict_train` even though
# they are test-set predictions; the name is kept because the confusion-matrix
# and classification-report cells that follow read it.
predict_train = diabetesCheck.predict(testData)
print(f"Accuracy: {metrics.accuracy_score(testLabel, predict_train):.4f}", end="\n\n")

Accuracy: 0.7300



In [108]:
# Confusion matrix for the random-forest test predictions
# (rows are the true class, columns the predicted class).
print("Confusion Matrix")
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predict_train))
print()

Confusion Matrix
[[53 10]
 [17 20]]



In [109]:
# Per-class precision / recall / F1 for the random-forest test predictions.
print("Classification Report")
print(metrics.classification_report(y_true=testLabel, y_pred=predict_train))

Classification Report
              precision    recall  f1-score   support

           0       0.76      0.84      0.80        63
           1       0.67      0.54      0.60        37

    accuracy                           0.73       100
   macro avg       0.71      0.69      0.70       100
weighted avg       0.72      0.73      0.72       100

