In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.preprocessing import Binarizer,Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

import os

# Location of the preprocessed CSV. The original absolute path remains the
# default so existing runs behave exactly as before; set the
# DIABETES_DATA_CSV environment variable to run the notebook on another machine.
DATA_CSV_PATH = os.environ.get("DIABETES_DATA_CSV", "C:\\Users\\tusha\\Data_New1.csv")
data = pd.read_csv(DATA_CSV_PATH)

In [2]:
#This is the data after K-means clustering
# print(data)

## Splitting preprocessed data into train and test

In [3]:
# Columns used as model inputs; 'Outcome' is the prediction target.
feature_columns = [
    'Pregnancies_binarized',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = data[feature_columns]
y = data['Outcome']

# Hold out 25% of the rows for testing, stratified on the target, with a fixed
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

## Naive Bayes on preprocessed data

In [4]:
# Fit Gaussian naive Bayes on the training split and predict the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report how many test rows were misclassified and the resulting accuracy (%).
n_test = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
summary = "Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
print(summary.format(n_test, n_wrong, 100*(1-n_wrong/n_test)))

Number of mislabeled points out of a total 192 points : 51, performance 73.44%


## Logistic Regression on preprocessed data

In [5]:
# Fit a default-configured logistic regression and report its accuracy on both
# the training and the test split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

Training set score: 0.766
Test set score: 0.745




## Testing accuracy after applying only K-means 

In [6]:
# Treat the stored cluster label ('cluster_0_1') as if it were a prediction of
# 'Outcome' and score it against the true labels over the whole dataset.
true_outcome = data['Outcome']
cluster_label = data['cluster_0_1']
print(accuracy_score(true_outcome, cluster_label))
print(confusion_matrix(true_outcome, cluster_label))

0.72265625
[[371 129]
 [ 84 184]]


## Pulling correctly classified data after K-means

In [7]:
# Keep only the rows whose cluster label agrees with the true Outcome.
#
# A boolean mask replaces the previous row-by-row loop, which rebuilt the whole
# DataFrame on every matching row and left `d` undefined when nothing matched.
# The original row index and column order are kept; column dtypes are now
# preserved as well (a round-trip through iterrows() may upcast them).
#
# NOTE(review): this filter uses the target column to choose which rows are
# kept, so scores computed on `d` are not comparable to scores on the full
# dataset and will look optimistic — confirm this is the intended experiment.
agrees_with_cluster = data['Outcome'] == data['cluster_0_1']
d = data.loc[agrees_with_cluster].copy()

In [8]:
#This is the correctly classified data
# d

In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report

## Applying logistic regression on Correctly classified data

In [10]:
# Re-create the feature matrix, target and train/test split from the filtered
# frame `d`, using the same columns and split settings as for the full data.
feature_columns = [
    'Pregnancies_binarized',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = d[feature_columns]
y = d['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [11]:
# Refit logistic regression on the current train split (built from the filtered
# frame) and report train/test accuracy.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

Training set score: 0.962
Test set score: 0.957




In [12]:
# Predict the test split once, then print the confusion matrix followed by the
# accuracy for those predictions.
logreg_test_pred = logreg.predict(X_test)
print(confusion_matrix(y_test, logreg_test_pred))
print(accuracy_score(y_test, logreg_test_pred))


[[93  0]
 [ 6 40]]
0.9568345323741008


## Printing the classification report after logistic regression

In [13]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
logreg_report = classification_report(y_test, logreg.predict(X_test))
print(logreg_report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        93
           1       1.00      0.87      0.93        46

   micro avg       0.96      0.96      0.96       139
   macro avg       0.97      0.93      0.95       139
weighted avg       0.96      0.96      0.96       139



## Performing 10-fold cross validation on correctly classified data

In [14]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validation of the logistic regression on the current X / y
# (KFold is used with its defaults, i.e. consecutive folds without shuffling).
# The reported score is the mean accuracy over the ten folds.
kf = KFold(n_splits=10)
fold_accuracies = cross_val_score(logreg, X, y, cv=kf, scoring='accuracy')
score = fold_accuracies.mean()
print(score)

0.9527597402597403




## Applying Naive Bayes on correctly classified data

In [15]:
# Fit Gaussian naive Bayes on the current training split and predict the
# current test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report how many test rows were misclassified and the resulting accuracy (%).
n_test = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
summary = "Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
print(summary.format(n_test, n_wrong, 100*(1-n_wrong/n_test)))

Number of mislabeled points out of a total 139 points : 3, performance 97.84%


In [16]:
# Per-class precision / recall / F1 of the naive Bayes model on the test split.
gnb_report = classification_report(y_test, gnb.predict(X_test))
print(gnb_report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98        93
           1       0.96      0.98      0.97        46

   micro avg       0.98      0.98      0.98       139
   macro avg       0.97      0.98      0.98       139
weighted avg       0.98      0.98      0.98       139

