In [92]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [93]:
diabetes_data = pd.read_csv('/content/drive/MyDrive/ML datasets/Diabetics Prediction (SMV)/diabetes.csv')
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [94]:
diabetes_data.shape

(768, 9)

In [95]:
diabetes_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [96]:
diabetes_data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [97]:
x = diabetes_data.drop('Outcome', axis = 1)
y = diabetes_data['Outcome']

# Train Test Split

In [98]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 1)

In [99]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


In [100]:
models = [LogisticRegression(max_iter = 1000), KNeighborsClassifier(), RandomForestClassifier(), SVC(kernel = 'linear'), GaussianNB()]

In [101]:
def model_selection():
  for model in models:
    model.fit(x_train,y_train)
    training_prediction = model.predict(x_test)
    prediction = accuracy_score(y_test, training_prediction)
    print('Accuracy score of the ', model, ' = ', prediction)


In [102]:
model_selection()

Accuracy score of the  LogisticRegression(max_iter=1000)  =  0.7857142857142857
Accuracy score of the  KNeighborsClassifier()  =  0.7012987012987013
Accuracy score of the  RandomForestClassifier()  =  0.7597402597402597
Accuracy score of the  SVC(kernel='linear')  =  0.7792207792207793
Accuracy score of the  GaussianNB()  =  0.7077922077922078


## Cross validation

In [103]:
# Logistic Regression

cv_score_lr = cross_val_score(LogisticRegression(max_iter = 1000), x, y, cv = 5)
print(cv_score_lr)

mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy = mean_accuracy * 100
mean_accuracy = round(mean_accuracy)
print(mean_accuracy,'%')

[0.77272727 0.74675325 0.75324675 0.81045752 0.77777778]
77 %


In [104]:
# SVC

cv_score_lr = cross_val_score(SVC(kernel ='linear'), x, y, cv = 5)
print(cv_score_lr)

mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy = mean_accuracy * 100
mean_accuracy = round(mean_accuracy)
print(mean_accuracy,'%')

[0.75974026 0.75324675 0.74025974 0.81045752 0.76470588]
77 %


In [105]:
#Random Forest Classifier
cv_score_lr = cross_val_score(RandomForestClassifier(), x, y, cv = 5)
print(cv_score_lr)

mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy = mean_accuracy * 100
mean_accuracy = round(mean_accuracy)
print(mean_accuracy,'%')

[0.75324675 0.72077922 0.77272727 0.83660131 0.75816993]
77 %


In [106]:
#Kneighbors

cv_score_lr = cross_val_score(KNeighborsClassifier(), x, y, cv = 5)
print(cv_score_lr)

mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy = mean_accuracy * 100
mean_accuracy = round(mean_accuracy)
print(mean_accuracy,'%')

[0.72727273 0.72727273 0.7012987  0.75816993 0.70588235]
72 %


In [107]:
#naive bayes

cv_score_lr = cross_val_score(GaussianNB(), x, y, cv = 5)
print(cv_score_lr)

mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy = mean_accuracy * 100
mean_accuracy = round(mean_accuracy)
print(mean_accuracy,'%')

[0.75324675 0.72727273 0.74675325 0.78431373 0.74509804]
75 %


# Creating a function for observing all the cross validation accuracies

In [108]:
def cross_validation():
  for model in models:
    cv_score_lr = cross_val_score(model, x, y, cv = 5)


    mean_accuracy = sum(cv_score_lr)/len(cv_score_lr)
    mean_accuracy = mean_accuracy * 100
    mean_accuracy = round(mean_accuracy)
    print('accuracy of',model, 'is', mean_accuracy,'%')


In [109]:
cross_validation()

accuracy of LogisticRegression(max_iter=1000) is 77 %
accuracy of KNeighborsClassifier() is 72 %
accuracy of RandomForestClassifier() is 77 %
accuracy of SVC(kernel='linear') is 77 %
accuracy of GaussianNB() is 75 %
