In [1]:
# importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
diabetes_data = pd.read_csv('datasets/diabetes.csv')

## Exploring Data

In [3]:
diabetes_data.head()

In [4]:
# Checking the no. of rows and columns in the dataset
diabetes_data.shape

In [5]:
diabetes_data.columns

In [6]:
diabetes_data.info()

In [7]:
# Getting the statistical measures of the dataset
diabetes_data.describe().T

In [8]:
# checking any None values in the dataset
diabetes_data.isna().values.any()

In [9]:
diabetes_data.Outcome.value_counts()

0 -> Non-Diabetic

1 -> Diabetic

In [10]:
diabetes_data.groupby('Outcome').mean()

In [11]:
# Separating the Features and target variables
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

In [12]:
X.columns

In [13]:
y.name

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

In [15]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

## Model Building

### 1.) Support Vecotor Classifier

In [16]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='rbf',C=10000)

In [17]:
# Training the support vector classifier
svm_classifier.fit(x_train, y_train)

#### SVC Model Evaluation

In [18]:
svm_y_pred = svm_classifier.predict(x_test)
svm_y_pred

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

svc_accuracy = accuracy_score(y_test, svm_y_pred)
svc_precision = precision_score(y_test, svm_y_pred)
svc_recall = recall_score(y_test, svm_y_pred)
svc_f1score = f1_score(y_test, svm_y_pred)

In [20]:
print('Accuracy : {:.2f}%'.format(svc_accuracy*100))
print('Precision : {:.2f}%'.format(svc_precision*100))
print('Recall : {:.2f}%'.format(svc_recall*100))
print('F1 Score : {:.2f}%'.format(svc_f1score*100))

In [21]:
confusion_matrix(y_test, svm_y_pred)

### 2.)  Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

lgr_model = LogisticRegression()

In [23]:
# Training Logistic Regression model
lgr_model.fit(x_train, y_train)

#### Logistic Regression Evaluation

In [24]:
lgr_y_pred = lgr_model.predict(x_test)
lgr_y_pred

In [25]:
lgr_accuracy = accuracy_score(y_test, lgr_y_pred)
lgr_precision = precision_score(y_test, lgr_y_pred)
lgr_recall = recall_score(y_test, lgr_y_pred)
lgr_f1score = f1_score(y_test, lgr_y_pred)

In [26]:
print('Accuracy : {:.2f}%'.format(svc_accuracy*100))
print('Precision : {:.2f}%'.format(lgr_precision*100))
print('Recall : {:.2f}%'.format(lgr_recall*100))
print('F1 score : {:.2f}%'.format(lgr_f1score*100))

In [27]:
confusion_matrix(y_test, lgr_y_pred)

# Conclusion

Both SVC model and Logistic Regression model have same accuracy. But, In the case of looking on other evaluation metrics SVC model is performing more better than Logistic Regression model.  So, SVC has been selected as Final Model.

Final Model - svm_classifier

## Saving the Final Model for Predictive System

In [28]:
import pickle

filename = 'diabetes_model.sav'
pickle.dump(svm_classifier, open(filename, 'wb'))