In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for fname in filenames:
        full_path = os.path.join(dirname, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the Libraries

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix
%matplotlib inline

## Importing the Dataset

In [11]:
# Read the diabetes CSV into a DataFrame and preview its first rows.
csv_path = '../input/diabetes-dataset/diabetes2.csv'
df = pd.read_csv(csv_path)
df.head()

## Exploratory Data Analysis

In [12]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

### Checking Correlation between variables

In [13]:
import matplotlib
# Annotated heatmap of the pairwise correlations between the columns of df.
plt.figure(figsize=(12, 6))
correlations = df.corr()
sns.heatmap(correlations, cmap='cubehelix', annot=True)

### Distribution of Age in Dataset

In [14]:
# Histogram of Age (raw counts, 40 bins).
# sns.distplot is deprecated; histplot is its replacement for a plain
# histogram and plots counts by default, matching distplot(kde=False).
sns.histplot(data=df, x='Age', bins=40)

### Distribution of BMI in the Dataset

In [15]:
# Density-normalised histogram of BMI with a KDE curve overlaid.
# sns.distplot is deprecated; histplot(kde=True, stat='density') reproduces
# its default "histogram + KDE on a density scale" output.
sns.histplot(data=df, x='BMI', kde=True, stat='density', color='red')

### Distribution of Blood Pressure along with Age

In [16]:
# Joint 2-D histogram of BloodPressure against Age, with marginal histograms.
sns.jointplot(data=df, x='BloodPressure', y='Age', kind='hist', color='green')

### Variation of Skin Thickness with Insulin

In [17]:
# Scatter of SkinThickness (x) against Insulin (y).
sns.scatterplot(data=df, x='SkinThickness', y='Insulin')

### Variation of Glucose level with Insulin colored by Diabetes Outcome

In [18]:
# Scatter of Glucose (x) against Insulin (y), coloured by the Outcome column.
sns.scatterplot(data=df, x='Glucose', y='Insulin', hue='Outcome')

### Variation of BMI and Blood Pressure

In [19]:
# Scatter of BMI (x) against BloodPressure (y).
sns.scatterplot(data=df, x='BMI', y='BloodPressure')

### Variation of Glucose Level and Blood Pressure

In [20]:
# Scatter of Glucose (x) against BloodPressure (y).
sns.scatterplot(data=df, x='Glucose', y='BloodPressure')

## Feature Engineering

### Checking Null Values

In [21]:
# Count missing (NaN/None) entries per column; isna() is the same check as isnull().
df.isna().sum()

## Dividing Dataset into X and Y

In [22]:
# Features are every column except the last; the target is the last column.
# Both are extracted as NumPy arrays.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

## Dividing Dataset into Training and Test set

In [23]:
# Hold out 20% of the rows as a test set; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

## Feature Scaling

In [24]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
# NOTE: X_train and X_test are overwritten here, so re-running this cell on its
# own would re-fit on already-scaled data and scale X_test a second time;
# re-run the split cell first.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training Logistic Regression

In [25]:
# Logistic regression fitted on the training split.
classifier1 = LogisticRegression(random_state=0)
classifier1.fit(X=X_train, y=y_train)

In [26]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred1 = classifier1.predict(X_test)
cm1 = confusion_matrix(y_test, y_pred1)
print(cm1, accuracy_score(y_test, y_pred1), sep='\n')

## Training KNN

In [27]:
# k-nearest neighbours with k=5; Minkowski metric with p=2 is Euclidean distance.
classifier2 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifier2.fit(X=X_train, y=y_train)

In [28]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred2 = classifier2.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2, accuracy_score(y_test, y_pred2), sep='\n')

## Training Naive Bayes

In [29]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
classifier3 = GaussianNB()
classifier3.fit(X_train, y_train)

In [30]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred3 = classifier3.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3, accuracy_score(y_test, y_pred3), sep='\n')

## Training Kernel SVM

In [31]:
# Support vector classifier with an RBF kernel, fitted on the training split.
classifier4 = SVC(random_state=0, kernel='rbf')
classifier4.fit(X=X_train, y=y_train)

In [32]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred4 = classifier4.predict(X_test)
cm4 = confusion_matrix(y_test, y_pred4)
print(cm4, accuracy_score(y_test, y_pred4), sep='\n')

## Training Decision Tree 

In [33]:
# Single decision tree using the entropy split criterion, fitted on the training split.
classifier5 = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier5.fit(X=X_train, y=y_train)

In [34]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred5 = classifier5.predict(X_test)
cm5 = confusion_matrix(y_test, y_pred5)
print(cm5, accuracy_score(y_test, y_pred5), sep='\n')

## Training Random Forest

In [35]:
# Random forest of 10 entropy-criterion trees, fitted on the training split.
classifier6 = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier6.fit(X=X_train, y=y_train)

In [36]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred6 = classifier6.predict(X_test)
cm6 = confusion_matrix(y_test, y_pred6)
print(cm6, accuracy_score(y_test, y_pred6), sep='\n')

## Training XGBoost 

In [37]:
# Gradient-boosted trees (XGBoost) with default hyperparameters, fitted on the
# training split. random_state is pinned to 0 so this model is seeded the same
# way as the other seeded classifiers in this notebook and stays reproducible
# if sampling hyperparameters (e.g. subsample) are tuned later.
classifier7 = XGBClassifier(random_state=0)
classifier7.fit(X_train, y_train)

In [38]:
# Predict on the test split, then print the confusion matrix followed by the accuracy.
y_pred7 = classifier7.predict(X_test)
cm7 = confusion_matrix(y_test, y_pred7)
print(cm7, accuracy_score(y_test, y_pred7), sep='\n')

## Performing 10-Fold Cross Validation on XGBoost

In [39]:
# 10-fold cross-validation of classifier7 on the training split; report the
# mean and standard deviation of the per-fold scores as percentages.
# NOTE(review): X_train was already standardised with statistics from the whole
# training split, so each fold's validation rows influenced the scaler. Wrap
# the scaler and model in a Pipeline if a leakage-free estimate is needed.
accuracies = cross_val_score(classifier7, X_train, y_train, cv=10)
mean_pct = accuracies.mean()*100
std_pct = accuracies.std()*100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

# Conclusion: comparing the test-set accuracies of the models above, Logistic Regression gives the most accurate predictions
## Our best test-set accuracy is 82.46%




### Please upvote my notebook if you like it