# Diabetes Predictive Analysis

# Project Overview

This dataset is originally from the National Institute of Diabetes and Digestive
and Kidney Diseases. The objective of the dataset is to diagnostically predict
whether a patient has diabetes based on certain diagnostic measurements
included in the dataset. Several constraints were placed on the selection of
these instances from a larger database. In particular, all patients here are
females at least 21 years old of Pima Indian heritage.

# Importing The Dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data Collection and Analysis

In [None]:
# Load the diabetes CSV into a pandas DataFrame.
# NOTE: the path points at the Kaggle input directory; adjust it when running
# the notebook anywhere else.
data = pd.read_csv("/kaggle/input/diabetes-dataset/diabetes.csv")

In [None]:
# Display the first 10 rows as a quick sanity check of the loaded columns
# and values.
data.head(10)

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Column labels available in the dataset.
data.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
data.describe()

In [None]:
# Peek at 25 randomly chosen labels. No random_state is given, so the rows
# shown differ from run to run.
data["Outcome"].sample(25)

In [None]:
 data["Outcome"].value_counts()

# * 0 -> Non-Diabetic
# * 1 -> Diabetic

# EDA

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this filter is global, so deprecation and convergence
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Pairwise relationships between the columns, coloured by the Outcome label.
sns.pairplot(data, hue = 'Outcome')

In [None]:
# One 20-bin histogram per column, drawn on a single 20x16 inch figure.
data.hist(bins = 20, figsize = (20,16))
plt.show()

In [None]:
# Bar chart of how many rows carry each Outcome label.
sns.countplot(data=data, x='Outcome')
# Label the axes that seaborn just drew on.
plt.gca().set(
    xlabel='Outcome',
    ylabel='Count',
    title='Distribution of Outcome',
)
plt.show()

In [None]:
# Insulin against BMI for all rows, coloured by the Outcome label.
sns.scatterplot(x= 'BMI', y = 'Insulin', data = data, hue = 'Outcome')

# More on Body Mass Index
**BMI is a measure that relates body weight to height. BMI is sometimes used to measure total body fat and whether a person is a healthy weight.**

According to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1457375/, having even a moderately elevated BMI is associated with an increased risk of developing diabetes mellitus complications.

For this dataset, BMI is categorized as follows: 1. Underweight: BMI less than 18.5 2.
Normal weight: BMI between 18.5 and 24.9 3. Overweight: BMI between 25 and 29.9 4. Obesity: BMI of 30 or higher

In [None]:
# Creating  a function to categorise BMI
def bmi_category(bmi):
    """Map a BMI value to its weight-category label.

    The bands are half-open, so every number belongs to exactly one band:
    below 18.5 -> 'Under Weight', [18.5, 25) -> 'Normal Weight',
    [25, 30) -> 'Over Weight', 30 and above -> 'Obese'.

    Args:
        bmi: Body-mass index as a number.

    Returns:
        One of 'Under Weight', 'Normal Weight', 'Over Weight' or 'Obese'.
        NaN compares False against every bound and therefore still ends up
        in the final 'Obese' branch.
    """
    # Each test uses the next band's lower bound (25, 30) rather than
    # 24.9 / 29.9, so values such as 24.95 or 29.9 no longer fall through
    # the gaps between bands and get labelled 'Obese'.
    if bmi < 18.5:
        return 'Under Weight'
    if bmi < 25:
        return 'Normal Weight'
    if bmi < 30:
        return 'Over Weight'
    return 'Obese'
        
        

In [None]:
# Label every row with its BMI band by running bmi_category element-wise.
# NOTE(review): at this point BMI zeros have not yet been replaced with NaN
# (that happens in a later cell), so any row with BMI == 0 is labelled
# 'Under Weight'.
data['BMI_categories'] = data['BMI'].map(bmi_category)

In [None]:
# Show the first rows again to confirm the new BMI_categories column.
data.head()

In [None]:
# Number of rows in each BMI category, most frequent first.
data['BMI_categories'].value_counts()

In [None]:
# Bar labels for the BMI charts. The order is alphabetical because the labels
# are paired positionally with groupby('BMI_categories') counts, which come
# back sorted by group key.
BMI_category_list = ['Normal Weight', 'Obese', 'Over Weight', 'Under Weight']

In [None]:
# Count rows per BMI category and take the bar labels from the grouped result
# itself. Labels and heights then always line up, even when a category is
# absent; a fixed four-label list would no longer match the counts in that
# case.
bmi_category_counts = data.groupby('BMI_categories')['Outcome'].count()
plt.bar(bmi_category_counts.index, bmi_category_counts.values, color='b')
plt.xlabel('BMI category')
plt.ylabel('Count')
plt.title('Distribution of BMI categories')
plt.show()

In [None]:
# Keep only the rows labelled diabetic (Outcome equal to 1).
diabetic_patients = data.loc[data['Outcome'].eq(1)]

In [None]:
# Insulin against BMI for diabetic rows only. The subset contains only
# Outcome == 1, so the hue has a single level here.
sns.scatterplot(x = 'BMI', y = 'Insulin', data = diabetic_patients, hue = 'Outcome')

In [None]:
# Glucose against BMI for diabetic rows only. The subset contains only
# Outcome == 1, so the hue has a single level here.
sns.scatterplot(x = 'BMI', y = 'Glucose', data = diabetic_patients, hue = 'Outcome')

In [None]:
# Insulin against Glucose for diabetic rows only. The subset contains only
# Outcome == 1, so the hue has a single level here.
sns.scatterplot(x = 'Glucose', y = 'Insulin', data = diabetic_patients, hue = 'Outcome')

In [None]:
# Number of diabetic rows in each BMI category (groups sorted by label).
diabetic_patients.groupby ('BMI_categories') ['Outcome'].count()

In [None]:
# Count diabetic rows per BMI category and take the bar labels from the
# grouped result itself. A subset may lack one of the categories, and a fixed
# four-label list would then not match the number of counts.
diabetic_bmi_counts = diabetic_patients.groupby('BMI_categories')['Outcome'].count()
plt.bar(diabetic_bmi_counts.index, diabetic_bmi_counts.values, color='b')
plt.xlabel('BMI_category')
plt.ylabel('Count')
plt.title('Relationship between BMI categories and Diabetes')
plt.show()

# Model Building
> Before building the model, further preparation will be done on the dataset.

In [None]:
# For each column print its name, its distinct values and how many distinct
# values there are, followed by a blank separator line.
for col, values in data.items():
    print(col)
    print(values.unique())
    print(values.nunique())
    print()


In [None]:
# Columns whose 0 entries are replaced with NaN (and later imputed) by the
# following cells, i.e. 0 is treated as a missing reading for these
# measurements.
columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [None]:
# Mark 0 readings in the selected columns as missing.
# np.nan is the canonical spelling and works on every NumPy version; the
# upper-case alias is not available in NumPy 2.x.
for column in columns:
    data[column] = data[column].replace(0, np.nan)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Replace missing values with the mean of their column.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on data[column]. That chained in-place form acts on an
# intermediate object: it triggers a warning on recent pandas and leaves
# `data` unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, before the train/test
# split, so test rows influence the imputed values.
for column in columns:
    data[column] = data[column].fillna(data[column].mean())

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the diabetes label.
y = data['Outcome']
# Feature matrix: every column except the label and the derived text
# category.
x = data.drop(['Outcome', 'BMI_categories'], axis=1)

In [None]:
# Confirm that only feature columns remain in x.
x.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts of the split parts, in the order: train features, train labels,
# test features, test labels.
tuple(len(part) for part in (x_train, y_train, x_test, y_test))

# Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline.
# The features are passed unscaled, and the solver may not converge within
# the default 100 iterations on such data. Any convergence warning would be
# hidden by the global warnings filter set earlier in the notebook, so a
# larger iteration budget is given.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows (logistic regression).
prediction = model.predict(x_test)

In [None]:
# Display the array of predicted labels.
prediction

# KNeighborsClassifier Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier using 7 neighbours per prediction.
# NOTE(review): the features are passed unscaled; distance-based models are
# usually sensitive to feature scale. Consider standardising first and
# confirm the effect on the scores.
k_model = KNeighborsClassifier(n_neighbors=7)
k_model.fit(x_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows (k-nearest neighbours),
# then display them.
k_prediction = k_model.predict(x_test)
k_prediction

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient-boosted trees with default hyper-parameters.
# The seed is fixed, matching the random_state used for the train/test split,
# so the fitted model and the scores reported below are reproducible between
# runs.
GB_model = GradientBoostingClassifier(random_state=42)
GB_model.fit(x_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows (gradient boosting),
# then display them.
GB_prediction = GB_model.predict (x_test)
GB_prediction

# Measuring Performance of the Models 


# For LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, precision_score

In [None]:
# Display name -> scoring function; insertion order is the print order.
metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 score': f1_score,
    'Confusion Matrix': confusion_matrix,
}

# Print each metric for the logistic-regression predictions. Every metric
# prints its name and value; all but 'Accuracy' and 'Confusion Matrix' are
# followed by an extra blank separator.
for metric_name, metric_func in metrics.items():
    print(metric_name)
    print(metric_func(y_test, prediction))
    if metric_name not in ('Confusion Matrix', 'Accuracy'):
        print('\n')

# For KNeighborsClassifier

In [None]:
# Print each metric for the k-nearest-neighbours predictions. Every metric
# prints its name and value; all but 'Accuracy' and 'Confusion Matrix' are
# followed by an extra blank separator.
for metric_name, metric_func in metrics.items():
    print(metric_name)
    print(metric_func(y_test, k_prediction))
    if metric_name not in ('Confusion Matrix', 'Accuracy'):
        print('\n')

#  For GradientBoostingClassifier

In [None]:
# Print each metric for the gradient-boosting predictions. Every metric
# prints its name and value; all but 'Accuracy' and 'Confusion Matrix' are
# followed by an extra blank separator.
for metric_name, metric_func in metrics.items():
    print(metric_name)
    print(metric_func(y_test, GB_prediction))
    if metric_name not in ('Confusion Matrix', 'Accuracy'):
        print('\n')