# Project: Diabetes Prediction

# Description:
Predict whether a person has diabetes or not using features like glucose levels and BMI.

# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load Dataset

In [2]:
df = pd.read_csv("D:/diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# Data Preprocessing Techniques:

# Checking Null Values

In [3]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

# Checking Duplicates Values

In [4]:
df.duplicated().sum()

3854

# Drop Duplicates Values

In [5]:
df_new=df.drop_duplicates()

In [6]:
df_new.duplicated().sum()

0

# Checking Datatypes

In [7]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96146 non-null  object 
 1   age                  96146 non-null  float64
 2   hypertension         96146 non-null  int64  
 3   heart_disease        96146 non-null  int64  
 4   smoking_history      96146 non-null  object 
 5   bmi                  96146 non-null  float64
 6   HbA1c_level          96146 non-null  float64
 7   blood_glucose_level  96146 non-null  int64  
 8   diabetes             96146 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 7.3+ MB


# Statistics Summary

In [8]:
df_new.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0
mean,41.794326,0.077601,0.040803,27.321461,5.532609,138.218231,0.08822
std,22.462948,0.267544,0.197833,6.767716,1.073232,40.909771,0.283616
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.4,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,59.0,0.0,0.0,29.86,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


# Extract Features

In [9]:
X = df_new.drop(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'HbA1c_level', 'diabetes'],axis=1)
y= df_new['diabetes']

In [10]:
X.head()

Unnamed: 0,bmi,blood_glucose_level
0,25.19,140
1,27.32,80
2,27.32,158
3,23.45,155
4,20.14,155


In [11]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: diabetes, dtype: int64

# Split Data into Train & Test 

In [12]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=42)

In [13]:
X_train.shape

(76916, 2)

In [14]:
y_train.shape

(76916,)

# AdaBoostClassifier Model Evaluation

In [15]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
accuracy_score(y_test , abc.predict(X_test))

0.9457098283931358

# GradientBoostingClassifier Model Evaluation

In [16]:
gc = GradientBoostingClassifier()
gc.fit(X_train, y_train)
accuracy_score(y_test , gc.predict(X_test))

0.9457098283931358

# RandomForestClassifier Model Evaluation

In [17]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
accuracy_score(y_test , rf.predict(X_test))

0.9259490379615185

# New Input Data to Predict Diabetes

In [18]:
input_data = np.array([[25.19, 140]])
prediction = gc.predict(input_data)
print(prediction)

[0]


