In [1]:
#Importing Required Libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns 

### Read the Dataset

In [2]:
#Reading the dataset
df = pd.read_csv("diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [4]:
df.describe(include="O")


Unnamed: 0,gender,smoking_history
count,100000,100000
unique,3,6
top,Female,No Info
freq,58552,35816


## Data Processing

### Null Value Analysis

In [5]:
df.isna()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
99995,False,False,False,False,False,False,False,False,False
99996,False,False,False,False,False,False,False,False,False
99997,False,False,False,False,False,False,False,False,False
99998,False,False,False,False,False,False,False,False,False


In [6]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [7]:
# Check the Datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [8]:
#Correcting the datatypes
df.age = df.age.astype(int)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  int32  
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(2), int32(1), int64(4), object(2)
memory usage: 6.5+ MB


## Label Encoding 

In [10]:
from sklearn.preprocessing import LabelEncoder
df[["gender", "smoking_history"]] = df[["gender", "smoking_history"]].apply(LabelEncoder().fit_transform)

In [11]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80,0,1,4,25.19,6.6,140,0
1,0,54,0,0,0,27.32,6.6,80,0
2,1,28,0,0,4,27.32,5.7,158,0
3,0,36,0,0,1,23.45,5.0,155,0
4,1,76,1,1,1,20.14,4.8,155,0


## Model Training

In [12]:
X = df.drop("diabetes" , axis = 1)
y = df.diabetes

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [15]:
X_train, X_test, y_train, y_test

(       gender  age  hypertension  heart_disease  smoking_history    bmi  \
 10382       0    2             0              0                0  16.45   
 73171       0   55             0              0                4  24.59   
 30938       0   24             0              0                0  21.77   
 99310       1   30             0              0                4  27.32   
 58959       1   13             0              0                0  18.37   
 ...       ...  ...           ...            ...              ...    ...   
 21243       1   12             0              0                4  23.68   
 45891       0   66             0              0                4  26.77   
 42613       1   66             0              0                4  41.23   
 43567       0   37             0              0                3  30.18   
 68268       0   52             0              0                0  27.32   
 
        HbA1c_level  blood_glucose_level  
 10382          6.2                  159  


In [16]:
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error, classification_report, confusion_matrix

## Decision Tree Classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
y_pred_dt= dt_classifier.predict(X_test)
y_pred_dt

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred_dt) * 100)
print("\nMean Squared Error: ", mean_squared_error(y_test, y_pred_dt))
print("\nF1 Score: ", f1_score(y_test, y_pred_dt))
print("\nClassification Report\n", classification_report(y_test, y_pred_dt))
print("\nConfusion Matris\n", confusion_matrix(y_test, y_pred_dt))

Accuracy Score:  95.07

Mean Squared Error:  0.0493

F1 Score:  0.7179633867276888

Classification Report
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     18297
           1       0.70      0.74      0.72      1703

    accuracy                           0.95     20000
   macro avg       0.84      0.85      0.85     20000
weighted avg       0.95      0.95      0.95     20000


Confusion Matris
 [[17759   538]
 [  448  1255]]


## Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
y_pred_rf

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## XG Boost


In [20]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(learning_rate = 0.5, n_estimators = 100)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
y_pred_xgb

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred_xgb) * 100)
print("\nMean Squared Error: ", mean_squared_error(y_test, y_pred_xgb))
print("\nF1 Score: ", f1_score(y_test, y_pred_xgb))
print("\nClassification Report\n", classification_report(y_test, y_pred_xgb))
print("\nConfusion Matris\n", confusion_matrix(y_test, y_pred_xgb))

Accuracy Score:  97.03500000000001

Mean Squared Error:  0.02965

F1 Score:  0.8017385489802742

Classification Report
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18297
           1       0.93      0.70      0.80      1703

    accuracy                           0.97     20000
   macro avg       0.95      0.85      0.89     20000
weighted avg       0.97      0.97      0.97     20000


Confusion Matris
 [[18208    89]
 [  504  1199]]
