# Diabetes Prediction in Females

## Importing Necessary Modules/Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier, ExtraTreesClassifier, VotingClassifier

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

## Loading the Dataset

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Investigating the Dataset

In [4]:
df.shape

(768, 9)

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## Separating the Independent and Dependent Columns

In [7]:
X = df.drop(columns='Outcome')
y = df['Outcome']

## Scaling the Data

In [8]:
scaler = StandardScaler()

In [9]:
X_sc = scaler.fit_transform(X)

In [10]:
X_sc

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]], shape=(768, 8))

## Splitting the Data into Train and Test Sets

In [11]:
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Model Training

In [12]:
svc = SVC(kernel='linear',gamma=1.0)
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear',penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=42)
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=42)
xgb = XGBClassifier(n_estimators=50, random_state=42)

## Creating A Dictionary of Models

In [13]:
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'DT' : dtc,
    'LR' : lrc,
    'RF' : rfc,
    'AdaBoost' : abc,
    'BgC' : bc,
    'ETC' : etc,
    'GBDT' : gbdt,
    'xgb' : xgb 
}

## Function to Predict Accuracy and Precision Scores

In [14]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    return accuracy, precision

## Accuracy And Precision with Different Models

In [15]:
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print(f"{name} - Accuracy: {current_accuracy}, Precision: {current_precision}")

SVC - Accuracy: 0.7207792207792207, Precision: 0.6222222222222222
KN - Accuracy: 0.6688311688311688, Precision: 0.5306122448979592
DT - Accuracy: 0.7987012987012987, Precision: 0.7090909090909091
LR - Accuracy: 0.7077922077922078, Precision: 0.5957446808510638
RF - Accuracy: 0.7532467532467533, Precision: 0.6739130434782609
AdaBoost - Accuracy: 0.7792207792207793, Precision: 0.7
BgC - Accuracy: 0.7597402597402597, Precision: 0.6808510638297872
ETC - Accuracy: 0.7402597402597403, Precision: 0.64
GBDT - Accuracy: 0.7532467532467533, Precision: 0.6666666666666666
xgb - Accuracy: 0.7272727272727273, Precision: 0.6153846153846154


## Hyper-Parameter Tuning

In [25]:
param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 0.01, 0.1]
}

grid_svc = GridSearchCV(SVC(), param_grid_svc, cv=3, scoring='accuracy', n_jobs=-1)
grid_svc.fit(X_train, y_train)
print(grid_svc.best_params_)

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [16]:
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5, scoring='accuracy')
grid_knn.fit(X_train, y_train)
print(grid_knn.best_params_)


{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}


In [17]:
param_grid_dt = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy', 'log_loss']
}

grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)
print(grid_dt.best_params_)


{'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 2}


In [18]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
grid_lr.fit(X_train, y_train)
print(grid_lr.best_params_)


{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


In [19]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
print(grid_rf.best_params_)


{'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}


In [20]:
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1]
}

grid_ada = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid_ada, cv=5, scoring='accuracy')
grid_ada.fit(X_train, y_train)
print(grid_ada.best_params_)


{'learning_rate': 0.5, 'n_estimators': 100}


In [21]:
param_grid_bag = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],
    'bootstrap': [True, False]
}

grid_bag = GridSearchCV(BaggingClassifier(random_state=42), param_grid_bag, cv=5, scoring='accuracy')
grid_bag.fit(X_train, y_train)
print(grid_bag.best_params_)


{'bootstrap': True, 'max_samples': 0.7, 'n_estimators': 50}


In [22]:
param_grid_etc = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_etc = GridSearchCV(ExtraTreesClassifier(random_state=42), param_grid_etc, cv=5, scoring='accuracy')
grid_etc.fit(X_train, y_train)
print(grid_etc.best_params_)


{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}


In [23]:
param_grid_gbdt = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

grid_gbdt = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gbdt, cv=5, scoring='accuracy')
grid_gbdt.fit(X_train, y_train)
print(grid_gbdt.best_params_)


{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}


In [24]:
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'subsample': [0.6, 0.8, 1.0]
}

grid_xgb = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(X_train, y_train)
print(grid_xgb.best_params_)


{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.6}
