In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
data_tree =pd.read_csv(r'D:\Study\AI Advanced\Diabetes\Data\Processed\data_preprocessed_tree.csv')
data_scaled =pd.read_csv(r'D:\Study\AI Advanced\Diabetes\Data\Processed\data_preprocessed_scaled.csv')

In [6]:
data_tree.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male
0,80.0,0,1,4,25.19,6.6,140,0,False
1,54.0,0,0,0,27.32,6.6,80,0,False
2,28.0,0,0,4,27.32,5.7,158,0,True
3,36.0,0,0,1,23.45,5.0,155,0,False
4,76.0,1,1,1,20.14,4.8,155,0,True


In [5]:
data_scaled.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.692704,0,1,-0.32235,1.001706,0.047704,0,False,False,False,False,True,False
1,0.538006,0,0,0.000954,1.001706,-1.42621,0,False,False,False,False,False,False
2,-0.616691,0,0,0.000954,0.161108,0.489878,0,True,False,False,False,True,False
3,-0.261399,0,0,-0.586456,-0.49269,0.416183,0,False,True,False,False,False,False
4,1.515058,1,1,-1.088866,-0.67949,0.416183,0,True,True,False,False,False,False


In [7]:
X_tree = data_tree.drop("diabetes", axis=1)
y_tree = data_tree["diabetes"]

X_scaled = data_scaled.drop("diabetes", axis=1)
y_scaled = data_scaled["diabetes"]

In [8]:
y_tree.value_counts()   # data imbalance 

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Tree models
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(
    X_tree, y_tree, test_size=0.2, random_state=42, stratify=y_tree
)

# Scaled models
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42, stratify=y_scaled
)

In [10]:
X_train_tree.shape, X_test_tree.shape
X_train_scaled.shape, X_test_scaled.shape

((80000, 12), (20000, 12))

Train Tree Models (Desicion Tree)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

dt =DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)

dt.fit(X_train_tree, y_train_tree)

y_pred_dt = dt.predict(X_test_tree)

print("Accuracy:", accuracy_score(y_test_tree, y_pred_dt))
print("F1 Score:", f1_score(y_test_tree, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test_tree, y_pred_dt))

Accuracy: 0.95245
F1 Score: 0.7249059878507377

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     18300
           1       0.71      0.74      0.72      1700

    accuracy                           0.95     20000
   macro avg       0.84      0.85      0.85     20000
weighted avg       0.95      0.95      0.95     20000



Random forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,         
    max_depth=None,           
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    class_weight="balanced"   
)

rf.fit(X_train_tree, y_train_tree)

y_pred_rf = rf.predict(X_test_tree)

print("Accuracy:", accuracy_score(y_test_tree, y_pred_rf))
print("F1 Score:", f1_score(y_test_tree, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test_tree, y_pred_rf))

Accuracy: 0.96995
F1 Score: 0.7960637936884968

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18300
           1       0.94      0.69      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



Hyperparameter Tuning

In [15]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample']
}

In [16]:
rf = RandomForestClassifier(random_state=42)

tuner = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,                
    scoring='f1',             
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

tuner.fit(X_train_tree, y_train_tree)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [17]:
print(tuner.best_params_)

{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None, 'class_weight': 'balanced', 'bootstrap': True}


In [19]:
best_params = tuner.best_params_

best_rf = RandomForestClassifier(
    **best_params,      
    random_state=42
)

best_rf.fit(X_train_tree, y_train_tree)

In [20]:
y_pred_best = best_rf.predict(X_test_tree)

from sklearn.metrics import accuracy_score, f1_score, classification_report

print("Accuracy:", accuracy_score(y_test_tree, y_pred_best))
print("F1 Score:", f1_score(y_test_tree, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test_tree, y_pred_best))

Accuracy: 0.96945
F1 Score: 0.7935113213923622

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18300
           1       0.93      0.69      0.79      1700

    accuracy                           0.97     20000
   macro avg       0.95      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [21]:
%whos

Variable                 Type                      Data/Info
------------------------------------------------------------
DecisionTreeClassifier   ABCMeta                   <class 'sklearn.tree._cla<...>.DecisionTreeClassifier'>
RandomForestClassifier   ABCMeta                   <class 'sklearn.ensemble.<...>.RandomForestClassifier'>
RandomizedSearchCV       ABCMeta                   <class 'sklearn.model_sel<...>arch.RandomizedSearchCV'>
X_scaled                 DataFrame                             age  hyperten<...>100000 rows x 12 columns]
X_test_scaled            DataFrame                             age  hyperten<...>[20000 rows x 12 columns]
X_test_tree              DataFrame                         age  hypertension<...>n[20000 rows x 8 columns]
X_train_scaled           DataFrame                             age  hyperten<...>[80000 rows x 12 columns]
X_train_tree             DataFrame                         age  hypertension<...>n[80000 rows x 8 columns]
X_tree                

In [27]:
protected_vars = {
    'pd', 'np', 'sns', 'plt',
    'data_tree', 'data_scaled',
    'X_train_tree', 'X_test_tree', 'y_train_tree', 'y_test_tree',
    'X_train_scaled', 'X_test_scaled', 'y_train_scaled', 'y_test_scaled',
    'X_tree', 'X_scaled', 'y_tree',
    'protected_vars'
}

for var in list(globals().keys()):
    if var not in protected_vars and not var.startswith("_"):
        try:
            del globals()[var]
        except:
            pass


XGBoost

In [2]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [27]:
# type: ignore
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

pos_weight = (y_train_tree.value_counts()[0] / y_train_tree.value_counts()[1])

xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    scale_pos_weight=2,
    eval_metric="logloss"
)

xgb.fit(X_train_tree, y_train_tree)

y_pred_xgb = xgb.predict(X_test_tree)

print("Accuracy:", accuracy_score(y_test_tree, y_pred_xgb))
print("F1 Score:", f1_score(y_test_tree, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test_tree, y_pred_xgb))

Accuracy: 0.96985
F1 Score: 0.8045380875202593

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     18300
           1       0.90      0.73      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.94      0.86      0.89     20000
weighted avg       0.97      0.97      0.97     20000



(1) weight = 1
-------------
Precision = 0.97  , Recall = 0.69

F1 = 0.81 , Accuracy = 0.972

(2) weight = 2
----------------
Precision = 0.90  , Recall = 0.73

F1 = 0.80  , Accuracy = 0.969

(3) weight = 3
---------------
Precision = 0.79  , Recall = 0.78

F1 = 0.78 , Accuracy = 0.96

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score
y_proba = xgb.predict_proba(X_test_tree)[:, 1]

thresholds = [0.50, 0.45, 0.40, 0.35, 0.30]

for t in thresholds:
    y_pred_thr = (y_proba >= t).astype(int)
    print(f"\n===== Threshold = {t} =====")
    print("Precision:", round(precision_score(y_test_tree, y_pred_thr), 3))
    print("Recall:", round(recall_score(y_test_tree, y_pred_thr), 3))
    print("F1:", round(f1_score(y_test_tree, y_pred_thr), 3))


===== Threshold = 0.5 =====
Precision: 0.973
Recall: 0.692
F1: 0.809

===== Threshold = 0.45 =====
Precision: 0.959
Recall: 0.701
F1: 0.81

===== Threshold = 0.4 =====
Precision: 0.936
Recall: 0.709
F1: 0.807

===== Threshold = 0.35 =====
Precision: 0.899
Recall: 0.727
F1: 0.804

===== Threshold = 0.3 =====
Precision: 0.853
Recall: 0.75
F1: 0.798


In [None]:
y_proba = xgb.predict_proba(X_test_tree)[:, 1]

threshold = 0.45
y_pred_custom = (y_proba >= threshold).astype(int)

print("Accuracy:", accuracy_score(y_test_tree, y_pred_custom))
print("F1:", f1_score(y_test_tree, y_pred_custom))
print("\nReport:\n", classification_report(y_test_tree, y_pred_custom))

Accuracy: 0.9672
F1: 0.7957658779576587

Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     18300
           1       0.85      0.75      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.91      0.87      0.89     20000
weighted avg       0.97      0.97      0.97     20000



Train Linear Models (logistic regression)

In [29]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    solver='lbfgs'
)

log_reg.fit(X_train_scaled, y_train_scaled)

y_pred_log = log_reg.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test_scaled, y_pred_log))
print("F1 Score:", f1_score(y_test_scaled, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test_scaled, y_pred_log))

Accuracy: 0.88905
F1 Score: 0.5778961384820239

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94     18300
           1       0.43      0.89      0.58      1700

    accuracy                           0.89     20000
   macro avg       0.71      0.89      0.76     20000
weighted avg       0.94      0.89      0.91     20000



In [30]:
data_scaled['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

Linear SVM

In [32]:
from sklearn.svm import SVC

linear_svm = SVC(
    kernel='linear',
    class_weight='balanced',
    probability=True
)

linear_svm.fit(X_train_scaled, y_train_scaled)

y_pred_svm = linear_svm.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test_scaled, y_pred_svm))
print("F1 Score:", f1_score(y_test_scaled, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_scaled, y_pred_svm))

Accuracy: 0.88765
F1 Score: 0.5756373937677054

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94     18300
           1       0.42      0.90      0.58      1700

    accuracy                           0.89     20000
   macro avg       0.71      0.89      0.76     20000
weighted avg       0.94      0.89      0.90     20000



SVM RBF

In [33]:
svm_rbf = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,   
    gamma='scale',
    C=1
)

svm_rbf.fit(X_train_scaled, y_train_scaled)

y_pred_rbf = svm_rbf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test_scaled, y_pred_rbf))
print("F1 Score:", f1_score(y_test_scaled, y_pred_rbf))
print("\nClassification Report:\n", classification_report(y_test_scaled, y_pred_rbf))

Accuracy: 0.89345
F1 Score: 0.5947898840083666

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94     18300
           1       0.44      0.92      0.59      1700

    accuracy                           0.89     20000
   macro avg       0.72      0.91      0.77     20000
weighted avg       0.94      0.89      0.91     20000



| Model               | Precision | Recall | F1   |
| ------------------- | --------- | ------ | ---- |
| Logistic Regression | 0.43      | 0.89   | 0.58 |
| Linear SVM          | 0.42      | 0.90   | 0.57 |
| SVM RBF             | 0.44      | 0.92   | 0.59 |
| Decision Tree       | 0.71      | 0.74   | 0.72 |
| Random Forest       | 0.94      | 0.69   | 0.80 |
| XGBoost (best=1)    | 0.85      | 0.75   | 0.80 |
| XGBoost (thr=0.45)  | 0.85      | 0.75   | 0.80 |
