In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('diabetes_prediction_dataset.csv')

print("Shape:", df.shape)
print(df.head())
print(df.isna().sum())

# Target & features
target = 'diabetes'
y = df[target].astype(int)
X = df.drop(columns=[target])

# One-hot for categoricals (simple style from course)
cat_cols = ['gender', 'smoking_history']
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Positive rate in train:", y_train.mean().round(4))


Shape: (100000, 9)
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
Train shape: (80000, 13)  Te

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

log_clf = LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs', random_state=42)
log_clf.fit(X_train, y_train)

y_pred = log_clf.predict(X_test)
y_proba = log_clf.predict_proba(X_test)[:, 1]

print("=== Logistic Regression (no grid) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))



=== Logistic Regression (no grid) ===
Accuracy: 0.88885
Precision: 0.42656557146868856
Recall: 0.8935294117647059
F1: 0.5774567572704808
ROC-AUC: 0.9629510928961749
Confusion matrix:
 [[16258  2042]
 [  181  1519]]
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94     18300
           1       0.43      0.89      0.58      1700

    accuracy                           0.89     20000
   macro avg       0.71      0.89      0.76     20000
weighted avg       0.94      0.89      0.91     20000



In [10]:
from sklearn.model_selection import GridSearchCV

log_base = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
param_grid = {
    'C': [0.01, 0.1, 1, 3, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

gs_log = GridSearchCV(
    estimator=log_base,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)
gs_log.fit(X_train, y_train)

best_log = gs_log.best_estimator_
y_pred = best_log.predict(X_test)
y_proba = best_log.predict_proba(X_test)[:, 1]

print("=== Logistic Regression (Grid Search) ===")
print("Best params:", gs_log.best_params_)
print("Best CV F1:", gs_log.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


python(24745) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24746) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24747) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24748) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24749) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24750) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24751) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24752) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24753) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24754) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


=== Logistic Regression (Grid Search) ===
Best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV F1: 0.5696836730733158
Accuracy: 0.88885
Precision: 0.4266067920291889
Recall: 0.8941176470588236
F1: 0.5776173285198556
ROC-AUC: 0.9629591288974607


In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

svm_clf = SVC(kernel='rbf', C=1.0, gamma='scale', class_weight='balanced',
              probability=True, random_state=42)
svm_clf.fit(X_train_s, y_train)

y_pred = svm_clf.predict(X_test_s)
y_proba = svm_clf.predict_proba(X_test_s)[:, 1]

print("=== SVM (no grid) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== SVM (no grid) ===
Accuracy: 0.89625
Precision: 0.446167097329888
Recall: 0.9141176470588235
F1: 0.5996527107852595
ROC-AUC: 0.9643806975249116


In [12]:
svm_base = SVC(class_weight='balanced',
               probability=False,         # turn OFF for search
               cache_size=1000,
               random_state=42)

svm_grid = {
    'kernel': ['rbf'],                   # keep only rbf
    'C': [0.5, 1, 3],
    'gamma': ['scale', 0.05]             # small grid
}

gs_svm = GridSearchCV(
    estimator=svm_base,
    param_grid=svm_grid,
    scoring='f1',
    cv=3,                                # fewer folds
    n_jobs=-1,
    verbose=1
)

gs_svm.fit(X_train_s, y_train)
print("=== SVM (Grid Search, fast) ===")
print("Best params:", gs_svm.best_params_)
print("Best CV F1:", gs_svm.best_score_)

# Refit once with probabilities for ROC-AUC/report
best_params = gs_svm.best_params_.copy()
svm_final = SVC(class_weight='balanced',
                probability=True,        # turn ON only once
                cache_size=1000,
                random_state=42,
                **best_params)

svm_final.fit(X_train_s, y_train)
y_pred  = svm_final.predict(X_test_s)
y_proba = svm_final.predict_proba(X_test_s)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Fitting 3 folds for each of 6 candidates, totalling 18 fits
=== SVM (Grid Search, fast) ===
Best params: {'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Best CV F1: 0.5949629880609021
Accuracy: 0.89735
Precision: 0.44870677128741643
Recall: 0.908235294117647
F1: 0.6006613499319199
ROC-AUC: 0.9672072645451623


In [13]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
y_proba = xgb_clf.predict_proba(X_test)[:, 1]

print("=== XGBoost (no grid) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== XGBoost (no grid) ===
Accuracy: 0.9722
Precision: 0.9766666666666667
Recall: 0.6894117647058824
F1: 0.8082758620689655
ROC-AUC: 0.9797781742205078


In [14]:
xgb_base = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

xgb_grid = {
    'n_estimators': [200, 400],
    'learning_rate': [0.03, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

gs_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)
gs_xgb.fit(X_train, y_train)

best_xgb = gs_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)
y_proba = best_xgb.predict_proba(X_test)[:, 1]

print("=== XGBoost (Grid Search) ===")
print("Best params:", gs_xgb.best_params_)
print("Best CV F1:", gs_xgb.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Fitting 5 folds for each of 72 candidates, totalling 360 fits


python(25710) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25711) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25712) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25713) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25714) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25715) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25716) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25717) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25718) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25719) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


=== XGBoost (Grid Search) ===
Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best CV F1: 0.8072521821520736
Accuracy: 0.972
Precision: 0.9765886287625418
Recall: 0.6870588235294117
F1: 0.8066298342541437
ROC-AUC: 0.9795537929926068
