In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


import joblib

data_path = '../data/cleaned/Diabetes_Cleaned.csv'
df = pd.read_csv(data_path)

df = df.dropna()  


X = df.drop('Class', axis=1)
y = df['Class']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)








In [4]:
xgboost_model = xgb.XGBClassifier(
    n_estimators=500,        # More trees
    max_depth=4,             # Control complexity
    learning_rate=0.05,      # Slower, stable learning
    subsample=0.8,           # Random row sampling
    colsample_bytree=0.8,    # Random column sampling
    gamma=1,                 # Minimum loss reduction
    reg_alpha=0.1,           # L1 regularization
    reg_lambda=1,            # L2 regularization
    eval_metric='mlogloss',
    random_state=42
)

xgboost_model.fit(X_train, y_train)

xgboost_pred = xgboost_model.predict(X_test)

xgboost_accuracy = accuracy_score(y_test, xgboost_pred)
print(f"XGBoost Accuracy: {xgboost_accuracy:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, xgboost_pred))
print("Classification Report:\n", classification_report(y_test, xgboost_pred))


XGBoost Accuracy: 0.7800
Confusion Matrix:
 [[72 27]
 [17 84]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.73      0.77        99
           1       0.76      0.83      0.79       101

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.78      0.78      0.78       200



In [None]:
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2
)
decision_tree_model.fit(X_train, y_train)

dt_pred = decision_tree_model.predict(X_test)

# Evaluate the model
dt_accuracy = accuracy_score(y_test, dt_pred)
print("========== Decision Tree Classifier ==========")
print(f"Accuracy: {dt_accuracy:.4f}\n")

print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred), "\n")

print("Classification Report:")
print(classification_report(y_test, dt_pred, target_names=['Non-Diabetic', 'Diabetic']))

# Feature importances
feature_importances = pd.Series(decision_tree_model.feature_importances_, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("Top Features by Importance:")
print(feature_importances.head(10))

Accuracy: 0.7600

Confusion Matrix:
[[69 30]
 [18 83]] 

Classification Report:
              precision    recall  f1-score   support

Non-Diabetic       0.79      0.70      0.74        99
    Diabetic       0.73      0.82      0.78       101

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200

Top Features by Importance:
Glucose                     0.491188
BMI                         0.225774
Age                         0.196536
Pregnant                    0.041702
Serum_Insulin               0.040674
Skin_Fold                   0.004126
GlucoseCategory_Diabetes    0.000000
AgeGroup_Middle-aged        0.000000
BMICategory_Obese           0.000000
BMICategory_Normal          0.000000
dtype: float64


In [8]:
# Initialize and train the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Make predictions
log_reg_pred = log_reg_model.predict(X_test)

# Evaluate the model
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)
print("========== Logistic Regression ==========")
print(f"Accuracy: {log_reg_accuracy:.4f}\n")

print("Confusion Matrix:")
print(confusion_matrix(y_test, log_reg_pred), "\n")

print("Classification Report:")
print(classification_report(y_test, log_reg_pred, target_names=['Non-Diabetic', 'Diabetic']))

# Optional: Feature importance (coefficients)
feature_importances = pd.Series(log_reg_model.coef_[0], index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("Top Features by Coefficient:")
print(feature_importances.head(10))

Accuracy: 0.7650

Confusion Matrix:
[[72 27]
 [20 81]] 

Classification Report:
              precision    recall  f1-score   support

Non-Diabetic       0.78      0.73      0.75        99
    Diabetic       0.75      0.80      0.78       101

    accuracy                           0.77       200
   macro avg       0.77      0.76      0.76       200
weighted avg       0.77      0.77      0.76       200

Top Features by Coefficient:
Glucose                     1.277105
AgeGroup_Middle-aged        0.746967
Pregnant                    0.318403
BMICategory_Obese           0.244213
BMI                         0.236917
Skin_Fold                   0.176199
Serum_Insulin               0.122947
Age                        -0.017225
GlucoseCategory_Diabetes   -0.317537
BMICategory_Normal         -1.586355
dtype: float64


In [13]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# GridSearchCV for tuning
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and model
best_rf = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Predict on test set
rf_pred = best_rf.predict(X_test)

# Evaluate
rf_accuracy = accuracy_score(y_test, rf_pred)
print("\n========== Random Forest (Tuned) ==========")
print(f"Accuracy: {rf_accuracy:.4f}\n")

print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred), "\n")

print("Classification Report:")
print(classification_report(y_test, rf_pred, target_names=['Non-Diabetic', 'Diabetic']))

# Feature importance
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("Top Features by Importance:")
print(feature_importances.head(10))

Best Hyperparameters: {'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Accuracy: 0.7950

Confusion Matrix:
[[69 30]
 [11 90]] 

Classification Report:
              precision    recall  f1-score   support

Non-Diabetic       0.86      0.70      0.77        99
    Diabetic       0.75      0.89      0.81       101

    accuracy                           0.80       200
   macro avg       0.81      0.79      0.79       200
weighted avg       0.81      0.80      0.79       200

Top Features by Importance:
Glucose                     0.418209
BMI                         0.219828
Age                         0.180812
Pregnant                    0.074312
Skin_Fold                   0.062393
Serum_Insulin               0.037416
AgeGroup_Middle-aged        0.003831
GlucoseCategory_Diabetes    0.001371
BMICategory_Obese           0.001100
BMICategory_Normal          0.000727
dtype: float64


In [9]:

joblib.dump(xgboost_model, '../models/xgboost_model.pkl')

joblib.dump(decision_tree_model, '../models/decision_tree_model.pkl')

joblib.dump(log_reg_model, '../models/log_reg_model.pkl')


['../models/log_reg_model.pkl']