# Diabetes Prediction - XGBoost Training

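This notebook demonstrates:

1. Loading the Pima Indians Diabetes Dataset
2. Feature engineering and train/test splitting
3. Training XGBoost with hyperparameter tuning
4. Evaluating the model and inspecting feature importances
5. SHAP explainability
6. Saving the trained model artifacts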

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import joblib
import sys
from pathlib import Path

# Add src to path
sys.path.append(str(Path.cwd().parent))

from src.utils.load_data import load_and_preprocess_data
from src.preprocessing.preprocessing import prepare_train_test_split

sns.set_style('whitegrid')
%matplotlib inline

## 1. Load Data

In [None]:
# Read the raw CSV through the project loader.
# NOTE(review): the cleaning done inside load_and_preprocess_data is not
# visible from this notebook; check the helper before relying on it.
data_path = Path('../data/raw/diabetes.csv')
df = load_and_preprocess_data(data_path)

# Quick summary: frame dimensions and the mean of the 'Outcome' column
# (reads as the positive-class share, assuming a 0/1 label; TODO confirm).
prevalence = df['Outcome'].mean()
print(f"Dataset shape: {df.shape}")
print(f"\nDiabetes prevalence: {prevalence:.2%}")

# Last expression in the cell, so the first rows render as a rich table.
df.head()

## 2. Prepare Data

In [None]:
# Hold out 20% of the rows for testing; the helper also returns the
# preprocessor object that later cells use for feature names.
# NOTE(review): no seed is passed here, so confirm the helper fixes one
# internally if the split must be reproducible.
split_result = prepare_train_test_split(df, test_size=0.2)
X_train, X_test, y_train, y_test, preprocessor = split_result

# Report the resulting shapes and the number of model input features.
n_features = len(preprocessor.feature_names)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nFeatures: {n_features}")

## 3. Train XGBoost with Hyperparameter Tuning

In [None]:
# Candidate values for each hyperparameter. RandomizedSearchCV draws
# n_iter=20 combinations from this space instead of trying all of them.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Base model.
# n_jobs=1 on the estimator: the search below already runs the CV fits in
# parallel (n_jobs=-1). Letting every one of those fits also spawn one thread
# per core oversubscribes the CPU and slows the whole search down, so only
# one of the two layers is parallelised.
# NOTE(review): use_label_encoder is believed to be deprecated/ignored on
# recent XGBoost releases; kept for older versions. Confirm against the
# pinned XGBoost version.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=1
)

# Randomized search: 5-fold CV scored by ROC AUC, seeded so the sampled
# parameter combinations are the same on every run.
search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost...")
search.fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by the search
# object after fitting.
best_model = search.best_estimator_
print(f"\nBest parameters: {search.best_params_}")
print(f"Best CV score: {search.best_score_:.4f}")

## 4. Evaluate Model

In [None]:
# Hard class labels, plus the probability column at index 1, for the
# held-out test set.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Accuracy is computed from the hard labels; ROC AUC from the probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"\nClassification Report:\n{report}")

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

## 5. Feature Importance

In [None]:
# Pair each feature name with the model's importance score and keep the
# 15 highest-scoring features, sorted from most to least important.
importance_df = (
    pd.DataFrame({
        'feature': preprocessor.feature_names,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(15)
)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Feature Importances (XGBoost)')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

## 6. SHAP Explainability

In [None]:
# Build a SHAP TreeExplainer around the tuned model; later cells reuse it
# for the waterfall plot and save it alongside the model.
explainer = shap.TreeExplainer(model=best_model)

# SHAP values for the test set, consumed by the summary and waterfall plots
# in the following cells.
shap_values = explainer.shap_values(X_test)

print("SHAP explainer created successfully!")

In [None]:
# Summary plot of the SHAP values over the whole test set, labelled with
# the preprocessor's feature names.
shap.summary_plot(
    shap_values,
    features=X_test,
    feature_names=preprocessor.feature_names,
)

In [None]:
# Waterfall plot for a single test-set prediction.
sample_idx = 0

# waterfall_plot needs a scalar base value. Depending on the SHAP/XGBoost
# versions, expected_value may be a plain scalar or a one-element array, so
# it is normalised to a float here.
base_value = float(np.ravel(explainer.expected_value)[0])

# np.asarray accepts both a DataFrame and an ndarray and returns the row by
# position. This avoids relying on `.iloc` (absent on ndarrays) and avoids
# handing SHAP a label-indexed Series that it would index by integer.
sample_features = np.asarray(X_test)[sample_idx]

shap.waterfall_plot(shap.Explanation(
    values=shap_values[sample_idx],
    base_values=base_value,
    data=sample_features,
    feature_names=preprocessor.feature_names
))

## 7. Save Model

In [None]:
# Persist the fitted model, the preprocessor and the SHAP explainer with joblib.
# NOTE(review): the path is resolved relative to the notebook's working
# directory and climbs two levels, unlike the data path which climbs one.
# Confirm this is the intended backend location.
models_dir = Path('../../backend/models')
models_dir.mkdir(parents=True, exist_ok=True)

# Output filename -> object to serialise.
artifacts = {
    'diabetes_model.joblib': best_model,
    'diabetes_model_preprocessor.joblib': preprocessor,
    'diabetes_model_explainer.joblib': explainer,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, models_dir / filename)

# The original message held mojibake ("âœ…"): the UTF-8 bytes of the check
# mark decoded as cp1252. Restored to the intended character.
print("✅ Model artifacts saved successfully!")