In [7]:
# Diabetes Prediction using XGBoost with Hyperparameter Tuning

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  # For handling imbalance
import warnings
warnings.filterwarnings('ignore')  # To keep output clean

# --- 1. Load the Dataset ---
# Read the CSV from the current working directory. If it is missing, fail with
# a real exception instead of calling exit(): inside Jupyter exit() shuts the
# kernel down, and as a plain script it terminates with a success status.
# Raising stops the cell, shows a traceback, and keeps the session alive.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'diabetes.csv' not found. Please ensure the file is in the correct directory."
    ) from err
print("Dataset 'diabetes.csv' loaded successfully.")

# --- 2. Explore & Clean Data ---
# Quick look at the raw frame: dimensions, a sample of rows, dtypes/null counts.
print(f"\nDataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nData info:")
df.info()

# A literal 0 in these measurement columns is treated as a missing reading
# rather than a real value, so it is converted to NaN before imputation.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

print("\nMissing values count per column:")
print(df.isnull().sum())

# Impute each column with its own median (robust to outliers).
# The result is assigned back to the frame on purpose: the previous
# `df[col].fillna(..., inplace=True)` form is chained in-place assignment, which
# raises a FutureWarning on pandas 2.x (hidden by the global warnings filter)
# and, with Copy-on-Write enabled, modifies a temporary and leaves `df` untouched.
# NOTE(review): the medians are computed over the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
# Consider fitting the imputation on the training split only.
df[cols_with_zeros] = df[cols_with_zeros].fillna(df[cols_with_zeros].median())

print("\nMissing values after imputation:")
print(df.isnull().sum())

# --- 3. Separate Features and Target ---
# 'Outcome' is the label column; every other column is used as a model input.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# --- 4. Split the data ---
print("\nOriginal target distribution:")
print(y.value_counts())

# Split BEFORE oversampling. Resampling the full dataset first puts synthetic
# rows into the test set and lets rows derived from test points into the
# training set, which inflates every test metric reported below. The held-out
# set must contain only real, untouched observations at the original class ratio.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 5. Handle Class Imbalance with SMOTE (training split only) ---
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Downstream steps train on the balanced training data; X_test / y_test stay as split.
X_train, y_train = X_resampled, y_resampled

print("\nTraining target distribution after SMOTE:")
print(pd.Series(y_train).value_counts())

# --- 6. Scale Features ---
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits so the test data never contributes to the
# scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 7. Baseline XGBoost Model ---
# Untuned reference model: fit on the scaled training data, score on the scaled
# test data, and report accuracy plus the per-class precision/recall/F1 table.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases — confirm whether it is still needed for the installed version.
baseline_params = {
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': 42,
}
xgb = XGBClassifier(**baseline_params)
xgb.fit(X_train_scaled, y_train)

y_pred = xgb.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_test, y_pred)

print(f"\nBaseline Accuracy: {baseline_accuracy:.4f}")
print("Baseline Classification Report:")
print(classification_report(y_test, y_pred))

# --- 8. Cross-validation ---
# 5-fold shuffled cross-validation of the baseline estimator on the training
# data, scored by accuracy. cross_val_score refits clones of `xgb` per fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=xgb,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)
mean_cv_accuracy = cv_scores.mean()

print(f"\nCross-validation accuracy scores: {cv_scores}")
print(f"Mean CV Accuracy: {mean_cv_accuracy:.4f}")

# --- 9. Hyperparameter Tuning ---
# Exhaustive grid: 3 * 3 * 3 * 2 * 2 * 2 * 2 * 2 * 2 = 1728 combinations,
# each evaluated with 3-fold CV (5184 fits) and ranked by accuracy.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    reg_alpha=[0, 0.01],
    reg_lambda=[1, 1.5],
    min_child_weight=[1, 5],
)

# Fresh, unfitted estimator with the same fixed settings as the baseline model.
tuning_estimator = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

grid_search = GridSearchCV(
    tuning_estimator,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,   # use every available core
    verbose=2,   # per-fit progress lines
)

print("\nStarting GridSearchCV (this can take a few minutes)...")
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_
print(f"\nBest Parameters: {best_params}")
print(f"Best Cross-validation Accuracy: {best_cv_accuracy:.4f}")

# --- 10. Evaluate Best Model ---
# Take the estimator selected by the grid search and score it on the same
# scaled test split used for the baseline, so the two reports are comparable.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
tuned_accuracy = accuracy_score(y_test, y_pred_best)

print(f"\nTuned Model Accuracy: {tuned_accuracy:.4f}")
print("Tuned Model Classification Report:")
print(classification_report(y_test, y_pred_best))



Dataset 'diabetes.csv' loaded successfully.

Dataset shape: (768, 9)

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -