In [1]:
!pip install xgboost imbalanced-learn scikit-learn seaborn matplotlib



In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.feature_selection import SelectKBest, f_classif

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import random, os
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)


In [3]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
]

df = pd.read_csv(url, names=columns)

print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
df = pd.read_csv(url, names=columns)
print("Dataset Loaded Successfully!")
print(df.head())

print("\nDataset Info:")
print(df.info())
print("\nMissing Values per Column:")
print(df.isnull().sum())

Dataset Loaded Successfully!
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null  

In [5]:
scaler = StandardScaler()
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

print("\nClass distribution before SMOTE:\n", y.value_counts())

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nClass distribution after SMOTE:\n", y_resampled.value_counts())

selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X_resampled, y_resampled)

selected_features = X.columns[selector.get_support()]
print("\nSelected Features:", list(selected_features))

X_final = pd.DataFrame(X_selected, columns=selected_features)
print("\nFinal preprocessed dataset shape:", X_final.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Class distribution before SMOTE:
 Outcome
0    500
1    268
Name: count, dtype: int64

Class distribution after SMOTE:
 Outcome
1    500
0    500
Name: count, dtype: int64

Selected Features: ['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

Final preprocessed dataset shape: (1000, 5)

Training set shape: (800, 5)
Testing set shape: (200, 5)


In [6]:
# Libraries for Section 3
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import numpy as np

In [None]:
# 1. Define XGBoost parameter grid
xgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

# 2. Set up cross-validation
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 3. GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2
)

print("Training XGBoost with GridSearchCV...")
grid_search_xgb.fit(X_train, y_train)
best_xgb = grid_search_xgb.best_estimator_

print("\nBest XGBoost Parameters:", grid_search_xgb.best_params_)
print("Best XGBoost CV AUC-ROC:", grid_search_xgb.best_score_)

# 4. Define Logistic Regression parameter grid
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

# 5. GridSearchCV for Logistic Regression
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=lr_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2
)

print("\nTraining Logistic Regression with GridSearchCV...")
grid_search_lr.fit(X_train, y_train)
best_lr = grid_search_lr.best_estimator_

print("\nBest Logistic Regression Parameters:", grid_search_lr.best_params_)
print("Best Logistic Regression CV AUC-ROC:", grid_search_lr.best_score_)

# 6. Ensemble Stacking (Meta-learner)
print("\nCreating ensemble with stacking...")

# Generate predictions from base models
xgb_train_pred = best_xgb.predict_proba(X_train)[:, 1]
lr_train_pred = best_lr.predict_proba(X_train)[:, 1]

# Stack predictions
stacked_train = np.column_stack((xgb_train_pred, lr_train_pred))

# Train meta-learner
meta_learner = LogisticRegression(random_state=42, max_iter=1000)
meta_learner.fit(stacked_train, y_train)

print("Ensemble model trained successfully!")

# 7. Feature Importance (XGBoost)
print("\nFeature Importance from XGBoost:")
feature_importance = best_xgb.feature_importances_
for feature, importance in zip(selected_features, feature_importance):
    print(f"{feature}: {importance:.4f}")

Training XGBoost with GridSearchCV...
Fitting 5 folds for each of 5184 candidates, totalling 25920 fits


In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier

pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', XGBClassifier(random_state=42, eval_metric='logloss'))
])

pipeline.fit(X_train, y_train)

In [None]:
import joblib
import json

In [None]:
joblib.dump(pipeline, 'diabetes_model.pkl')

In [None]:
model_info = {
    'selected_features': ['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'],
    'feature_ranges': {
        'Pregnancies': (0, 17),
        'Glucose': (0, 199),
        'BMI': (0, 67.1),
        'DiabetesPedigreeFunction': (0.078, 2.42),
        'Age': (21, 81)
    }
}

with open('model_info.json', 'w') as f:
    json.dump(model_info, f)

print("✅ Model saved successfully as 'diabetes_model.pkl'")
print("✅ Model info saved as 'model_info.json'")