In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb
import joblib
import warnings

warnings.filterwarnings('ignore')

In [21]:
import os

# Location of the cleaned dataset.
# The default is the original machine-specific path, so existing behaviour is
# unchanged; set the DIABETES_DATA_CSV environment variable to run the notebook
# on another machine without editing this cell.
DATA_CSV = os.environ.get(
    'DIABETES_DATA_CSV',
    r'C:\End-to-End Project\Diabetes Risk Checker\Dataset\cleaned_data.csv',
)

# Load the full dataset into memory as a DataFrame.
df = pd.read_csv(DATA_CSV)

In [22]:
# Sanity check: per-column count of missing values, followed by the
# (rows, columns) shape of the frame — one item per output line.
print(df.isna().sum(), df.shape, sep='\n')

SEX          0
GENHLTH      0
PHYSHLTH     0
CVDSTRK3     0
DIABETE4     0
RFHLTH       0
TOTINDA      0
_MICHD       0
RACE         0
AGEGRP       0
BMI          0
EDUCATION    0
SMOKER       0
Year         0
INCOME       0
dtype: int64
(2153724, 15)


In [23]:
#%pip install --upgrade scikit-learn imbalanced-learn xgboost

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [25]:
# Create the binary target `diabetes_risk` from DIABETE4, then drop the source
# column so it cannot leak into the features.
#
# The derivation is guarded so this cell can be re-run safely: after the first
# run DIABETE4 no longer exists in `df`, and an unguarded re-run would fail
# with KeyError.
#
# NOTE(review): codes 1 and 4 are mapped to the positive class; presumably
# these are the "diabetes" / "pre-diabetes" answers — confirm against the
# dataset codebook, along with how any other non-"no" codes should be treated.
if 'DIABETE4' in df.columns:
    df['diabetes_risk'] = np.where(df['DIABETE4'].isin([1, 4]), 1, 0)
    df = df.drop(columns=['DIABETE4'])
elif 'diabetes_risk' not in df.columns:
    # Neither the raw column nor the derived target is available.
    raise KeyError(
        "DIABETE4 column not found and diabetes_risk has not been created; "
        "reload the dataset before running this cell."
    )

print("Class distribution:\n", df['diabetes_risk'].value_counts(normalize=True))

# Features & Target
X = df.drop('diabetes_risk', axis=1)
y = df['diabetes_risk']



Class distribution:
 diabetes_risk
0    0.84163
1    0.15837
Name: proportion, dtype: float64


In [26]:
# Preview: first two feature rows, a separator rule, then the first five
# target values — each printed on its own line(s).
print(
    X.head(2),
    '============================================================',
    y.head(5),
    sep='\n',
)

   SEX  GENHLTH  PHYSHLTH  CVDSTRK3  RFHLTH  TOTINDA  _MICHD  RACE  AGEGRP  \
0  2.0      2.0       3.0       2.0     1.0      1.0     2.0   1.0     8.0   
1  2.0      3.0      88.0       2.0     1.0      1.0     2.0   2.0    10.0   

      BMI  EDUCATION  SMOKER  Year  INCOME  
0  1660.0        4.0     1.0  2020     5.0  
1  2918.0        4.0     9.0  2020     5.0  
0    1
1    0
2    0
3    0
4    0
Name: diabetes_risk, dtype: int32


In [27]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# --- Split -----------------------------------------------------------------
# Stratified 80/20 hold-out on the original (non-resampled) data; SMOTE is
# deliberately not applied.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Class imbalance -------------------------------------------------------
# Weight for the positive class = (#negatives / #positives) in the training
# labels.
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive
pos_weight = n_negative / n_positive

# --- Model -----------------------------------------------------------------
# Shallow, regularised gradient-boosted trees with a fixed number of rounds.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=200,           # fixed number of boosting rounds
    max_depth=4,                # shallow trees
    learning_rate=0.1,
    subsample=0.8,              # row subsampling per tree
    colsample_bytree=0.8,       # column subsampling per tree
    min_child_weight=5,
    reg_alpha=0.1,              # L1 regularisation
    reg_lambda=1.0,             # L2 regularisation
    scale_pos_weight=pos_weight,
)
model = xgb.XGBClassifier(**xgb_params)

# Fit (no early stopping — just train the fixed number of rounds).
# NOTE(review): no eval_set is passed, so `verbose=True` probably has nothing
# to report during training — confirm.
model.fit(X_train, y_train, verbose=True)

# --- Predictions -----------------------------------------------------------
# Positive-class probabilities, plus the hard labels from the classifier's
# own decision rule.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# --- Threshold tuning ------------------------------------------------------
# Re-label the test set at each candidate probability cut-off and report.
# NOTE(review): cut-offs are scored on the same test split used for the final
# metrics; if several thresholds are compared, choose on a validation split.
thresholds = [0.6]
print("=== Threshold Tuning Results ===")
for thresh in thresholds:
    y_pred_adj = (y_prob >= thresh).astype(int)
    print(f"\n--- Threshold: {thresh} ---")
    print(classification_report(y_test, y_pred_adj, target_names=['Low Risk', 'High Risk']))

# --- Evaluation (default decision rule) ------------------------------------
print("AUC-ROC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

=== Threshold Tuning Results ===

--- Threshold: 0.6 ---
              precision    recall  f1-score   support

    Low Risk       0.92      0.79      0.85    362528
   High Risk       0.36      0.65      0.47     68217

    accuracy                           0.76    430745
   macro avg       0.64      0.72      0.66    430745
weighted avg       0.83      0.76      0.79    430745

AUC-ROC: 0.8027205235945605

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.68      0.79    362528
           1       0.32      0.78      0.45     68217

    accuracy                           0.70    430745
   macro avg       0.63      0.73      0.62    430745
weighted avg       0.84      0.70      0.74    430745


Confusion Matrix:
 [[247759 114769]
 [ 15282  52935]]


In [28]:
import joblib

In [29]:
# Persist the trained classifier to disk with joblib (pickle-based format),
# written relative to the current working directory.
joblib.dump(value=model, filename='xgboost_diabetes_model.pkl')
print("Model saved as xgboost_diabetes_model.pkl")

Model saved as xgboost_diabetes_model.pkl


In [30]:
# Read the persisted classifier back from disk.
# joblib files are pickles: only load artifacts from a trusted source.
loaded_model = joblib.load(filename='xgboost_diabetes_model.pkl')


In [31]:
# Smoke test for the reloaded model: score a single hand-written row.
# The values are listed in the feature-column order shown by the X.head(2)
# output earlier in the notebook, and equal the first row printed there.
sample_data = [[
    2.0,     # SEX
    2.0,     # GENHLTH
    3.0,     # PHYSHLTH
    2.0,     # CVDSTRK3
    1.0,     # RFHLTH
    1.0,     # TOTINDA
    2.0,     # _MICHD
    1.0,     # RACE
    8.0,     # AGEGRP
    1660.0,  # BMI
    4.0,     # EDUCATION
    1.0,     # SMOKER
    2020,    # Year
    5.0,     # INCOME
]]
prediction = loaded_model.predict(sample_data)
print("Sample prediction from loaded model:", prediction)

Sample prediction from loaded model: [0]


In [33]:
from pathlib import Path
import joblib

# Default directory searched for the serialized artifacts: the working
# directory at the time this cell is executed.
BASE_DIR = Path.cwd()


def load_models(base_dir=None):
    """Load the model and its companion artifacts from disk.

    Four joblib files are expected in the same directory:
    ``xgboost_diabetes_model.pkl``, ``scaler.pkl``, ``feature_names.pkl`` and
    ``best_threshold.pkl``. All four are checked before anything is loaded so
    that a single error names every missing file.

    NOTE(review): within the visible cells of this notebook only
    ``xgboost_diabetes_model.pkl`` is written; the other three files must be
    produced elsewhere — confirm where.

    Parameters
    ----------
    base_dir : str or pathlib.Path, optional
        Directory containing the artifact files. Defaults to ``BASE_DIR``.

    Returns
    -------
    tuple
        ``(model, scaler, feature_names, best_threshold)`` in that order,
        each being whatever object was stored in the corresponding file.

    Raises
    ------
    FileNotFoundError
        If one or more of the artifact files does not exist.
    """
    root = BASE_DIR if base_dir is None else Path(base_dir)

    artifact_files = (
        "xgboost_diabetes_model.pkl",
        "scaler.pkl",
        "feature_names.pkl",
        "best_threshold.pkl",
    )
    artifact_paths = [root / file_name for file_name in artifact_files]

    # Fail early with one actionable message rather than on the first
    # missing file deep inside joblib.
    missing = [path.name for path in artifact_paths if not path.exists()]
    if missing:
        raise FileNotFoundError(
            f"Missing model artifact(s) in {root}: {', '.join(missing)}"
        )

    # joblib files are pickles: only load artifacts from a trusted source.
    model, scaler, feature_names, best_threshold = (
        joblib.load(path) for path in artifact_paths
    )
    return model, scaler, feature_names, best_threshold
