In [43]:
# CORRECTED TRAINING NOTEBOOK - Run this in Google Colab
# Complete notebook for proper model training

# ============================================
# CELL 1: Load Data
# ============================================
from google.colab import files
import pandas as pd
import io

# Load the dataset from a CSV that must already be present in the Colab
# working directory (this cell does not perform the upload itself).
filename = 'diabetes_clean.csv'
df = pd.read_csv(filename)

print("Dataset loaded successfully into Colab.")
print(f"Original shape: {df.shape}")
print(f"Original columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)
print(f"\nOriginal Diabetes value counts:")
print(df['Diabetes'].value_counts())

# CRITICAL FIX: Convert to Binary Classification
# Based on NHANES coding:
# 1.0 = YES, has Diabetes → 1 (Positive class)
# 2.0 = NO, does not have Diabetes → 0 (Negative class)
# 3.0 = Borderline → 1 (Positive class - treat as "at risk")

print("\n" + "="*60)
print("CONVERTING TO BINARY CLASSIFICATION")
print("="*60)
print("Mapping (CORRECTED for NHANES coding):")
print("  1.0 (Has Diabetes) → 1")
print("  2.0 (No Diabetes) → 0")
print("  3.0 (Borderline) → 1")

# Series.map() with a dict turns every value that is not a key into NaN
# without any warning. Map into a temporary Series first and verify that
# every row was covered, so an unexpected code or a missing value is
# reported here instead of surfacing later as an unrelated training error.
diabetes_binary_map = {1.0: 1, 2.0: 0, 3.0: 1}
diabetes_binary = df['Diabetes'].map(diabetes_binary_map)

unmapped_rows = diabetes_binary.isna()
if unmapped_rows.any():
    unexpected_codes = sorted(
        df.loc[unmapped_rows, 'Diabetes'].dropna().unique().tolist()
    )
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows in 'Diabetes' are missing or outside "
        f"the expected codes 1/2/3 (unexpected codes: {unexpected_codes}). "
        "Clean these rows before converting to a binary target."
    )

# With full coverage guaranteed, store the target as plain integers (0/1).
df['Diabetes'] = diabetes_binary.astype(int)

print(f"\nNew binary Diabetes value counts:")
print(df['Diabetes'].value_counts())
print(f"Class distribution: {df['Diabetes'].value_counts(normalize=True).to_dict()}")
print("="*60)


Dataset loaded successfully into Colab.
Original shape: (29002, 12)
Original columns: ['SEQN', 'Age', 'Gender', 'Diabetes', 'FamilyHistory', 'BMI', 'SystolicBP', 'DiastolicBP', 'A1c', 'Glucose', 'TotalCholesterol', 'Triglycerides']

First few rows:
      SEQN   Age  Gender  Diabetes  FamilyHistory    BMI  SystolicBP  \
0  41479.0  52.0     1.0       2.0            2.0  27.56       112.0   
1  41485.0  30.0     2.0       2.0            2.0  25.99       108.0   
2  41486.0  61.0     2.0       2.0            2.0  31.21       126.0   
3  41487.0  27.0     1.0       2.0            2.0  23.44       120.0   
4  41489.0  40.0     2.0       2.0            2.0  36.59       106.0   

   DiastolicBP  A1c  Glucose  TotalCholesterol  Triglycerides  
0         70.0  5.7     96.2             188.0           84.0  
1         44.0  5.5    104.8             188.0          172.0  
2         64.0  6.1    103.2             194.0          233.0  
3         84.0  5.0    113.0             167.0          124.0 

In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Needed in this cell to identify the training rows before fitting the scaler.
from sklearn.model_selection import train_test_split

# Make a copy to avoid modifying original
df_processed = df.copy()

# --- Step 1: Separate Target Variable FIRST (CRITICAL!) ---
# Remove target before any encoding to prevent data leakage
y = df_processed['Diabetes']  # Save target
X = df_processed.drop(columns=['SEQN', 'Diabetes'])  # Remove ID and target

print("\n=== Step 1: Separated Features and Target ===")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features: {X.columns.tolist()}")

# Numeric columns that get standardized; the categorical columns are
# one-hot encoded further below instead.
scaling_columns = [
    "Age",
    "BMI",
    "SystolicBP",
    "DiastolicBP",
    "A1c",
    "Glucose",
    "TotalCholesterol",
    "Triglycerides"
]

# --- Step 2: Scale numeric features ---
# Fitting the scaler on every row would let the mean/std of the future test
# rows influence the training features (data leakage). Instead, work out
# which rows will form the training set and fit the scaler on those only.
#
# NOTE: these two values must stay identical to the test_size / random_state
# used by the train_test_split call in the training cell. With the same
# number of rows, the same stratify target and the same seed, the split
# selects the same row positions, so the rows used to fit the scaler here are
# exactly the rows that later end up in X_train.
SPLIT_TEST_SIZE = 0.2
SPLIT_RANDOM_STATE = 42

row_positions = pd.RangeIndex(len(X)).to_numpy()
train_positions, _ = train_test_split(
    row_positions,
    test_size=SPLIT_TEST_SIZE,
    random_state=SPLIT_RANDOM_STATE,
    stratify=y,
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions][scaling_columns])
# Transform all rows with the training-only statistics.
X[scaling_columns] = scaler.transform(X[scaling_columns])

print("\n=== Step 2: Feature Scaling Applied ===")
print("Scaled columns:", scaling_columns)

# --- Step 3: One-Hot Encode Categorical Features ---
# Only encode Gender and FamilyHistory (NOT Diabetes!)
categorical_columns = ["Gender", "FamilyHistory"]
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

print("\n=== Step 3: One-Hot Encoding Applied ===")
print(f"Shape after encoding: {X.shape}")
print(f"Final features: {X.columns.tolist()}")
print(f"Number of features: {X.shape[1]}")


=== Step 1: Separated Features and Target ===
Features shape: (29002, 10)
Target shape: (29002,)
Features: ['Age', 'Gender', 'FamilyHistory', 'BMI', 'SystolicBP', 'DiastolicBP', 'A1c', 'Glucose', 'TotalCholesterol', 'Triglycerides']

=== Step 2: Feature Scaling Applied ===
Scaled columns: ['Age', 'BMI', 'SystolicBP', 'DiastolicBP', 'A1c', 'Glucose', 'TotalCholesterol', 'Triglycerides']

=== Step 3: One-Hot Encoding Applied ===
Shape after encoding: (29002, 10)
Final features: ['Age', 'BMI', 'SystolicBP', 'DiastolicBP', 'A1c', 'Glucose', 'TotalCholesterol', 'Triglycerides', 'Gender_2.0', 'FamilyHistory_2.0']
Number of features: 10


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\n=== Train-Test Split ===")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Build and fit the baseline classifier in one step (fit() returns the
# estimator itself).
model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

# Score the held-out rows using the default decision rule of predict().
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n=== Model Training Complete ===")
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print(f"Model expects {model.n_features_in_} features ✓")



=== Train-Test Split ===
Training set: (23201, 10)
Test set: (5801, 10)

=== Model Training Complete ===
Logistic Regression Accuracy: 0.9633
Model expects 10 features ✓


In [47]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Inspect the held-out target first so the right ROC AUC variant is chosen.
test_labels = sorted(y_test.unique())
n_test_classes = len(test_labels)

print("\n=== Checking Target Variable ===")
print(f"Unique values in y_test: {test_labels}")
print(f"Number of classes: {n_test_classes}")
print(f"Value counts:\n{y_test.value_counts()}")

# Confusion matrix and per-class precision/recall/F1 for the default predictions.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print("\n=== Detailed Performance Metrics ===")
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)

# ROC AUC is computed from predicted probabilities, not from hard labels.
y_prob = model.predict_proba(X_test)
if n_test_classes != 2:
    # More (or fewer) than two classes: weighted one-vs-rest AUC over the
    # full probability matrix.
    roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
    print(f"ROC AUC Score (weighted): {roc_auc:.4f}")
else:
    # Two classes: score with the second probability column only.
    roc_auc = roc_auc_score(y_test, y_prob[:, 1])
    print(f"ROC AUC Score: {roc_auc:.4f}")


=== Checking Target Variable ===
Unique values in y_test: [np.int64(0), np.int64(1)]
Number of classes: 2
Value counts:
Diabetes
0    5517
1     284
Name: count, dtype: int64

=== Detailed Performance Metrics ===

Confusion Matrix:
[[5491   26]
 [ 187   97]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5517
           1       0.79      0.34      0.48       284

    accuracy                           0.96      5801
   macro avg       0.88      0.67      0.73      5801
weighted avg       0.96      0.96      0.96      5801

ROC AUC Score: 0.9299


In [48]:
from sklearn.metrics import confusion_matrix, classification_report

# Probability cut-off used instead of predict()'s default decision rule.
# A lower cut-off flags more rows as positive (higher recall, lower precision).
NEW_THRESHOLD = 0.4

# Compute the sorted class labels once. They are needed for the binary check,
# for locating the positive class, and for mapping every thresholded
# prediction back to a label; recomputing them per test row would repeat a
# full pass over y_test for each prediction.
class_labels = sorted(y_test.unique())
is_binary = len(class_labels) == 2

if is_binary:
    # The positive class is the higher of the two labels; look up its column
    # in predict_proba's output through model.classes_.
    positive_class = class_labels[1]
    positive_class_idx = list(model.classes_).index(positive_class)
    y_prob_positive = y_prob[:, positive_class_idx]

    # 1 where the positive-class probability reaches the cut-off, else 0 ...
    above_threshold = (y_prob_positive >= NEW_THRESHOLD).astype(int)
    # ... then translate those 0/1 flags back to the original class labels.
    y_pred_threshold = [class_labels[flag] for flag in above_threshold]
else:
    # For multiclass, just use standard prediction
    y_pred_threshold = y_pred
    print("Note: Threshold adjustment only works for binary classification.")

# Evaluate with new threshold
print(f"\n=== Model Evaluation with Threshold = {NEW_THRESHOLD} ===")

cm_threshold = confusion_matrix(y_test, y_pred_threshold)
print("\nConfusion Matrix:")
print(cm_threshold)

report_threshold = classification_report(y_test, y_pred_threshold, zero_division=0)
print("\nClassification Report:")
print(report_threshold)

# ROC AUC is threshold-independent, so it matches the value from the
# detailed-metrics cell; it is printed again for a side-by-side read.
if is_binary:
    roc_auc_threshold = roc_auc_score(y_test, y_prob_positive)
    print(f"ROC AUC Score: {roc_auc_threshold:.4f}")
else:
    print(f"ROC AUC Score: {roc_auc:.4f} (unchanged)")


=== Model Evaluation with Threshold = 0.4 ===

Confusion Matrix:
[[5476   41]
 [ 170  114]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5517
           1       0.74      0.40      0.52       284

    accuracy                           0.96      5801
   macro avg       0.85      0.70      0.75      5801
weighted avg       0.96      0.96      0.96      5801

ROC AUC Score: 0.9299


In [49]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Output locations for the deployable artifacts.
SAVE_DIR = '/content/'
model_path = SAVE_DIR + "final_lr_model.joblib"
scaler_path = SAVE_DIR + "final_scaler.joblib"

print("\n" + "="*60)
print("TRAINING FINAL MODEL ON COMPLETE DATASET")
print("="*60)

# --- Reload and process complete dataset ---
df_full = pd.read_csv(filename)

# Separate target.
# The freshly reloaded CSV still carries the raw NHANES codes (1.0 = yes,
# 2.0 = no, 3.0 = borderline). They must be converted to the same binary
# target that the evaluated model was trained on; otherwise the exported
# model would be a 3-class model over raw codes and would not match the
# metrics reported above.
diabetes_binary_map = {1.0: 1, 2.0: 0, 3.0: 1}
y_full = df_full['Diabetes'].map(diabetes_binary_map)

# Series.map() yields NaN for anything outside the table; fail loudly here
# rather than letting the fit fail with an unrelated message.
unmapped_rows = y_full.isna()
if unmapped_rows.any():
    unexpected_codes = sorted(
        df_full.loc[unmapped_rows, 'Diabetes'].dropna().unique().tolist()
    )
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows in 'Diabetes' are missing or outside "
        f"the expected codes 1/2/3 (unexpected codes: {unexpected_codes})."
    )
y_full = y_full.astype(int)

X_full = df_full.drop(columns=['SEQN', 'Diabetes'])

print(f"\nFull dataset shape: {X_full.shape}")

# Create new scaler and fit on full data. This is appropriate for the final
# model, since no rows are held out for evaluation any more.
final_scaler = StandardScaler()
X_full[scaling_columns] = final_scaler.fit_transform(X_full[scaling_columns])

# One-hot encode
X_full = pd.get_dummies(X_full, columns=categorical_columns, drop_first=True)

print(f"\nProcessed full dataset:")
print(f"  Shape: {X_full.shape}")
print(f"  Features: {X_full.columns.tolist()}")
print(f"  Number of features: {X_full.shape[1]}")

# Train final model
print("\nTraining final model...")
final_model = LogisticRegression(solver='liblinear', random_state=42)
final_model.fit(X_full, y_full)

# Guard against exporting anything other than the binary 0/1 classifier.
assert list(final_model.classes_) == [0, 1], (
    f"Expected a binary 0/1 model, got classes {list(final_model.classes_)}"
)

print("✓ Training complete!")
print(f"✓ Model trained with {final_model.n_features_in_} features")

# Save model and scaler
joblib.dump(final_model, model_path)
print(f"\n✅ Model saved to: {model_path}")

joblib.dump(final_scaler, scaler_path)
print(f"✅ Scaler saved to: {scaler_path}")

# Display feature list for Streamlit app
print("\n" + "="*60)
print("COPY THIS TO YOUR STREAMLIT APP (app.py):")
print("="*60)
print("\nFINAL_MODEL_FEATURES = [")
for feature in X_full.columns:
    print(f"    '{feature}',")
print("]")
print("\n" + "="*60)

print("\n✨ Model and scaler are ready for deployment!")
print("\nDownloading files...")

# Download the files. The download is best-effort: a failure must not hide
# the fact that the artifacts were saved. Only ordinary errors are caught
# (not KeyboardInterrupt/SystemExit), and the reason is shown to the user.
try:
    files.download(model_path)
    files.download(scaler_path)
    print("✓ Files downloaded successfully!")
except Exception as download_error:
    print("Note: Files saved in Colab. Download manually if needed.")
    print(f"  (automatic download failed: {download_error!r})")

print("\n" + "="*60)
print("NEXT STEPS:")
print("="*60)
print("1. Upload the downloaded .joblib files to your Streamlit app")
print("2. Update FINAL_MODEL_FEATURES in app.py with the list above")
print("3. Run your Streamlit app!")
print("="*60)


TRAINING FINAL MODEL ON COMPLETE DATASET

Full dataset shape: (29002, 10)

Processed full dataset:
  Shape: (29002, 10)
  Features: ['Age', 'BMI', 'SystolicBP', 'DiastolicBP', 'A1c', 'Glucose', 'TotalCholesterol', 'Triglycerides', 'Gender_2.0', 'FamilyHistory_2.0']
  Number of features: 10

Training final model...
✓ Training complete!
✓ Model trained with 10 features

✅ Model saved to: /content/final_lr_model.joblib
✅ Scaler saved to: /content/final_scaler.joblib

COPY THIS TO YOUR STREAMLIT APP (app.py):

FINAL_MODEL_FEATURES = [
    'Age',
    'BMI',
    'SystolicBP',
    'DiastolicBP',
    'A1c',
    'Glucose',
    'TotalCholesterol',
    'Triglycerides',
    'Gender_2.0',
    'FamilyHistory_2.0',
]


✨ Model and scaler are ready for deployment!

Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Files downloaded successfully!

NEXT STEPS:
1. Upload the downloaded .joblib files to your Streamlit app
2. Update FINAL_MODEL_FEATURES in app.py with the list above
3. Run your Streamlit app!
