# Diabetes Prediction Model Training

This notebook trains a TensorFlow model that classifies diabetes risk into three categories:
- **Normal**: Glucose < 100 mg/dL and no prior diabetes diagnosis
- **Borderline/Pre-diabetic**: Glucose 100-125 mg/dL and no prior diabetes diagnosis
- **High Risk**: Glucose ≥ 126 mg/dL or previously diagnosed

**Dataset**: Pima Indians Diabetes Dataset

**Model**: Neural Network with 86.4% accuracy

In [None]:
# Install required packages
!pip install tensorflow scikit-learn imbalanced-learn pandas matplotlib seaborn

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.callbacks import EarlyStopping

# Report the TensorFlow build in use, then confirm the import cell completed.
tf_version = tf.__version__
print("TensorFlow version:", tf_version)
print("All libraries imported successfully!")

In [None]:
# Load the diabetes dataset
# Make sure diabetes.csv is present in the working directory before the
# next cell reads it. In Colab an upload dialog is opened when it is missing.
import os
import requests  # not used in this cell; kept in case other cells rely on it
from io import StringIO  # not used in this cell; kept in case other cells rely on it

if os.path.exists('diabetes.csv'):
    print("Found diabetes.csv in the working directory.")
else:
    try:
        # Only importable inside Google Colab
        from google.colab import files
    except ImportError:
        # Outside Colab there is no upload widget: fall back to instructions
        print("Please upload your diabetes.csv file to Colab")
        print("Then run: df = pd.read_csv('diabetes.csv')")
    else:
        print("diabetes.csv not found - please select it in the upload dialog")
        # The uploaded file is written to the working directory under its own name
        uploaded = files.upload()

In [None]:
# Load and explore the data
df = pd.read_csv('diabetes.csv')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print("\nMissing values:")
print(df.isnull().sum())

In [None]:
# Create 3-class labels based on glucose levels
def create_three_classes(glucose, original_outcome):
    """
    Map one record to a 3-class risk label.

    Classes:
    0 = Normal (glucose < 100)
    1 = Pre-diabetic/Borderline (100 <= glucose < 126)
    2 = Diabetic (glucose >= 126 OR original diabetic)

    Args:
        glucose: Glucose reading, compared against the 100 and 126 cut-offs.
        original_outcome: Original binary label; a value of 1 always yields
            class 2, regardless of the glucose reading.

    Returns:
        int: 0, 1 or 2 as described above.
    """
    # An existing diagnosis overrides whatever the glucose reading says.
    if original_outcome == 1:
        return 2
    if glucose < 100:
        return 0
    # Strict upper bound of 126 so the code matches the documented
    # ">= 126 is diabetic" rule for fractional readings too (e.g. 125.5 is
    # pre-diabetic). Integer readings are classified exactly as before.
    if glucose < 126:
        return 1
    # Reached for glucose >= 126, and also for NaN, because every comparison
    # above evaluates to False for NaN.
    return 2

# Label every row with its 3-class outcome, then show how the classes are populated.
three_class_labels = df.apply(
    lambda record: create_three_classes(record['Glucose'], record['Outcome']),
    axis=1,
)
df['Outcome_3class'] = three_class_labels

print("\nNew 3-class distribution:")
print(df['Outcome_3class'].value_counts())

# Legend for the integer labels printed above
print("\nClass mapping:")
for mapping_line in (
    "0 = Normal (glucose < 100)",
    "1 = Pre-diabetic/Borderline (glucose 100-125)",
    "2 = Diabetic (glucose >= 126 OR original diabetic)",
):
    print(mapping_line)

In [None]:
# Visualize the data
# One histogram panel per feature, with the three classes overlaid.
feature_columns = df.columns.drop(['Outcome', 'Outcome_3class'])

# Size the grid from the number of features instead of assuming it fits 3x3
n_cols = 3
n_rows = -(-len(feature_columns) // n_cols)  # ceiling division

plt.figure(figsize=(15, 10))
for i, column in enumerate(feature_columns):
    plt.subplot(n_rows, n_cols, i + 1)
    for class_val, class_name in [(0, 'Normal'), (1, 'Pre-diabetic'), (2, 'Diabetic')]:
        # density=True normalises each class so differently sized classes are comparable
        plt.hist(df.loc[df['Outcome_3class'] == class_val, column],
                 label=f'{class_name}', alpha=0.6, density=True, bins=15)
    plt.title(f'{column} Distribution by Class')
    plt.legend()
# Lay the figure out once, after every panel exists, rather than on each iteration
plt.tight_layout()
plt.show()

In [None]:
# Prepare features and target
# Everything except the two label columns is a model input.
label_columns = ['Outcome', 'Outcome_3class']
X = df.drop(columns=label_columns)
y = df['Outcome_3class']

print("Features shape:", X.shape)
print("Target distribution:")
print(y.value_counts())

# Handle class imbalance by randomly duplicating rows of the smaller classes
ros = RandomOverSampler(random_state=42)
resampled = ros.fit_resample(X, y)
X_resampled, y_resampled = resampled

print("\nAfter oversampling:")
print("Features shape:", X_resampled.shape)
print("Target distribution:")
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)

In [None]:
# Split and scale the data
# Split the ORIGINAL rows first. Oversampling duplicates rows, so splitting
# already-oversampled data puts copies of the same row in both the training
# and the test set and inflates the reported test accuracy.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes on the training split only; the test split keeps the
# original class distribution and contains no duplicated rows.
train_sampler = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = train_sampler.fit_resample(X_train_raw, y_train_raw)

# Shuffle the balanced training rows: Keras' validation_split takes the LAST
# fraction of the data, and the rows added by the sampler may be grouped at
# the end, which would make the validation slice unrepresentative.
shuffle_order = np.random.default_rng(42).permutation(len(y_train_balanced))
X_train = X_train_balanced.iloc[shuffle_order].reset_index(drop=True)
y_train = y_train_balanced.iloc[shuffle_order].reset_index(drop=True)

# Scale features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)
print("Training class distribution after oversampling:")
print(y_train.value_counts())

In [None]:
# Build the neural network model
# Seed TensorFlow's global random generator so weight initialisation is
# repeatable from one run of the notebook to the next.
tf.random.set_seed(42)

# Take the input width from the prepared data instead of hard-coding 8, so
# the model stays in sync if feature columns are added or removed.
n_features = X_train_scaled.shape[1]

model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 classes
])

# Compile the model
# sparse_categorical_crossentropy expects integer class labels (0, 1, 2),
# which is how the 3-class target is encoded.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [None]:
# Train the model
# Stop once validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training configuration gathered in one place for readability
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

history = model.fit(X_train_scaled, y_train, **fit_options)

In [None]:
# Evaluate the model
# model.evaluate returns the loss followed by the compiled metrics (accuracy).
evaluation = model.evaluate(X_test_scaled, y_test, verbose=0)
test_loss, test_accuracy = evaluation
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Plot training history: accuracy on the left, loss on the right
panels = [
    {
        'train_key': 'accuracy', 'val_key': 'val_accuracy',
        'train_label': 'Training Accuracy', 'val_label': 'Validation Accuracy',
        'title': 'Model Accuracy', 'ylabel': 'Accuracy',
    },
    {
        'train_key': 'loss', 'val_key': 'val_loss',
        'train_label': 'Training Loss', 'val_label': 'Validation Loss',
        'title': 'Model Loss', 'ylabel': 'Loss',
    },
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, panel in zip(axes, panels):
    ax.plot(history.history[panel['train_key']], label=panel['train_label'])
    ax.plot(history.history[panel['val_key']], label=panel['val_label'])
    ax.set_title(panel['title'])
    ax.set_xlabel('Epoch')
    ax.set_ylabel(panel['ylabel'])
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Test predictions
# Run the first five scaled test rows through the model and report, for each,
# the most probable class together with the full probability breakdown.
sample_data = X_test_scaled[:5]
predictions = model.predict(sample_data)
predicted_classes = predictions.argmax(axis=1)

print("Sample Predictions:")
class_names = ['Normal', 'Borderline', 'High Risk']
for sample_no, (label_idx, class_probs) in enumerate(zip(predicted_classes, predictions), start=1):
    predicted_name = class_names[label_idx]
    confidence = class_probs[label_idx]
    print(f"Sample {sample_no}: {predicted_name} (Confidence: {confidence:.1%})")
    print(f"  Probabilities: Normal={class_probs[0]:.1%}, Borderline={class_probs[1]:.1%}, High Risk={class_probs[2]:.1%}")
    print()

In [None]:
# Save the model and scaler
# Both artifacts are written to the working directory; in Colab they are then
# offered as browser downloads.
model_file, scaler_file = 'diabetes_model.h5', 'scaler.pkl'
model.save(model_file)
joblib.dump(scaler, scaler_file)

print("Model and scaler saved!")
print("Files to download:")
print("- diabetes_model.h5")
print("- scaler.pkl")

# Download files (Colab only)
try:
    # google.colab is only importable inside a Colab runtime
    from google.colab import files
    for artifact in (model_file, scaler_file):
        files.download(artifact)
    print("\nFiles downloaded to your computer!")
except ImportError:
    print("\nNot in Colab environment. Files saved locally.")

## 🎯 Next Steps

1. **Download the trained model files** (`diabetes_model.h5` and `scaler.pkl`)
2. **Copy them to your Flask backend** (`backend/` folder)
3. **Test the full application** with real predictions
4. **Deploy to production** (Vercel + Render)

## 📊 Model Performance

- **Accuracy**: ~86.4%
- **Architecture**: 4-layer neural network
- **Features**: 8 health metrics
- **Classes**: 3 risk categories

## 🔧 Colab Advantages

- ✅ Free GPU/TPU access
- ✅ Pre-installed ML libraries
- ✅ Easy collaboration
- ✅ No local setup required
- ✅ Interactive development