### Train, make Predictions, and Evaluate ML Models against the [Pima Indians Diabetes Dataset](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database), and deploy in Docker

In [None]:
# Install required libraries
%pip install numpy pandas scikit-learn matplotlib seaborn kagglehub xgboost joblib -q


In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Instantiate the classification models used in the first half of the notebook.
# max_iter is raised above the scikit-learn default because this model is later fitted
# on unscaled features, where the solver may not converge within the default budget
# (and the resulting warning would be hidden by the global warnings filter).
lr = LogisticRegression(max_iter=1000)
# n_estimators is the number of trees in the forest; random_state pins the forest's
# randomness so the reported Random Forest scores are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)


<hr />

## Load Data from Kaggle into Pandas DataFrame

In [None]:
# Download the dataset with kagglehub and read the CSV into a DataFrame
import os

import kagglehub

# path_to_dataset is kept as a plain string because a later cell reuses it to reload the CSV
path_to_dataset = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")
df = pd.read_csv(os.path.join(path_to_dataset, "diabetes.csv"))

print()
# df.info() writes its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output
df.info()
print(f"Shape: {df.shape}")
print()
print(df.head())


In [None]:
# Descriptive statistics for every column, transposed so each column becomes one row
print("\nSummary Statistics:")
df.describe().T


In [None]:
# Pairwise correlations between all columns, computed once and reused for both outputs
corr_matrix = df.corr()

# Text version of the correlation values
print("\nCorrelation Values:\n")
print(corr_matrix)

# Same values rendered as an annotated heatmap
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


<hr />

## Data Cleansing

In [None]:
# Cleanse the Data

# Count explicit nulls per column once, then report them only if any exist
null_counts = df.isnull().sum()
df_has_missing_data = null_counts.to_numpy().any()
if not df_has_missing_data:
    print("No missing values found.")
else:
    print(f"\nMissing values:\n{null_counts}")

# Find all columns that contain 0 values
def find_cols_with_zero_for_value(df):
    """
    Report which columns of a DataFrame contain zeros, and how many.

    Two reports are printed: first the names of the zero-containing columns other
    than 'Pregnancies' and 'Outcome', then a zero count for every column of the
    frame (the two skipped columns included) that holds at least one zero.

    Parameters:
        - df: pandas DataFrame

    Returns:
        - List of column names, in frame order, that contain at least one 0,
          excluding 'Pregnancies' and 'Outcome'
    """
    skip_cols = ["Pregnancies", "Outcome"]

    # Tally the zeros of each column once; both reports below are derived from it
    zero_counts = {name: (df[name] == 0).sum() for name in df.columns}

    cols_with_zeros = [
        name
        for name, count in zero_counts.items()
        if count > 0 and name not in skip_cols
    ]
    print(f"\nColumns that have 0 values (excluding {skip_cols}): {cols_with_zeros}")

    # The per-column counts deliberately cover all columns, not just the returned ones
    print("\nColumns with 0 values:")
    for name, count in zero_counts.items():
        if count > 0:
            print(f"{name}: {count}")

    return cols_with_zeros


# Replace 0 with NaN in every column reported by the helper (all zero-containing columns
# except 'Pregnancies' and 'Outcome') so those zeros are handled as missing values
cols_with_zeros = find_cols_with_zero_for_value(df)
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)


# Bar plot of how many values are now missing in each affected column
missing_counts = df[cols_with_zeros].isnull().sum()

plt.figure(figsize=(10, 6))
sns.barplot(x=missing_counts.index, y=missing_counts.values, palette='viridis')
plt.title('Missing Values by Column (after replacing 0s with NaN)')
plt.xlabel('Columns')
plt.ylabel('Number of Missing Values')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Impute every NaN with the median of its column. The filled frame is bound back to
# `df` instead of using inplace=True, so the step is an explicit, chainable assignment.
# NOTE(review): the medians are computed over the whole dataset, before the train/test
# split performed in the next cell, so test-row statistics influence the training features.
print("\nFilling NaN values with median of each column...")
df = df.fillna(df.median())

# Re-run the zero report on the imputed frame. Despite its name, `missing_count` holds
# the list of column names returned by the helper, not a number.
missing_count = find_cols_with_zero_for_value(df)


In [None]:
# Separate the label from the predictors:
# y is the "Outcome" column, X is every other column of the cleaned frame
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nX_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


<hr />

## Train Model and Evaluate Performance

In [None]:
def check_for_overfitting(model, accuracy, X_train, y_train,
                          overfit_threshold=0.1, possible_threshold=0.05):
    """
    Print an overfitting verdict based on the gap between training and test accuracy.

    The model is scored on the training data and the supplied test accuracy is
    subtracted from that score. A gap above ``overfit_threshold`` is reported as
    overfitting, a gap above ``possible_threshold`` as possible overfitting, and
    anything else as no significant overfitting. Nothing is returned.

    Parameters:
        - model: Trained model exposing a ``score(X, y)`` method.
        - accuracy: Test accuracy of the model.
        - X_train: Training feature set.
        - y_train: Training target variable.
        - overfit_threshold: Gap above which overfitting is reported (default 0.1).
        - possible_threshold: Gap above which possible overfitting is reported
          (default 0.05).
    """
    print(f" Checking for overfitting in model: {model.__class__.__name__}")

    # Score the training set a single time and reuse the value for both the printed
    # training accuracy and the gap, instead of calling score() again for the report.
    train_accuracy = model.score(X_train, y_train)
    accuracy_gap = train_accuracy - accuracy

    print("=" * 40)
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Difference (Training - Test): {accuracy_gap:.4f}")
    print()

    if accuracy_gap > overfit_threshold:
        print("⚠️  OVERFITTING DETECTED: Training accuracy is significantly higher than test accuracy")
    elif accuracy_gap > possible_threshold:
        print("⚠️  POSSIBLE OVERFITTING: Training accuracy is moderately higher than test accuracy")
    else:
        print("✅ NO SIGNIFICANT OVERFITTING: Training and test accuracies are reasonably close")


<hr />

### Logistic Regression

In [None]:
# Fit Logistic Regression on the training split and predict the held-out split
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Test accuracy: share of held-out rows whose predicted label matches the true label
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Report the training score, the test accuracy and the per-class metrics table
training_score_lr = lr.score(X_train, y_train)
print(f"\nTraining Score: {training_score_lr}")
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))


In [None]:
# Compare Logistic Regression's training accuracy with its test accuracy
check_for_overfitting(
    model=lr,
    accuracy=accuracy_lr,
    X_train=X_train,
    y_train=y_train,
)


<hr />

### Random Forest

In [None]:
# Fit the Random Forest on the training split and predict the held-out split
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Test accuracy: share of held-out rows whose predicted label matches the true label
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report the training score, the test accuracy and the per-class metrics table
training_score_rf = rf.score(X_train, y_train)
print(f"\nTraining Score: {training_score_rf}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


In [None]:
# Compare the Random Forest's training accuracy with its test accuracy
check_for_overfitting(
    model=rf,
    accuracy=accuracy_rf,
    X_train=X_train,
    y_train=y_train,
)


<hr />

## Make Predictions

In [None]:
def make_prediction(model, scaler, new_sample):
    """
    Scale a new sample, classify it, and print the predicted label.

    The sample is passed through ``scaler.transform`` before ``model.predict`` is
    called. Only the first prediction is reported: a value of 1 prints 'Diabetic',
    any other value prints 'Not Diabetic'. Nothing is returned.

    Parameters:
        - model: Trained machine learning model.
        - scaler: Fitted scaler for feature scaling.
        - new_sample: New sample data as a 2D numpy array.
    """
    scaled_sample = scaler.transform(new_sample)
    predictions = model.predict(scaled_sample)

    # The printed sample is the raw (unscaled) first row, as supplied by the caller
    raw_row = new_sample[0]
    if predictions[0] == 1:
        label = 'Diabetic'
    else:
        label = 'Not Diabetic'
    print(f"Prediction for new sample {raw_row}: {label}")


<hr />

In [None]:
# Reload the dataset and preprocess it from scratch
df = pd.read_csv(path_to_dataset + "/diabetes.csv")

# Cleanse the Data
# Replace 0 with NaN in every zero-containing column except 'Pregnancies' and 'Outcome'
cols_with_zeros = find_cols_with_zero_for_value(df)
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# Impute the NaNs with the mean of each column. The filled columns are assigned back
# to the frame explicitly: calling replace(..., inplace=True) on a df[col] selection is
# chained assignment, which pandas does not guarantee will write through to df (with
# Copy-on-Write it updates only a temporary object and the NaNs stay in df).
# NOTE(review): the means are computed over the whole dataset, before the train/test
# split below, so test-row statistics influence the training features.
df[cols_with_zeros] = df[cols_with_zeros].fillna(df[cols_with_zeros].mean())

# Prepare Features and Target Variables
X = df.iloc[:, :-1].to_numpy()  # Features (all columns except the last)
y = df.iloc[:, -1].to_numpy()   # Target variable (last column)

# Split features and target variable (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale the Data: the scaler is fitted on the training split only and then applied
# unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Summary of the scaled training features, one row per feature column
pd.DataFrame(X_train_scaled, columns=df.columns[:-1]).describe().T


In [None]:
# XGBoost Classifier
from xgboost import XGBClassifier

# Instantiate XGBoost Classifier.
# Per the XGBoost scikit-learn API, the logging level is set with `verbosity` (0 = silent);
# `verbose` is not a constructor parameter and `use_label_encoder` is deprecated, so
# neither is passed.
xgb = XGBClassifier(verbosity=0, eval_metric='logloss')

# Train on the scaled training split and evaluate on the scaled test split
xgb.fit(X_train_scaled, y_train)
y_pred = xgb.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Determine if XGBoost is overfitting
check_for_overfitting(xgb, accuracy, X_train_scaled, y_train)

# Compute the confusion matrix once; it is printed as text and then drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
print(cm)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy}")


In [None]:
# Two hand-written samples, each a single row of eight feature values.
# NOTE(review): the fifth value is 0 in both samples; if that position is one of the
# columns whose zeros were replaced during cleansing, these inputs are not prepared the
# same way as the training data - confirm.
xgb_diabetic_sample = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])      # intended as a Diabetic patient
xgb_non_diabetic_sample = np.array([[1, 85, 66, 29, 0, 26.6, 0.351, 31]])   # intended as a Non-Diabetic patient

# Classify each sample with the trained XGBoost model, in the order listed above
for xgb_sample in (xgb_diabetic_sample, xgb_non_diabetic_sample):
    make_prediction(xgb, scaler, xgb_sample)


<hr />

## Export / Import Model as joblib file

In [None]:
# Train a new Logistic Regression model on the standardized training features
import joblib

lr_final = LogisticRegression()
lr_final.fit(X_train_scaled, y_train)

# Persist the fitted scaler next to the model: the model was trained on scaled features,
# so anything that loads it outside this notebook needs the same scaler to prepare inputs
joblib.dump(scaler, 'pima_diabetes_scaler.joblib')

# Save the Logistic Regression model using joblib
joblib.dump(lr_final, 'pima_diabetes_lr_predicter.joblib')

In [None]:
# Restore the persisted Logistic Regression model from disk
lr_loaded = joblib.load('pima_diabetes_lr_predicter.joblib')

# Hand-written samples for the restored model: the first is intended as a Diabetic
# patient, the second as a Non-Diabetic patient
loaded_model_samples = [
    np.array([[11, 200, 100, 50, 300, 45.0, 1.2, 60]]),
    np.array([[2, 90, 70, 30, 0, 25.0, 0.500, 22]]),
]

# Classify each sample with the loaded model, reusing the scaler fitted earlier in the notebook
for loaded_model_sample in loaded_model_samples:
    make_prediction(lr_loaded, scaler, loaded_model_sample)
