In [None]:
# ==========================================
# ML ASSIGNMENT 2 - TRAINING PIPELINE
# ==========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 1. Load Dataset
# scikit-learn's bundled breast-cancer data; the shape printed below is the
# check against the assignment criteria (>12 features, >500 instances).
data = load_breast_cancer()

# One frame holding every feature column plus the label as a trailing 'target' column.
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)

print(f"Dataset Shape: {df.shape}")

# 2. Preprocessing
# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified — confirm whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only and the
# same fitted scaler is then applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Initialize Models
# One instance of each classifier under comparison, keyed by the display name
# that is later used as the key in `results` and `trained_models`.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost does
    # not recognise it and only emits
    # 'Parameters: { "use_label_encoder" } are not used.' while fitting.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# Filled in by the training loop: metric values and fitted estimators,
# both keyed by the model's display name.
results = {}
trained_models = {}

print("\nTraining Models and Calculating Metrics...\n")

# 4. Train and Evaluate
# Ordered (label, scoring function) pairs. The order here is the key order of each
# per-model metrics dict. Only AUC is scored on `y_proba`; the rest use `y_pred`.
metric_functions = (
    ("Accuracy", accuracy_score),
    ("AUC", roc_auc_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
    ("MCC", matthews_corrcoef),
)

for name, model in models.items():
    # Fit on the standardised training features.
    model.fit(X_train_scaled, y_train)

    # Hard class predictions for the held-out rows.
    y_pred = model.predict(X_test_scaled)

    # Scores for AUC: column 1 of predict_proba when the estimator provides it,
    # otherwise fall back to the hard predictions.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_proba = y_pred

    # Evaluate every metric against the held-out labels.
    metrics = {
        label: scorer(y_test, y_proba if label == "AUC" else y_pred)
        for label, scorer in metric_functions
    }

    # Keep both the scores and the fitted estimator under the model's name.
    results[name] = metrics
    trained_models[name] = model

# 5. Display Comparison Table
# `results` maps model name -> {metric: value}, so the raw frame has one column per
# model; transposing gives one row per model and one column per metric.
results_df = pd.DataFrame(results).transpose()
print("=== Model Comparison Table for README ===", results_df.round(decimals=4), sep="\n")

# 6. Save Models and Scaler for Streamlit App
# A single artefact carries the fitted models, the fitted scaler and the feature
# column names, so the app has only one file to load.
bundle = dict(
    models=trained_models,
    scaler=scaler,
    feature_names=X.columns.tolist(),
)

joblib.dump(bundle, 'model_bundle.joblib')
print("\n[SUCCESS] 'model_bundle.joblib' saved. Download this file for your Streamlit app.")

# Generate a sample CSV for testing the app later:
# the unscaled held-out features with their labels attached as a 'target' column.
test_sample = X_test.assign(target=y_test)

# Only the first 20 rows are written; the row index is left out of the file.
test_sample.iloc[:20].to_csv("test_sample.csv", index=False)
print("[SUCCESS] 'test_sample.csv' saved. Use this to test your Streamlit upload.")

Dataset Shape: (569, 31)

Training Models and Calculating Metrics...



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Model Comparison Table for README ===
                     Accuracy     AUC  Precision  Recall  F1 Score     MCC
Logistic Regression    0.9737  0.9974     0.9722  0.9859    0.9790  0.9439
Decision Tree          0.9474  0.9440     0.9577  0.9577    0.9577  0.8880
KNN                    0.9474  0.9820     0.9577  0.9577    0.9577  0.8880
Naive Bayes            0.9649  0.9974     0.9589  0.9859    0.9722  0.9253
Random Forest          0.9649  0.9953     0.9589  0.9859    0.9722  0.9253
XGBoost                0.9561  0.9908     0.9583  0.9718    0.9650  0.9064

[SUCCESS] 'model_bundle.joblib' saved. Download this file for your Streamlit app.
[SUCCESS] 'test_sample.csv' saved. Use this to test your Streamlit upload.


In [None]:
import joblib

# Write each fitted estimator to its own pickle file. The file name is the display
# name lower-cased with spaces turned into underscores, plus a "_model.pkl" suffix
# (e.g. "Logistic Regression" -> logistic_regression_model.pkl).
# NOTE(review): only the estimator is stored here, not the scaler it was trained
# behind — confirm that consumers of these files scale their inputs themselves.
for name, model in trained_models.items():
    slug = "_".join(name.lower().split(" "))
    filename = f"{slug}_model.pkl"
    joblib.dump(model, filename)
    print(f"[SUCCESS] '{filename}' saved.")

[SUCCESS] 'logistic_regression_model.pkl' saved.
[SUCCESS] 'decision_tree_model.pkl' saved.
[SUCCESS] 'knn_model.pkl' saved.
[SUCCESS] 'naive_bayes_model.pkl' saved.
[SUCCESS] 'random_forest_model.pkl' saved.
[SUCCESS] 'xgboost_model.pkl' saved.
