In [None]:
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

# --------------------------------------------------
# Paths
# --------------------------------------------------
DATA_PATH = "breast_cancer.csv"
MODEL_DIR = "model/saved_models"
# Create the output directory up front so the joblib dumps cannot fail on a
# missing path; exist_ok makes re-running the cell harmless.
os.makedirs(MODEL_DIR, exist_ok=True)

# --------------------------------------------------
# Load UCI Dataset (CSV)
# --------------------------------------------------
# The file is read with header=None, so these names are applied positionally:
# an identifier, the diagnosis label, then 30 numeric feature columns.
column_names = [
    "id", "diagnosis",
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean",
    "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
    "radius_se", "texture_se", "perimeter_se", "area_se",
    "smoothness_se", "compactness_se", "concavity_se",
    "concave_points_se", "symmetry_se", "fractal_dimension_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst",
    "smoothness_worst", "compactness_worst", "concavity_worst",
    "concave_points_worst", "symmetry_worst",
    "fractal_dimension_worst"
]

df = pd.read_csv(DATA_PATH, header=None, names=column_names)

# --------------------------------------------------
# Preprocessing
# --------------------------------------------------
# Encode the label as 1 for "M" and 0 for "B". Series.map turns every value
# missing from the dict into NaN, which would otherwise surface later as an
# obscure error in the stratified split, so reject unexpected labels here
# (e.g. a header row that was read as data, or stray whitespace).
diagnosis_codes = {"M": 1, "B": 0}
encoded_diagnosis = df["diagnosis"].map(diagnosis_codes)
unmapped_rows = encoded_diagnosis.isna()
if unmapped_rows.any():
    unexpected = df.loc[unmapped_rows, "diagnosis"].unique().tolist()
    raise ValueError(
        f"Unexpected diagnosis label(s) in {DATA_PATH!r}: {unexpected}; "
        f"expected one of {sorted(diagnosis_codes)}"
    )
df["diagnosis"] = encoded_diagnosis

# The identifier carries no predictive signal and the label is the target,
# so neither belongs in the feature matrix.
X = df.drop(["id", "diagnosis"], axis=1)
y = df["diagnosis"]

# --------------------------------------------------
# Train-Test Split
# --------------------------------------------------
# 75/25 split with a fixed seed; stratify=y keeps the class ratio the same
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# --------------------------------------------------
# Feature Scaling
# --------------------------------------------------
# Fit the scaler on the training split only and reuse those statistics for
# the test split, so no test-set information leaks into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models.
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))

# --------------------------------------------------
# Models
# --------------------------------------------------
# Registry of display name -> unfitted estimator. The display name is reused
# for the results table and to derive each saved model's file name.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    )
}

# --------------------------------------------------
# Evaluation Function
# --------------------------------------------------
def evaluate_model(y_true, y_pred, y_prob):
    """Score one set of predictions with six classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, compared against ``y_true`` for every
            metric except AUC.
        y_prob: Scores handed to ``roc_auc_score`` together with ``y_true``.

    Returns:
        A dict with the keys ``"Accuracy"``, ``"AUC"``, ``"Precision"``,
        ``"Recall"``, ``"F1"`` and ``"MCC"``, in that order.
    """
    scores = {"Accuracy": accuracy_score(y_true, y_pred)}

    # AUC is the only metric computed from y_prob instead of y_pred.
    scores["AUC"] = roc_auc_score(y_true, y_prob)

    # The remaining metrics all share the (y_true, y_pred) call shape.
    label_based_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("MCC", matthews_corrcoef),
    )
    for metric_name, scorer in label_based_scorers:
        scores[metric_name] = scorer(y_true, y_pred)

    return scores

# --------------------------------------------------
# Train, Evaluate & Save
# --------------------------------------------------
# Estimators in this tuple are fitted and scored on the standardized
# features; every other model receives the raw, unscaled feature frames.
scaled_model_names = ("Logistic Regression", "KNN")

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Pick the matching train/test feature matrices once so the
    # fit / predict / predict_proba sequence is written a single time.
    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    # Column 1 of predict_proba is the probability of the second class.
    y_prob = model.predict_proba(test_features)[:, 1]

    # One results-table row: the six metrics plus the model's display name.
    metrics = {**evaluate_model(y_test, y_pred, y_prob), "Model": name}
    results.append(metrics)

    # File name is the lower-cased display name with spaces as underscores,
    # e.g. "Random Forest" -> "random_forest.pkl".
    model_file = f"{name.lower().replace(' ', '_')}.pkl"
    joblib.dump(model, os.path.join(MODEL_DIR, model_file))

# --------------------------------------------------
# Results Table
# --------------------------------------------------
# Put the model name first, followed by the metrics in a fixed order.
column_order = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
results_df = pd.DataFrame(results)[column_order]

print("\n=== MODEL COMPARISON TABLE ===\n")
print(results_df.round(4))

# --------------------------------------------------
# Save Test Data
# --------------------------------------------------
# Export the unscaled test features with the label appended as the last
# column; the row index is not written.
test_df = pd.concat((X_test, y_test), axis="columns")
test_df.to_csv("test_data.csv", index=False)
print("\nTest data saved to test_data.csv")


Training Logistic Regression...

Training Decision Tree...

Training KNN...

Training Naive Bayes...

Training Random Forest...

Training XGBoost...

=== MODEL COMPARISON TABLE ===

                 Model  Accuracy     AUC  Precision  Recall      F1     MCC
0  Logistic Regression    0.9650  0.9962     0.9800  0.9245  0.9515  0.9251
1        Decision Tree    0.9580  0.9473     0.9796  0.9057  0.9412  0.9103
2                  KNN    0.9580  0.9860     0.9796  0.9057  0.9412  0.9103
3          Naive Bayes    0.9441  0.9952     1.0000  0.8491  0.9184  0.8830
4        Random Forest    0.9580  0.9955     1.0000  0.8868  0.9400  0.9118
5              XGBoost    0.9790  0.9948     1.0000  0.9434  0.9709  0.9555

Test data saved to test_data.csv
