In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score, f1_score,
    matthews_corrcoef
)

In [2]:
# Load the breast-cancer dataset bundle with pandas containers enabled.
data = load_breast_cancer(as_frame=True)

# Work on a private copy of the bundled frame rather than the bundle's own object.
df = data["frame"].copy()

(df.head(), df.shape)

(   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0        17.99         10.38          122.80     1001.0          0.11840   
 1        20.57         17.77          132.90     1326.0          0.08474   
 2        19.69         21.25          130.00     1203.0          0.10960   
 3        11.42         20.38           77.58      386.1          0.14250   
 4        20.29         14.34          135.10     1297.0          0.10030   
 
    mean compactness  mean concavity  mean concave points  mean symmetry  \
 0           0.27760          0.3001              0.14710         0.2419   
 1           0.07864          0.0869              0.07017         0.1812   
 2           0.15990          0.1974              0.12790         0.2069   
 3           0.28390          0.2414              0.10520         0.2597   
 4           0.13280          0.1980              0.10430         0.1809   
 
    mean fractal dimension  ...  worst texture  worst perimeter  worst area  \

In [3]:
# Stratified subsample of 500 rows: every class gets a share of the 500 that is
# proportional to its share of `df`.
n_total = 500
class_sizes = df["target"].value_counts().sort_index()
exact_share = class_sizes * n_total / len(df)

# Largest-remainder allocation: floor every share, then hand the leftover rows
# to the classes with the biggest fractional parts. Unlike rounding each class
# independently, this always sums to exactly n_total, so there is no need for
# an unstratified `df.sample` fallback that would discard the class balance.
per_class_n = np.floor(exact_share).astype(int)
leftover = n_total - int(per_class_n.sum())
by_fraction = (exact_share - per_class_n).sort_values(ascending=False, kind="stable")
for label in by_fraction.index[:leftover]:
    per_class_n.loc[label] += 1

# Sample each class on its own and concatenate the pieces in label order.
# This replaces groupby(...).apply(...), which recent pandas versions warn
# about when the applied function also receives the grouping column.
df_500 = pd.concat(
    [
        group.sample(n=int(per_class_n.loc[label]), random_state=42)
        for label, group in df.groupby("target")
    ]
)

# Guard the invariant the rest of the notebook relies on.
assert len(df_500) == n_total, f"expected {n_total} rows, got {len(df_500)}"

df_500["target"].value_counts(), df_500.shape


  .apply(lambda g: g.sample(


(target
 1    314
 0    186
 Name: count, dtype: int64,
 (500, 31))

In [4]:
# Keep only the first 12 feature names listed by the dataset bundle.
feature_cols = data.feature_names[:12].tolist()

# Design matrix and label vector, both taken from the 500-row subsample.
X = df_500.loc[:, feature_cols]
y = df_500.loc[:, "target"]

(X.shape, y.shape, feature_cols)


((500, 12),
 (500,),
 ['mean radius',
  'mean texture',
  'mean perimeter',
  'mean area',
  'mean smoothness',
  'mean compactness',
  'mean concavity',
  'mean concave points',
  'mean symmetry',
  'mean fractal dimension',
  'radius error',
  'texture error'])

In [5]:
# Hold out 20% of the rows for testing; stratify on the label and fix the seed
# so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts())


((400, 12),
 (100, 12),
 target
 1    251
 0    149
 Name: count, dtype: int64,
 target
 1    63
 0    37
 Name: count, dtype: int64)

In [6]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Hard labels come from ``model.predict``. The continuous score used for AUC
    is column 1 of ``model.predict_proba`` when the model has that method,
    otherwise the output of ``model.decision_function``. If the model has
    neither, AUC is reported as NaN.

    Parameters
    ----------
    model : object with ``predict`` (and optionally ``predict_proba`` or
        ``decision_function``).
    X_test : features passed to the model's prediction methods.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    dict
        Keys, in order: "Accuracy", "Precision", "Recall", "F1", "MCC", "AUC".
    """
    predicted = model.predict(X_test)

    # Pick a continuous score for AUC; stays None when the model offers none.
    scores = None
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)

    # Label-based metrics, listed in the order they appear in the result.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("MCC", matthews_corrcoef),
    )
    report = {label: metric(y_test, predicted) for label, metric in label_metrics}
    report["AUC"] = np.nan if scores is None else roc_auc_score(y_test, scores)
    return report


In [7]:
from sklearn.linear_model import LogisticRegression

# Scale the features, then fit logistic regression on the scaled values.
logreg_steps = [
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000, random_state=42)),
]
logreg = Pipeline(steps=logreg_steps)
logreg.fit(X_train, y_train)

# Held-out metrics for the fitted pipeline.
logreg_metrics = evaluate_model(model=logreg, X_test=X_test, y_test=y_test)
logreg_metrics


{'Accuracy': 0.9,
 'Precision': 0.9076923076923077,
 'Recall': 0.9365079365079365,
 'F1': 0.921875,
 'MCC': np.float64(0.7838182461147938),
 'AUC': np.float64(0.9725439725439725)}

In [8]:
# Start the comparison list with the logistic-regression row; later cells add
# one row per model.
results = [{"Model": "Logistic Regression", **logreg_metrics}]

results_df = pd.DataFrame(data=results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544


In [9]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with its depth / split / leaf settings spelled out.
dtree_params = {
    "random_state": 42,
    "max_depth": None,        # baseline; can optionally be capped later
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}
dtree = DecisionTreeClassifier(**dtree_params)

dtree.fit(X=X_train, y=y_train)


In [10]:
# Held-out metrics for the decision tree.
dtree_metrics = evaluate_model(model=dtree, X_test=X_test, y_test=y_test)
dtree_metrics


{'Accuracy': 0.86,
 'Precision': 0.9016393442622951,
 'Recall': 0.873015873015873,
 'F1': 0.8870967741935484,
 'MCC': np.float64(0.7036458824610665),
 'AUC': np.float64(0.8554268554268555)}

In [11]:
# Add (or refresh) the Decision Tree row. Any earlier row with the same model
# name is dropped first, so re-running this cell does not duplicate it.
dtree_row = {"Model": "Decision Tree", **dtree_metrics}
results = [row for row in results if row["Model"] != dtree_row["Model"]]
results.append(dtree_row)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427


In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Scale the features, then classify with k-nearest neighbours.
knn_steps = [
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5)),  # baseline k=5
]
knn = Pipeline(steps=knn_steps)

knn.fit(X=X_train, y=y_train)


In [13]:
# Held-out metrics for the KNN pipeline.
knn_metrics = evaluate_model(model=knn, X_test=X_test, y_test=y_test)
knn_metrics


{'Accuracy': 0.92,
 'Precision': 0.9230769230769231,
 'Recall': 0.9523809523809523,
 'F1': 0.9375,
 'MCC': np.float64(0.8272430796945608),
 'AUC': np.float64(0.9562419562419562)}

In [14]:
# Add (or refresh) the KNN row. Any earlier row with the same model name is
# dropped first, so re-running this cell does not duplicate it.
knn_row = {"Model": "KNN (k=5)", **knn_metrics}
results = [row for row in results if row["Model"] != knn_row["Model"]]
results.append(knn_row)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427
2,KNN (k=5),0.92,0.923077,0.952381,0.9375,0.827243,0.956242


In [15]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, constructed with no arguments and fitted on the
# unscaled training features.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)


In [16]:
# Held-out metrics for Gaussian naive Bayes.
gnb_metrics = evaluate_model(model=gnb, X_test=X_test, y_test=y_test)
gnb_metrics


{'Accuracy': 0.88,
 'Precision': 0.8805970149253731,
 'Recall': 0.9365079365079365,
 'F1': 0.9076923076923077,
 'MCC': np.float64(0.7395800507424896),
 'AUC': np.float64(0.9566709566709566)}

In [17]:
# Add (or refresh) the Gaussian Naive Bayes row. Any earlier row with the same
# model name is dropped first, so re-running this cell does not duplicate it.
gnb_row = {"Model": "Gaussian Naive Bayes", **gnb_metrics}
results = [row for row in results if row["Model"] != gnb_row["Model"]]
results.append(gnb_row)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427
2,KNN (k=5),0.92,0.923077,0.952381,0.9375,0.827243,0.956242
3,Gaussian Naive Bayes,0.88,0.880597,0.936508,0.907692,0.73958,0.956671


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees, fixed seed, n_jobs=-1, no depth cap set.
rf_params = dict(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    max_depth=None,
)
rf = RandomForestClassifier(**rf_params)

rf.fit(X=X_train, y=y_train)


In [19]:
# Held-out metrics for the random forest.
rf_metrics = evaluate_model(model=rf, X_test=X_test, y_test=y_test)
rf_metrics


{'Accuracy': 0.9,
 'Precision': 0.9206349206349206,
 'Recall': 0.9206349206349206,
 'F1': 0.9206349206349206,
 'MCC': np.float64(0.7854997854997855),
 'AUC': np.float64(0.9708279708279708)}

In [20]:
# Add (or refresh) the Random Forest row. Any earlier row with the same model
# name is dropped first, so re-running this cell does not duplicate it.
rf_row = {"Model": "Random Forest", **rf_metrics}
results = [row for row in results if row["Model"] != rf_row["Model"]]
results.append(rf_row)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427
2,KNN (k=5),0.92,0.923077,0.952381,0.9375,0.827243,0.956242
3,Gaussian Naive Bayes,0.88,0.880597,0.936508,0.907692,0.73958,0.956671
4,Random Forest,0.9,0.920635,0.920635,0.920635,0.7855,0.970828


In [21]:
from xgboost import XGBClassifier

# Gradient-boosted trees; hyperparameters gathered in one mapping so they can
# be read (and tuned) in a single place.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "logloss",
}
xgb = XGBClassifier(**xgb_params)

xgb.fit(X_train, y_train)


In [22]:
# Held-out metrics for XGBoost.
xgb_metrics = evaluate_model(model=xgb, X_test=X_test, y_test=y_test)
xgb_metrics


{'Accuracy': 0.9,
 'Precision': 0.9344262295081968,
 'Recall': 0.9047619047619048,
 'F1': 0.9193548387096774,
 'MCC': np.float64(0.7885759829391675),
 'AUC': np.float64(0.9716859716859717)}

In [23]:
# Add (or refresh) the XGBoost row. Any earlier row with the same model name
# is dropped first, so re-running this cell does not duplicate it.
xgb_row = {"Model": "XGBoost", **xgb_metrics}
results = [row for row in results if row["Model"] != xgb_row["Model"]]
results.append(xgb_row)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427
2,KNN (k=5),0.92,0.923077,0.952381,0.9375,0.827243,0.956242
3,Gaussian Naive Bayes,0.88,0.880597,0.936508,0.907692,0.73958,0.956671
4,Random Forest,0.9,0.920635,0.920635,0.920635,0.7855,0.970828
5,XGBoost,0.9,0.934426,0.904762,0.919355,0.788576,0.971686


In [24]:
# Rank the models from highest to lowest test-set AUC (display only).
results_df.sort_values(by="AUC", ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC,AUC
0,Logistic Regression,0.9,0.907692,0.936508,0.921875,0.783818,0.972544
5,XGBoost,0.9,0.934426,0.904762,0.919355,0.788576,0.971686
4,Random Forest,0.9,0.920635,0.920635,0.920635,0.7855,0.970828
3,Gaussian Naive Bayes,0.88,0.880597,0.936508,0.907692,0.73958,0.956671
2,KNN (k=5),0.92,0.923077,0.952381,0.9375,0.827243,0.956242
1,Decision Tree,0.86,0.901639,0.873016,0.887097,0.703646,0.855427


In [25]:
import os
import json
import joblib

SAVE_DIR = "model"
os.makedirs(SAVE_DIR, exist_ok=True)

# Fitted estimators keyed by the file stem each one is written under.
models_to_save = dict(
    logreg=logreg,
    dtree=dtree,
    knn=knn,
    gnb=gnb,
    rf=rf,
    xgb=xgb,
)

# One joblib file per model: <SAVE_DIR>/<stem>.pkl
for name, model in models_to_save.items():
    model_path = os.path.join(SAVE_DIR, name + ".pkl")
    joblib.dump(model, model_path)

# Metrics table, written both as CSV (no index column) and as JSON records.
metrics_csv_path = os.path.join(SAVE_DIR, "metrics.csv")
metrics_json_path = os.path.join(SAVE_DIR, "metrics.json")
results_df.to_csv(metrics_csv_path, index=False)
results_df.to_json(metrics_json_path, orient="records", indent=2)

# Feature names, in the same order used to build X.
feature_columns_path = os.path.join(SAVE_DIR, "feature_columns.json")
with open(feature_columns_path, "w") as f:
    json.dump(feature_cols, f, indent=2)

print("Saved files:", os.listdir(SAVE_DIR))


Saved files: ['knn.pkl', 'dtree.pkl', 'xgb.pkl', 'metrics.csv', 'rf.pkl', 'metrics.json', 'gnb.pkl', 'logreg.pkl', 'feature_columns.json']
