In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import joblib


In [2]:
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ["ID", "Diagnosis"] + [f"feature_{i}" for i in range(1, 31)]
df = pd.read_csv(url, header=None, names=columns)

# Convert target to numeric (M=1, B=0)
df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})
df = df.drop("ID", axis=1)

X = df.drop("Diagnosis", axis=1)
y = df["Diagnosis"]


In [3]:
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split on scaled data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [4]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, solver="lbfgs"),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

In [5]:
# Create project folder and model subfolder
os.makedirs("project-folder", exist_ok=True)
os.makedirs("project-folder/model", exist_ok=True)

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    results.append(metrics)
    
    # Save trained model
    joblib.dump(model, f"project-folder/model/{name.replace(' ', '_')}.pkl")

results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic Regression  0.973684  0.997380   0.976190  0.953488  0.964706   
1        Decision Tree  0.938596  0.936947   0.909091  0.930233  0.919540   
2                  KNN  0.947368  0.981657   0.930233  0.930233  0.930233   
3          Naive Bayes  0.964912  0.997380   0.975610  0.930233  0.952381   
4        Random Forest  0.964912  0.994104   0.975610  0.930233  0.952381   
5              XGBoost  0.956140  0.990829   0.952381  0.930233  0.941176   

        MCC  
0  0.943898  
1  0.870056  
2  0.887979  
3  0.925285  
4  0.925285  
5  0.906379  
