In [None]:
import pandas as pd
import numpy as np
%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay
)

import matplotlib.pyplot as plt

In [None]:
# Column names for wdbc.data.txt: an id, the diagnosis code, then ten
# measurements, each present with a `_mean`, `_se` and `_worst` suffix
# (all `_mean` columns first, then all `_se`, then all `_worst`).
measurements = (
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry",
    "fractal_dimension",
)
suffixes = ("mean", "se", "worst")

cols = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in suffixes
    for measurement in measurements
]

# header=None: the first line of the file is read as data, so the names
# are supplied explicitly.
df = pd.read_csv("wdbc.data.txt", names=cols, header=None)

# Quick sanity check of the loaded frame and of the class balance.
print("First few rows:")
print(df.head())
print("\nClass distribution:")
print(df["diagnosis"].value_counts())


In [None]:
# Binary target: benign ("B") is encoded as 0, malignant ("M") as 1.
diagnosis_to_label = {"B": 0, "M": 1}
df["label"] = df["diagnosis"].map(diagnosis_to_label)

# Target vector, and a feature matrix made of every column except the
# identifier and the two target columns.
y = df["label"]
X = df.drop(columns=["id", "diagnosis", "label"])

# 80/20 hold-out split with a fixed seed, stratified on the label.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics enter the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Algorithm 1: Logistic Regression
# max_iter is set far above the library default so the solver has room to converge.
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train_scaled, y_train)

# Algorithm 2: Support Vector Machine (RBF kernel)
# probability=True makes SVC fit an additional, internally cross-validated
# calibration step that shuffles the data. Without a fixed random_state the
# calibrated probabilities differ from run to run, so the seed is pinned here
# (42, the same value the train/test split uses).
svm_clf = SVC(kernel="rbf", probability=True, random_state=42)
svm_clf.fit(X_train_scaled, y_train)


In [None]:
def evaluate_model(name, model, X_tr, X_te, y_tr, y_te):
    """Report test-set metrics for a fitted binary classifier and plot its ROC curve.

    Prints the accuracy, the classification report, the confusion matrix and
    the ROC-AUC, then shows the ROC curve.

    Parameters
    ----------
    name : str
        Label used in the printed header and in the plot title.
    model : fitted estimator
        Must provide ``predict`` and at least one of ``predict_proba`` or
        ``decision_function``.
    X_tr, y_tr
        Training features and labels. Not used by this function; kept in the
        signature so existing callers keep working.
    X_te, y_te
        Test features and true labels. The report names label 0 "Benign" and
        label 1 "Malignant".

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, confusion_matrix)`` computed on the test set.

    Raises
    ------
    AttributeError
        If ``model`` offers neither ``predict_proba`` nor ``decision_function``.
    """
    print(f"\n=== {name} ===")

    # Predictions (class labels)
    y_pred = model.predict(X_te)

    # Accuracy
    acc = accuracy_score(y_te, y_pred)
    print(f"Accuracy: {acc:.4f}")

    # Classification report
    print("\nClassification report:")
    print(classification_report(y_te, y_pred, target_names=["Benign", "Malignant"]))

    # Confusion matrix
    cm = confusion_matrix(y_te, y_pred)
    print("Confusion matrix:")
    print(cm)

    # ROC-AUC needs a continuous score for the positive class. The
    # scikit-learn method is `predict_proba`; the previous check looked for a
    # non-existent `predict_probability` attribute, so this branch never ran.
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the second entry of model.classes_,
        # i.e. label 1 for a 0/1 target.
        y_scores = model.predict_proba(X_te)[:, 1]
    else:
        # Fallback for estimators that expose only a signed margin.
        y_scores = model.decision_function(X_te)

    auc = roc_auc_score(y_te, y_scores)
    print(f"ROC-AUC: {auc:.4f}")

    # Plots ROC curve
    RocCurveDisplay.from_predictions(y_te, y_scores)
    plt.title(f"ROC Curve - {name}")
    plt.show()

    return acc, auc, cm


In [None]:
# Both models are evaluated on the same scaled train/test partitions, so the
# shared positional arguments are packed once and unpacked into each call.
eval_data = (X_train_scaled, X_test_scaled, y_train, y_test)

acc_lr, auc_lr, cm_lr = evaluate_model("Logistic Regression", log_reg, *eval_data)
acc_svm, auc_svm, cm_svm = evaluate_model("SVM (RBF)", svm_clf, *eval_data)

# Side-by-side recap of the headline metrics for the two models.
print("\nSummary:")
print(f"Logistic Regression  - Accuracy: {acc_lr:.4f}, ROC-AUC: {auc_lr:.4f}")
print(f"SVM (RBF)            - Accuracy: {acc_svm:.4f}, ROC-AUC: {auc_svm:.4f}")
