In [8]:
# Breast Cancer Classification (UCI Dataset) by Arif
# Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# XGBoost
from xgboost import XGBClassifier

In [9]:
# Load the UCI breast cancer dataset bundled with scikit-learn.
# `data` is kept around because later cells read data.feature_names and
# data.target_names from it.
data = load_breast_cancer()

# Feature matrix as a labelled DataFrame; targets as a Series of class ids
# whose meaning is given by data.target_names (printed below).
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

print("Dataset Shape:", X.shape)
print("Target classes:", data.target_names)

Dataset Shape: (569, 30)
Target classes: ['malignant' 'benign']


In [10]:
# Train/test split: hold out 20% of the rows for evaluation.
# stratify=y keeps the class proportions the same in both partitions, and the
# fixed random_state makes the partition identical on every run.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Feature scaling: standardize every column.
# The scaler is fitted on the training split only, and the statistics it
# learned there are reused for the test split, so no information from the
# held-out rows leaks into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator that draws random numbers is given an explicit seed so that
# Restart & Run All reproduces the same fitted models.

models = {
    # Allow up to 500 solver iterations.
    "Logistic Regression": LogisticRegression(max_iter=500),
    # probability=True makes fit() run an internal cross-validation to
    # calibrate predict_proba; random_state pins the shuffling used there
    # (it has no effect on predict()).
    "SVM (RBF kernel)": SVC(kernel="rbf", probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    # subsample / colsample_bytree below 1 sample rows and columns per tree,
    # so the seed is pinned explicitly (matching the Random Forest) instead of
    # being left to the library default.
    "XGBoost": XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=4,
        subsample=0.9, colsample_bytree=0.9, eval_metric="logloss",
        random_state=42
    )
}

In [13]:
# Fit each candidate on the standardized training features and report its
# performance on the held-out test split.
# Rows/columns of the confusion matrix and the 0/1 rows of the report follow
# the class order of data.target_names.
for name, model in models.items():
    print("\n")
    print(f"Training: {name}")

    # All models (tree ensembles included) see the same scaled inputs.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Compute the metrics first, then print them in a fixed order.
    test_accuracy = accuracy_score(y_test, y_pred)
    test_confusion = confusion_matrix(y_test, y_pred)
    test_report = classification_report(y_test, y_pred)

    print(f"Accuracy: {test_accuracy:.4f}")
    print("Confusion Matrix:\n", test_confusion)
    print("\nClassification Report:\n", test_report)



Training: Logistic Regression
Accuracy: 0.9825
Confusion Matrix:
 [[41  1]
 [ 1 71]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



Training: SVM (RBF kernel)
Accuracy: 0.9825
Confusion Matrix:
 [[41  1]
 [ 1 71]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



Training: Random Forest
Accuracy: 0.9561
Confusion Matrix:
 [[39  3]
 [ 2 70]]

Classification Report:
               preci

In [14]:
# Feature importance (tree models)
# Reuses the Random Forest instance stored in `models`; it must already have
# been fitted by the training loop, otherwise feature_importances_ is missing.
rf_model = models["Random Forest"]

# Pair each importance score with its feature name, then rank high to low.
raw_importances = pd.Series(
    data=rf_model.feature_importances_,
    index=data.feature_names,
)
importances = raw_importances.sort_values(ascending=False)

top_features = importances.head(10)
print("\nTop 10 Important Features (Random Forest):")
print(top_features)


Top 10 Important Features (Random Forest):
worst perimeter         0.133100
worst area              0.128052
worst concave points    0.108107
mean concave points     0.094414
worst radius            0.090639
mean radius             0.058662
mean perimeter          0.055242
mean area               0.049938
mean concavity          0.046207
worst concavity         0.035357
dtype: float64
