# SVM Classification Experiments on Breast Cancer Dataset

This notebook explores Support Vector Machine (SVM) classification using the breast cancer dataset from scikit-learn. It compares different kernels, tunes hyperparameters, and analyzes the results.

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# NOTE(review): this silences every warning category for the rest of the notebook,
# which also hides library deprecation and convergence warnings — consider narrowing
# the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib defaults with seaborn's "husl" color palette
plt.style.use('default')
sns.set_palette("husl")

## 1. Load the Dataset

Load the breast cancer dataset and explore its structure, features, and target distribution.

In [None]:
# Fetch the scikit-learn breast cancer dataset object, then pull out
# the feature matrix (X) and the label vector (y) in one step.
data = load_breast_cancer()
X, y = data.data, data.target

In [None]:
# A cell only renders its last bare expression, so show both dtypes as one tuple:
# (dtype of the feature matrix, dtype of the label vector).
X.dtype, y.dtype

In [None]:
# Quick summary of the dataset: its shape, the first five feature names,
# and the class names — one item per output line.
print(
    f"Number of samples vs num of features: {X.shape}",
    f"Feature names: {data.feature_names[:5]}...",
    f"Target names: {data.target_names}",
    sep="\n",
)

In [None]:
# Per-class sample counts, indexed by label value (for this dataset: 212 malignant, 357 benign)
print(f"Is the data balanced? : {np.bincount(y)}")

In [None]:
# df = pd.DataFrame(X, columns= data.feature_names)
# df['target'] = y


In [None]:
# Assemble features and labels into a single DataFrame and add a
# human-readable label column next to the numeric target.
df = (
    pd.DataFrame(X, columns=data.feature_names)
    .assign(target=y)
    .assign(target_name=lambda frame: frame['target'].map({0: 'malignant', 1: 'benign'}))
)

# Shape summary, followed by a rich preview of the leading rows
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()

In [None]:
# Class balance shown two ways: absolute counts (bar chart) and shares (pie chart)
target_counts = df['target_name'].value_counts()
print("Target counts", target_counts)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: number of samples per class
ax1.bar(target_counts.index, target_counts.values)
ax1.set(title='Target Distribution', ylabel='Count')

# Right panel: percentage of samples per class
ax2.pie(target_counts.values, labels=target_counts.index, autopct='%1.1f%%')
ax2.set_title('Target Distribution (Percentage)')

fig.tight_layout()
plt.show()

## 2. Data Preprocessing and Feature Scaling

Now we'll split the data into training and testing sets and apply feature scaling, which is crucial for SVM performance.

In [None]:
# Hold out 20% of the samples for testing; stratifying on y keeps the
# class ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size and the per-class counts of each split
split_report = (
    ("Training set shape:", X_train.shape),
    ("Testing set shape:", X_test.shape),
    ("Training target distribution M vs B:", np.bincount(y_train)),
    ("Testing target distribution M vs B:", np.bincount(y_test)),
)
for caption, stat in split_report:
    print(caption, stat)

In [None]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Side-by-side box plots of the same features before and after standardization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (data, panel title, y-axis label) for each panel, left to right
panels = (
    (X_train, 'Feature Distribution Before Scaling', 'Value'),
    (X_train_scaled, 'After Scaling', 'Scaled Value'),
)
for axis, (values, panel_title, y_label) in zip((ax1, ax2), panels):
    axis.boxplot(values[:, :5])  # for the demo, only the first 5 features
    axis.set_title(panel_title)
    axis.set_xlabel('Feature Index')
    axis.set_ylabel(y_label)

plt.tight_layout()
plt.show()


## 3. Train SVM Model with Linear Kernel

Let's start with a basic SVM model using a linear kernel as a baseline.

In [None]:
# Baseline classifier: an SVC with a linear kernel and otherwise default settings
svm_linear = SVC(kernel='linear', random_state=42)
# Fit on the standardized training data (the fitted estimator is the cell's output)
svm_linear.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict class labels for the held-out test set, using the standardized features
# because the model was trained on standardized inputs.
y_pred_linear = svm_linear.predict(X_test_scaled)

In [None]:
# Share of test samples whose predicted label matches the true label
accuracy_linear = accuracy_score(y_test, y_pred_linear)
print(f"Linear SVM Accuracy: {accuracy_linear:.4f}")

In [None]:
# Get support vectors information
# n_support_ holds one count per class, in class-label order (0 = malignant, 1 = benign):
# the number of training samples from that class that ended up as support vectors.
# The actual counts depend on the fitted model, so read them from the printed output.
print(f"Number of support vectors: {svm_linear.n_support_}")

## 4. Model Evaluation and Performance Metrics

Evaluate the linear SVM model using various performance metrics.

In [None]:
# Confusion matrix of the linear model on the test set
# (rows: true class, columns: predicted class)
cm_lin = confusion_matrix(y_true=y_test, y_pred=y_pred_linear)
print("Confusion Matrix:", cm_lin, sep="\n")

In [None]:
# Annotated heat map of the confusion matrix, labelled with the class names
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_lin,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set(title='Confusion Matrix - Linear SVM', xlabel='Predicted', ylabel='Actual')
plt.show()

In [None]:
# Class report: per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test, y_pred_linear, target_names=data.target_names))

# Cross-validation scores
# Scaling is wrapped in the pipeline and the *unscaled* training data is passed in, so the
# scaler is re-fitted inside every fold on that fold's training part only. Scoring
# X_train_scaled directly would let each validation fold contribute to the scaling
# statistics (preprocessing leakage). cross_val_score clones the estimator for each fold,
# so the already-fitted svm_linear is left untouched; only its hyperparameters are reused.
cv_pipeline = make_pipeline(StandardScaler(), svm_linear)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
# The reported spread is two standard deviations of the fold scores
print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## 5. Different SVM Kernels

Compare different SVM kernels: linear, RBF, and polynomial.

In [None]:
# Kernels to compare, each with default hyperparameters
kernels = ['linear', 'rbf', 'poly']

# Per-kernel results, keyed by kernel name
svm_models, predictions, accuracies = {}, {}, {}

for kernel in kernels:
    print(f"\nTraining SVM with {kernel} kernel...")

    # Fit on the standardized training data, then score on the standardized test data
    model = SVC(kernel=kernel, random_state=42).fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    # Keep the fitted model, its test predictions and its test accuracy
    svm_models[kernel], predictions[kernel], accuracies[kernel] = model, y_pred, accuracy

    print(f"{kernel.upper()} SVM Accuracy: {accuracy:.4f}")
    print(f"Number of support vectors: {model.n_support_}")

## 6. Hyperparameter Tuning with Grid Search

Let's use GridSearchCV to find the optimal hyperparameters for our SVM models.

In [None]:
# Hyperparameter search space per kernel. Each inner mapping is a grid:
# parameter name -> list of candidate values to try.
param_grids = dict(
    linear=dict(
        kernel=['linear'],
        C=[0.1, 1, 10, 100],
    ),
    rbf=dict(
        kernel=['rbf'],
        C=[0.1, 1, 10, 100],
        gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
    ),
    poly=dict(
        kernel=['poly'],
        C=[0.1, 1, 10],
        degree=[2, 3, 4],
        gamma=['scale', 'auto'],
    ),
)


In [None]:
# Best estimator and best mean cross-validation accuracy, keyed by kernel name
best_models, best_scores = {}, {}

# Run one exhaustive grid search per kernel
for kernel_name, param_grid in param_grids.items():
    print(f"\nPerforming Grid Search for {kernel_name} kernel.")

    # Base estimator; the kernel and the other hyperparameters come from the grid
    svm = SVC(random_state=42)

    # 5-fold cross-validated search scored by accuracy; n_jobs=-1 uses all CPU cores.
    # NOTE(review): X_train_scaled was standardized before the folds are formed, so each
    # validation fold contributes to the scaling statistics — consider a Pipeline.
    grid_search = GridSearchCV(
        estimator=svm,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)

    # Keep the winning (refitted) model and its cross-validation score
    best_models[kernel_name] = grid_search.best_estimator_
    best_scores[kernel_name] = grid_search.best_score_

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

In [None]:
# Score each tuned model on the held-out test set
print("Best Model Performance on Test Set:")

# Test accuracy per kernel name
test_scores = {}
for kernel_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    test_scores[kernel_name] = test_accuracy = accuracy_score(y_test, y_pred)

    # Header, accuracy and the full parameter set of the tuned model, one per line
    print(
        f"\n{kernel_name.upper()} SVM:",
        f"Test Accuracy: {test_accuracy:.4f}",
        f"Parameters: {model.get_params()}",
        sep="\n",
    )

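## 7. ROC Curves, Feature Importance, and Decision Boundary

Finally, compare the tuned models with ROC curves, analyze feature importance for the linear model, and visualize a decision boundary in a 2D PCA projection.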

In [None]:
# ROC curves for all tuned models, drawn on one shared axis
fig, ax = plt.subplots(figsize=(12, 8))

for kernel_name, model in best_models.items():
    # Continuous ranking scores for the ROC curve: use the decision function
    # when the model exposes one (these are not probabilities), otherwise fall
    # back to the predicted probability of class 1.
    y_scores = (
        model.decision_function(X_test_scaled)
        if hasattr(model, "decision_function")
        else model.predict_proba(X_test_scaled)[:, 1]
    )

    # ROC points and the area under the curve
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, linewidth=2,
            label=f'{kernel_name.upper()} (AUC = {roc_auc:.3f})')

# Diagonal reference line: the expected curve of a random classifier
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Use the tuned linear-kernel model, whose coef_ gives one weight per input feature
linear_model = best_models['linear']
# Importance proxy: magnitude of each feature's weight (first row of coef_).
# The model was trained on standardized features, so magnitudes share a common scale.
feature_importance = np.abs(linear_model.coef_[0])

In [None]:
# Pair every feature name with its importance and rank them, largest first
feature_df = pd.DataFrame(
    {'feature': data.feature_names, 'importance': feature_importance}
).sort_values(by='importance', ascending=False)

In [None]:
# Preview the 15 highest-ranked features
feature_df.head(15)

In [None]:
# Horizontal bar chart of the 15 highest-ranked features
top_feats = feature_df.head(15)
bar_positions = range(len(top_feats))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_feats['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_feats['feature'])
ax.set_xlabel('Feature Importance (Absolute Coefficient)')
ax.set_title('Top 15 Most Important Features (Linear SVM)')

# Flip the y-axis so the most important feature is drawn at the top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# 2D visualisation using PCA: keep only the first two principal components
pca = PCA(n_components=2)

In [None]:
# Learn the projection from the standardized training data only,
# then apply that same projection to the standardized test data.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Fit a separate RBF SVM on the two PCA components; it exists only for this plot
svm_viz = SVC(kernel='rbf', random_state=42)
svm_viz.fit(X_train_pca, y_train)

# Build a dense grid (step h) covering the training points plus a margin of 1 on every side
h = 0.02
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid point and fold the labels back into the grid's shape
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = svm_viz.predict(grid_points).reshape(xx.shape)

# Shaded regions show the predicted class; dots are the training samples by true class
fig, ax = plt.subplots(figsize=(12, 8))
ax.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)
scatter = ax.scatter(pc1, pc2, c=y_train, cmap=plt.cm.RdYlBu)
ax.set_xlabel(f'First Principal Component (Explained Variance: {pca.explained_variance_ratio_[0]:.2f})')
ax.set_ylabel(f'Second Principal Component (Explained Variance: {pca.explained_variance_ratio_[1]:.2f})')
ax.set_title('SVM Decision Boundary (2D PCA Projection)')
fig.colorbar(scatter, ax=ax)
plt.show()

print(f"Total variance explained by first 2 components: {pca.explained_variance_ratio_.sum():.3f}")