Task 11: SVM â€“ Breast Cancer Classification

1.Load dataset and inspect features and labels distribution.
2.Apply StandardScaler to normalize feature values.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score
)

import joblib
# Load dataset
df = pd.read_csv('Breast_Cancer.csv')

# Display first few rows
df.head()
# Dataset info
df.info()
# Check target distribution
df['Status'].value_counts()
# Encode target variable
df['Status'] = df['Status'].map({'M': 1, 'B': 0})

# Separate features and target
X = df.drop(columns=['Status'])
y = df['Status']



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

3.Split data into train-test sets.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


ValueError: Input y contains NaN.

In [8]:
y.isna().sum()
y.unique()
# Combine X and y temporarily
data = pd.concat([X, y], axis=1)

# Drop rows where target is NaN
data = data.dropna(subset=[target_col])

# Separate again
X = data.drop(columns=[target_col])
y = data[target_col]


NameError: name 'target_col' is not defined

4.Train baseline SVM with linear kernel and check performance.

In [None]:
# Pipeline: Scaling + Linear SVM
linear_svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', probability=True, random_state=42))
])

# Train model
linear_svm_pipeline.fit(X_train, y_train)

# Predictions
y_pred_linear = linear_svm_pipeline.predict(X_test)

# Accuracy
linear_accuracy = accuracy_score(y_test, y_pred_linear)
print("Linear SVM Accuracy:", linear_accuracy)


5.Train SVM with RBF kernel and compare accuracy.

In [None]:
rbf_svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42))
])

rbf_svm_pipeline.fit(X_train, y_train)

y_pred_rbf = rbf_svm_pipeline.predict(X_test)
rbf_accuracy = accuracy_score(y_test, y_pred_rbf)

print("RBF SVM Accuracy:", rbf_accuracy)


6.Use GridSearchCV to tune C and gamma values.

In [None]:
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.01, 0.1, 1, 'scale']
}

grid_search = GridSearchCV(
    estimator=rbf_svm_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


7.Evaluate best model using confusion matrix and classification report.

In [None]:
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix:\n", cm)

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_best))


8.Plot ROC curve and calculate AUC score.

In [None]:
# Get probability scores
y_prob = best_model.predict_proba(X_test)[:, 1]

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

print("AUC Score:", auc_score)

# Plot ROC Curve
plt.figure()
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - SVM')
plt.legend()
plt.show()


9.Save tuned model pipeline (scaler + svm) for reuse.

In [None]:
# Save the model
joblib.dump(best_model, 'svm_breast_cancer_model.pkl')

print("Model saved successfully as svm_breast_cancer_model.pkl")
