# Breast Cancer Prediction System - Model Development
## Support Vector Machine (SVM) Classification
**Selected Features:** radius_mean, texture_mean, area_mean, smoothness_mean, compactness_mean

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [2]:
# Load the Breast Cancer Wisconsin dataset
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = pd.Series(cancer.target, name='diagnosis')

print('✓ Dataset loaded successfully')
print(f'Dataset shape: {X.shape}')
print(f'Target distribution:\n{y.value_counts()}')
print(f'\nAvailable features:\n{list(X.columns)}')

✓ Dataset loaded successfully
Dataset shape: (569, 30)
Target distribution:
diagnosis
1    357
0    212
Name: count, dtype: int64

Available features:
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']


In [3]:
# Step 1: Check for missing values
print('STEP 1: DATA PREPROCESSING - Missing Values Check')
print('='*50)
missing_X = X.isnull().sum().sum()
missing_y = y.isnull().sum()
print(f'Missing values in X: {missing_X}')
print(f'Missing values in y: {missing_y}')
if missing_X == 0 and missing_y == 0:
    print('✓ No missing values detected')
else:
    print('✗ Missing values found - handling required')

STEP 1: DATA PREPROCESSING - Missing Values Check
Missing values in X: 0
Missing values in y: 0
✓ No missing values detected


In [5]:
# Step 2: Feature Selection - Select 5 features from approved list
print('\nSTEP 2: FEATURE SELECTION')
print('='*50)
selected_features = ['mean radius', 'mean texture', 'mean area', 'mean smoothness', 'mean compactness']
X_selected = X[selected_features]
print(f'Selected 5 features: {selected_features}')
print(f'Feature matrix shape: {X_selected.shape}')
print(f'\nFeature statistics:\n{X_selected.describe()}')


STEP 2: FEATURE SELECTION
Selected 5 features: ['mean radius', 'mean texture', 'mean area', 'mean smoothness', 'mean compactness']
Feature matrix shape: (569, 5)

Feature statistics:
       mean radius  mean texture    mean area  mean smoothness  \
count   569.000000    569.000000   569.000000       569.000000   
mean     14.127292     19.289649   654.889104         0.096360   
std       3.524049      4.301036   351.914129         0.014064   
min       6.981000      9.710000   143.500000         0.052630   
25%      11.700000     16.170000   420.300000         0.086370   
50%      13.370000     18.840000   551.100000         0.095870   
75%      15.780000     21.800000   782.700000         0.105300   
max      28.110000     39.280000  2501.000000         0.163400   

       mean compactness  
count        569.000000  
mean           0.104341  
std            0.052813  
min            0.019380  
25%            0.064920  
50%            0.092630  
75%            0.130400  
max          

In [6]:
# Step 3: CRITICAL - Train/Test Split FIRST (BEFORE scaling) to avoid data leakage
print('\nSTEP 3: TRAIN-TEST SPLIT (BEFORE SCALING - NO LEAKAGE)')
print('='*50)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)
print(f'Train set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')
print(f'Train-Test Ratio: 80-20')
print(f'Train target distribution:\n{y_train.value_counts()}')
print(f'Test target distribution:\n{y_test.value_counts()}')


STEP 3: TRAIN-TEST SPLIT (BEFORE SCALING - NO LEAKAGE)
Train set size: 455
Test set size: 114
Train-Test Ratio: 80-20
Train target distribution:
diagnosis
1    285
0    170
Name: count, dtype: int64
Test target distribution:
diagnosis
1    72
0    42
Name: count, dtype: int64


In [7]:
# Step 4: Feature Scaling - FIT ONLY ON TRAINING DATA (avoid leakage)
print('\nSTEP 4: FEATURE SCALING (FIT ON TRAINING DATA ONLY)')
print('='*50)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit on training data
X_test_scaled = scaler.transform(X_test)  # Transform test using training scaler
print('✓ Feature scaling completed (NO LEAKAGE)')
print(f'Training data shape after scaling: {X_train_scaled.shape}')
print(f'Test data shape after scaling: {X_test_scaled.shape}')
print(f'\nScaled training data mean: {X_train_scaled.mean(axis=0)}')


STEP 4: FEATURE SCALING (FIT ON TRAINING DATA ONLY)
✓ Feature scaling completed (NO LEAKAGE)
Training data shape after scaling: (455, 5)
Test data shape after scaling: (114, 5)

Scaled training data mean: [-2.92806072e-16  6.24652955e-16 -1.71779562e-16  6.24652955e-17
 -2.08868332e-16]


In [8]:
# Step 5: Build and Train SVM Model
print('\nSTEP 5: MODEL TRAINING')
print('='*50)
print('Algorithm: Support Vector Machine (SVM)')
print('Kernel: RBF (Radial Basis Function)')
print('Probability: True (for confidence scores)')
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train)
print('✓ SVM model trained successfully')
print(f'Number of support vectors: {len(svm_model.support_vectors_)}')


STEP 5: MODEL TRAINING
Algorithm: Support Vector Machine (SVM)
Kernel: RBF (Radial Basis Function)
Probability: True (for confidence scores)
✓ SVM model trained successfully
Number of support vectors: 111


In [9]:
# Step 6: Evaluate Model Performance
print('\nSTEP 6: MODEL EVALUATION')
print('='*50)
y_pred = svm_model.predict(X_test_scaled)
y_pred_proba = svm_model.predict_proba(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('\nClassification Metrics:')
print(f'  Accuracy:  {accuracy:.4f}')
print(f'  Precision: {precision:.4f}')
print(f'  Recall:    {recall:.4f}')
print(f'  F1-Score:  {f1:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


STEP 6: MODEL EVALUATION

Classification Metrics:
  Accuracy:  0.9035
  Precision: 0.9552
  Recall:    0.8889
  F1-Score:  0.9209

Classification Report:
              precision    recall  f1-score   support

      Benign       0.83      0.93      0.88        42
   Malignant       0.96      0.89      0.92        72

    accuracy                           0.90       114
   macro avg       0.89      0.91      0.90       114
weighted avg       0.91      0.90      0.90       114

Confusion Matrix:
[[39  3]
 [ 8 64]]


In [10]:
# Step 7: Save Model Artifacts using Joblib
print('\nSTEP 7: MODEL PERSISTENCE (JOBLIB)')
print('='*50)
import os

model_dir = './'
os.makedirs(model_dir, exist_ok=True)

# Save model
model_path = os.path.join(model_dir, 'breast_cancer_model.pkl')
joblib.dump(svm_model, model_path)
print(f'✓ Model saved to: {model_path}')

# Save scaler
scaler_path = os.path.join(model_dir, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f'✓ Scaler saved to: {scaler_path}')

# Save selected features
features_path = os.path.join(model_dir, 'selected_features.pkl')
joblib.dump(selected_features, features_path)
print(f'✓ Selected features saved to: {features_path}')


STEP 7: MODEL PERSISTENCE (JOBLIB)
✓ Model saved to: ./breast_cancer_model.pkl
✓ Scaler saved to: ./scaler.pkl
✓ Selected features saved to: ./selected_features.pkl


In [11]:
# Step 8: Verify Model Reloading and Reusability
print('\nSTEP 8: MODEL RELOADING TEST')
print('='*50)
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)
loaded_features = joblib.load(features_path)

print('✓ Model reloaded successfully')
print('✓ Scaler reloaded successfully')
print(f'✓ Selected features reloaded: {loaded_features}')

# Test prediction with reloaded model
test_sample = X_test_scaled[:1]
reloaded_pred = loaded_model.predict(test_sample)[0]
reloaded_proba = loaded_model.predict_proba(test_sample)[0]
confidence = np.max(reloaded_proba) * 100

print(f'\nTest Prediction:')
print(f'  Predicted class: {reloaded_pred}')
print(f'  Confidence: {confidence:.2f}%')
print(f'  Class probabilities: {reloaded_proba}')
print('✓ Model can be reloaded and used for prediction without retraining')

print('\n' + '='*50)
print('PROJECT COMPLETE - ALL ARTIFACTS SAVED')
print('='*50)


STEP 8: MODEL RELOADING TEST
✓ Model reloaded successfully
✓ Scaler reloaded successfully
✓ Selected features reloaded: ['mean radius', 'mean texture', 'mean area', 'mean smoothness', 'mean compactness']

Test Prediction:
  Predicted class: 0
  Confidence: 96.51%
  Class probabilities: [0.96507119 0.03492881]
✓ Model can be reloaded and used for prediction without retraining

PROJECT COMPLETE - ALL ARTIFACTS SAVED
