In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Data Preparation

In [None]:
# Load the Breast Cancer Wisconsin CSV from the Kaggle input mount.
# NOTE(review): this absolute path presumably only resolves inside a Kaggle
# kernel; point it at a local copy to run the notebook elsewhere.
df=pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df.shape

In [None]:
# Remove the two columns that are not used as features: `id` and `Unnamed: 32`.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises KeyError.
# The redundant axis=1 is dropped because `columns=` already selects the axis.
df = df.drop(columns=['id','Unnamed: 32'], errors='ignore')

In [None]:
# Confirm that `id` and `Unnamed: 32` no longer appear among the columns.
df.head()

In [None]:
# The column count should now be two lower than in the earlier shape check.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each remaining column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype columns (count / unique / top / freq);
# presumably only `diagnosis` at this point — confirm against df.info().
df.describe(include='object')

In [None]:
# Global seaborn look applied to every figure below: dark-grid background
# and the "rainbow" colour palette.
sns.set_style("darkgrid")
sns.set_palette("rainbow")

# EDA & Data Visualization

In [None]:
# Class balance: number of samples per diagnosis label.
# countplot returns the Axes it drew on, so the title is set directly on it.
sns.countplot(
    data=df,
    x='diagnosis',
    hue='diagnosis',
).set_title('Benign and Malignant Distribution')
plt.show()

In [None]:
# Distribution of radius_mean, split by diagnosis, with a KDE overlay.
sns.histplot(
    data=df,
    x='radius_mean',
    hue='diagnosis',
    kde=True,
).set_title('Radius Mean Distribution')
plt.show()

In [None]:
# Distribution of perimeter_mean, split by diagnosis, with a KDE overlay.
sns.histplot(
    data=df,
    x='perimeter_mean',
    hue='diagnosis',
    kde=True,
).set_title('Perimeter Mean Distribution')
plt.show()

In [None]:
# Distribution of area_mean, split by diagnosis, with a KDE overlay.
sns.histplot(
    data=df,
    x='area_mean',
    hue='diagnosis',
    kde=True,
).set_title('Area Mean Distribution')
plt.show()

In [None]:
# radius_worst per diagnosis class.
sns.boxplot(
    data=df,
    x='diagnosis',
    y='radius_worst',
    hue='diagnosis',
).set_title('Radius Worst Distribution')
plt.show()

In [None]:
# perimeter_worst per diagnosis class.
sns.boxplot(
    data=df,
    x='diagnosis',
    y='perimeter_worst',
    hue='diagnosis',
).set_title('Perimeter Worst Distribution')
plt.show()

In [None]:
# area_worst per diagnosis class.
sns.boxplot(
    data=df,
    x='diagnosis',
    y='area_worst',
    hue='diagnosis',
).set_title('Area Worst Distribution')
plt.show()

In [None]:
# Side-by-side boxplots of the shape-related "mean" features per diagnosis class.
# One (column, title) pair per panel replaces three copy-pasted plotting calls.
shape_mean_panels = [
    ('compactness_mean', 'Compactness Mean Distribution'),
    ('concavity_mean', 'Concavity Mean Distribution'),
    ('concave points_mean', 'Concave Points Mean Distribution'),
]

fig , ax = plt.subplots(1,3,figsize=(15,6))
for panel_ax, (column, title) in zip(ax, shape_mean_panels):
    sns.boxplot(data=df,x='diagnosis',y=column,ax=panel_ax,hue='diagnosis')
    panel_ax.set_title(title)

# tight_layout keeps neighbouring axis labels from overlapping; plt.show()
# renders the figure and suppresses the Text(...) repr the cell used to output,
# matching the other plotting cells in this notebook.
fig.tight_layout()
plt.show()




In [None]:
# Side-by-side boxplots of the shape-related "worst" features per diagnosis class.
# Titles previously read "Worset"; corrected to "Worst".
shape_worst_panels = [
    ('compactness_worst', 'Compactness Worst Distribution'),
    ('concavity_worst', 'Concavity Worst Distribution'),
    ('concave points_worst', 'Concave Points Worst Distribution'),
]

fig , ax = plt.subplots(1,3,figsize=(15,6))
for panel_ax, (column, title) in zip(ax, shape_worst_panels):
    sns.boxplot(data=df,x='diagnosis',y=column,ax=panel_ax,hue='diagnosis')
    panel_ax.set_title(title)

# tight_layout keeps neighbouring axis labels from overlapping; plt.show()
# renders the figure and suppresses the Text(...) repr the cell used to output.
fig.tight_layout()
plt.show()


In [None]:
# Boxplots of fractal dimension (mean and worst) per diagnosis class.
fractal_panels = [
    ('fractal_dimension_mean', 'Fractal Dimension Mean Distribution'),
    ('fractal_dimension_worst', 'Fractal Dimension Worst Distribution'),
]

fig , ax = plt.subplots(1,2,figsize=(10,6))
for panel_ax, (column, title) in zip(ax, fractal_panels):
    sns.boxplot(data=df,x='diagnosis',y=column,ax=panel_ax,hue='diagnosis')
    panel_ax.set_title(title)

# tight_layout keeps the two panels' labels from overlapping; plt.show()
# renders the figure and suppresses the Text(...) repr the cell used to output.
fig.tight_layout()
plt.show()




In [None]:
# radius_mean against area_mean, coloured by diagnosis.
sns.scatterplot(data=df,x='radius_mean',y='area_mean',hue='diagnosis')
# A title lets the figure stand alone; plt.show() suppresses the Axes repr,
# consistent with the other plotting cells.
plt.title('Radius Mean vs Area Mean')
plt.show()

In [None]:
# concave points_mean against compactness_mean, coloured by diagnosis.
sns.scatterplot(data=df,x='concave points_mean',y='compactness_mean',hue='diagnosis')
# A title lets the figure stand alone; plt.show() suppresses the Axes repr,
# consistent with the other plotting cells.
plt.title('Concave Points Mean vs Compactness Mean')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, annotated per cell.
# numeric_only=True skips any non-numeric column still present in df.
plt.figure(figsize=(17,8))
sns.heatmap(df.corr(numeric_only=True),annot=True,cmap='Blues')
# A title lets the figure stand alone; plt.show() suppresses the Axes repr,
# consistent with the other plotting cells.
plt.title('Feature Correlation Matrix')
plt.show()

# Data Preprocessing

## Encoding Target

In [None]:
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# column again would match none of the keys and silently turn every label
# into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})

# Any label other than 'M'/'B' maps to NaN; fail loudly here rather than
# letting NaN targets reach model training.
assert df['diagnosis'].notna().all(), "unexpected or missing diagnosis label(s)"

In [None]:
# Sanity check: after encoding, only the values 1 and 0 are expected here.
df['diagnosis'].unique()

In [None]:
# Separate the target vector from the feature matrix.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y; consider stratify=y so both
# partitions keep the same benign/malignant ratio (this would change the
# reported metrics, so it is left as-is here).
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Handling Missing Values

In [None]:
from sklearn.impute import SimpleImputer

# Column means are learned from the training rows only and then reused for
# the test rows, so no test-set statistics enter the preprocessing.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_Train)
X_Train_filled = imputer.transform(X_Train)
X_Test_filled = imputer.transform(X_Test)

## Feature Scaling(Standardization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature using statistics fitted on the training
# partition only; the same fitted scaler is then applied to the test partition.
scaler = StandardScaler().fit(X_Train_filled)
X_Train_Full = scaler.transform(X_Train_filled)
X_Test_Full = scaler.transform(X_Test_filled)

# Build SVM Classifier Model

## Linear SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with C=1, trained on the imputed and scaled training features.
svm_clas = SVC(C=1, kernel='linear')
svm_clas.fit(X_Train_Full, y_Train)

In [None]:
# Predict labels for the held-out test features with the linear SVM.
y_test_prid = svm_clas.predict(X_Test_Full)
# Display the predicted label array.
y_test_prid

In [None]:
# Fitted parameters of the linear SVM: feature weights (coef_) and bias (intercept_).
svm_clas.coef_ , svm_clas.intercept_

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_Test , y_test_prid)

### confusion_matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_Test , y_test_prid)

In [None]:
# Confusion matrix of the predictions currently held in y_test_prid
# (the linear SVM at this point in the notebook), drawn as a heatmap.
# labels=[0, 1] pins the matrix to 2x2 in the encoded class order so the tick
# labels below always line up (0 = benign, 1 = malignant).
# fmt='d' prints the counts as plain integers; the default '.2g' format would
# render any count of 100 or more in scientific notation.
class_names = ['Benign (0)', 'Malignant (1)']
sns.heatmap(confusion_matrix(y_Test , y_test_prid, labels=[0, 1]),
            annot=True,fmt='d',cmap='Blues',
            xticklabels=class_names,yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

### classification_report

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1; print() is needed because the report
# is returned as a pre-formatted string.
print(classification_report(y_Test , y_test_prid))

## RBF SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM (C=2, gamma=0.01) trained on the same imputed and scaled
# training features as the linear model.
svmclas_rbf = SVC(C=2, kernel='rbf', gamma=0.01)
svmclas_rbf.fit(X_Train_Full, y_Train)

In [None]:
# Predict labels for the held-out test features with the RBF SVM.
# NOTE(review): this reuses the name y_test_prid and therefore overwrites the
# linear SVM's predictions; re-running the linear evaluation cells after this
# point would score the RBF model instead.
y_test_prid = svmclas_rbf.predict(X_Test_Full)
# Display the predicted label array.
y_test_prid

In [None]:
# Intentionally left disabled: scikit-learn only exposes coef_ for
# kernel='linear', so accessing it on the RBF model raises AttributeError.
# svmclas_rbf.coef_, svmclas_rbf.intercept_

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy of the predictions currently in y_test_prid (RBF SVM here).
accuracy_score(y_Test , y_test_prid)

### confusion_matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels, columns the predicted labels (RBF SVM here).
confusion_matrix(y_Test , y_test_prid)


In [None]:
# Confusion matrix of the predictions currently held in y_test_prid
# (the RBF SVM at this point in the notebook), drawn as a heatmap.
# labels=[0, 1] pins the matrix to 2x2 in the encoded class order so the tick
# labels below always line up (0 = benign, 1 = malignant).
# fmt='d' prints the counts as plain integers; the default '.2g' format would
# render any count of 100 or more in scientific notation.
class_names = ['Benign (0)', 'Malignant (1)']
sns.heatmap(confusion_matrix(y_Test , y_test_prid, labels=[0, 1]),
            annot=True,fmt='d',cmap='Blues',
            xticklabels=class_names,yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

### classification_report

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the RBF SVM predictions; print() is
# needed because the report is returned as a pre-formatted string.
print(classification_report(y_Test , y_test_prid))

# Summary

📄 Breast Cancer Classification using SVM
EDA, Preprocessing & Model Evaluation Report
1️⃣ Dataset Overview

The dataset used in this project is the Breast Cancer Wisconsin Dataset, which contains medical measurements extracted from images of breast cell nuclei.
The objective is to classify tumors as:

Benign (0)

Malignant (1)

The dataset initially contained two non-predictive columns — the record identifier (id) and an empty artifact column (Unnamed: 32) — which were removed as they do not contribute to the predictive task.

2️⃣ Data Quality Assessment

Several checks were performed to assess data quality:

No missing values were detected.

No duplicated records were found.

All predictive features are numerical.

The target variable (diagnosis) is categorical and was encoded into binary values.

✅ The dataset is clean and suitable for machine learning without heavy preprocessing.

3️⃣ Exploratory Data Analysis (EDA)
🔹 Target Distribution

A count plot showed that:

Benign cases are more frequent than malignant cases.

The dataset is slightly imbalanced but still acceptable for classification.

🔹 Feature Distributions

Histograms of:

radius_mean

perimeter_mean

area_mean

revealed that malignant tumors tend to have larger values, indicating that tumor size is a strong discriminative feature.

🔹 Boxplot Analysis

Boxplots comparing benign vs malignant tumors showed:

Clear separation in:

radius_worst

perimeter_worst

area_worst

concavity and concave points (mean & worst)

📌 These features are strong indicators of malignancy.

🔹 Scatter Plots

Scatter plots such as:

radius_mean vs area_mean

concave points_mean vs compactness_mean

showed visible clustering between classes, but with non-linear boundaries, suggesting that a linear model may not fully capture the relationships.

🔹 Correlation Heatmap

The correlation matrix revealed:

Strong multicollinearity between size-related features (radius, perimeter, area).

This confirms redundancy in the data and explains why non-linear models perform better.

4️⃣ Data Preprocessing

The following preprocessing steps were applied:

Target encoding:
M → 1, B → 0

Train-test split (75% training, 25% testing)

Mean imputation (although no missing values existed)

Feature scaling using StandardScaler (critical for SVM performance)

5️⃣ Model Training
🔹 Linear SVM

Kernel: linear

C = 1

Result:

Accuracy ≈ 97.2%

📌 Linear SVM performed well due to strong feature separation but is limited in handling non-linear patterns.

🔹 RBF SVM

Kernel: rbf

C = 2

gamma = 0.01

Result:

Accuracy ≈ 98.6%

📌 The RBF kernel captured non-linear decision boundaries more effectively, leading to improved performance.

6️⃣ Model Evaluation
🔹 Confusion Matrix

The confusion matrices showed:

Very low number of False Negatives, which is crucial in medical diagnosis.

Most malignant cases were correctly identified.

🔹 Classification Report

Key observations:

High Recall for malignant tumors, indicating the model successfully detects most cancer cases.

Balanced Precision and F1-score, confirming robust classification performance.

📌 In medical applications, recall is prioritized over accuracy to minimize missed cancer diagnoses.

7️⃣ Model Comparison and Selection

Two Support Vector Machine models were evaluated in this study: Linear SVM and RBF SVM.
Although the Linear SVM achieved strong performance with an accuracy of approximately 97.2%, it assumes a linear decision boundary, which limits its ability to capture complex patterns in the data.

The RBF SVM, on the other hand, demonstrated superior performance with an accuracy of approximately 98.6%. This improvement indicates that the relationship between the input features and the target variable is non-linear, and the RBF kernel is better suited to model such complexity.

Furthermore, the RBF SVM showed improved recall for malignant cases, reducing the risk of false negative predictions, which is critical in medical diagnosis tasks.

Based on overall performance, robustness, and clinical relevance, the RBF SVM model was selected as the final model for this classification task.

8️⃣ Conclusion

This project demonstrates that while linear models can achieve strong performance, non-linear SVM kernels (RBF) provide better generalization for medical imaging features.
The chosen preprocessing pipeline and evaluation strategy ensure reliable and clinically meaningful predictions.

🧠 Final ML Engineer Insight

Model selection was guided not only by accuracy, but also by recall and error analysis, ensuring minimal false negatives in a medical diagnosis context.