In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from mlxtend.plotting import plot_decision_regions

# Load and prepare the dataset
# If the file is in the current directory
# data = pd.read_csv('breast-cancer.csv')
# Or, replace 'breast-cancer.csv' with the correct file path if known.
# Make sure the file exists and you have read permissions.
data = pd.read_csv('/content/breast-cancer.csv')
print(f'Dataset shape: {data.shape}')
print(data.head())

# ... (rest of your code) ...

Dataset shape: (569, 32)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from mlxtend.plotting import plot_decision_regions

# Load and prepare the dataset
data = pd.read_csv('/content/breast-cancer.csv')
print(f'Dataset shape: {data.shape}')
print(data.head())

# Data preprocessing
# Extract features (X) and target (y)
X = data.drop('diagnosis', axis=1)  # Assuming 'diagnosis' is the target column
y = data['diagnosis']

# One-hot encode categorical features
X = pd.get_dummies(X)  # This will convert categorical columns to numerical using one-hot encoding

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Dataset shape: (569, 32)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  

In [7]:
# Train SVM with Linear and RBF kernels
svm_linear = SVC(kernel='linear')
svm_rbf = SVC(kernel='rbf')
svm_linear.fit(X_train, y_train)
svm_rbf.fit(X_train, y_train)

# Predictions and evaluation
y_pred_linear = svm_linear.predict(X_test)
y_pred_rbf = svm_rbf.predict(X_test)
print('Linear Kernel Accuracy:', accuracy_score(y_test, y_pred_linear))
print('RBF Kernel Accuracy:', accuracy_score(y_test, y_pred_rbf))

# Hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best Parameters:', grid.best_params_)

# Cross-validation
scores = cross_val_score(grid.best_estimator_, X_train, y_train, cv=5)
print('Cross-Validation Accuracy:', np.mean(scores))

# Visualization (for 2D data)
if X_train.shape[1] == 2:
    plt.figure(figsize=(10, 5))
    plot_decision_regions(X_train, y_train, clf=svm_rbf)
    plt.title('Decision Boundary (RBF Kernel)')
    plt.show()
else:
    print('Visualization not possible: more than 2 features.')

Linear Kernel Accuracy: 0.9766081871345029
RBF Kernel Accuracy: 0.9766081871345029
Best Parameters: {'C': 10, 'gamma': 0.01}
Cross-Validation Accuracy: 0.9697784810126582
Visualization not possible: more than 2 features.
