In [1]:
#Import modules and data
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the breast cancer dataset bundled with scikit-learn and separate the
# feature matrix from the target labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [2]:
# Preprocess the data: standardize the features.
# The scaler's statistics are learned from the training split only, and that
# same fitted transformation is then applied to both splits, so nothing about
# the test split influences the scaling.
scaler = StandardScaler()
scaler.fit(xtrain)
new_xtrain = scaler.transform(xtrain)
new_xtest = scaler.transform(xtest)

In [3]:
# First model: SVM with a linear kernel, trained on the scaled training split
# and evaluated on the scaled test split.
svm = SVC(kernel="linear")
svm.fit(new_xtrain, ytrain)
yhat = svm.predict(new_xtest)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# Accuracy is symmetric, so the reported value is unchanged, but the correct
# order keeps this line right if the metric is later swapped for an
# asymmetric one (e.g. precision or recall).
print(f"Test accuracy for linear kernel: {accuracy_score(ytest, yhat)}")

Test accuracy for linear kernel: 0.9649122807017544


In [4]:
# Second model: SVM with a polynomial kernel of degree 5, trained on the scaled
# training split and evaluated on the scaled test split.
svm = SVC(kernel="poly", degree=5)
svm.fit(new_xtrain, ytrain)
yhat = svm.predict(new_xtest)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# Accuracy is symmetric, so the reported value is unchanged, but the correct
# order keeps this line right if the metric is later swapped for an
# asymmetric one (e.g. precision or recall).
print(f"Test accuracy for polynomial kernel of degree 5: {accuracy_score(ytest, yhat)}")

Test accuracy for polynomial kernel of degree 5: 0.7894736842105263


In [5]:
# Final model: SVM with a Gaussian kernel (also known as the radial basis
# function, "rbf"), trained on the scaled training split and evaluated on the
# scaled test split.
svm = SVC(kernel="rbf")
svm.fit(new_xtrain, ytrain)
yhat = svm.predict(new_xtest)
# accuracy_score expects (y_true, y_pred): pass the ground-truth labels first.
# Accuracy is symmetric, so the reported value is unchanged, but the correct
# order keeps this line right if the metric is later swapped for an
# asymmetric one (e.g. precision or recall).
print(f"Test accuracy for gaussian kernel: {accuracy_score(ytest, yhat)}")

Test accuracy for gaussian kernel: 0.9736842105263158


In [34]:
#The data set is the breast cancer dataset, which has 30 numerical features.
#I preprocessed the data by standardizing all of the features in the training set to have mean 0 and variance 1
#and then applying that same transformation to the x's in the testing set. This led to better performance because
#SVMs are sensitive to the magnitudes of the features: the model works with distances between samples, so
#features with large scales would dominate the fit while features with small scales would be nearly ignored,
#which is why I standardized the data.
#The linear kernel and the gaussian kernel both performed very well, with the gaussian kernel performing the best
#with an accuracy of 97% and the linear kernel having an accuracy of 96%. The polynomial kernel of degree 5
#performed by far the worst with an accuracy of 79%, which is significantly lower than the other accuracies.
#Some of the main insights that I had from doing this were the power of an SVM to make high-accuracy predictions and
#also the importance of using the correct kernel: rbf had a very high accuracy of 97% while the polynomial kernel
#of degree 5 had an accuracy of 79%, so this 18 percentage point difference in accuracy was caused purely by the choice of kernel.
#Moreover, this taught me a lot about the importance of preprocessing, as I found out why it is so
#important to preprocess the data before fitting an SVM: skipping that step could hurt the SVM's performance.
#Overall, the conclusion that should be drawn is that the SVM can be used to produce a high-accuracy prediction
#for the breast cancer data, but that there are several steps programmers must take to ensure that
#their models work as well as they can, like preprocessing the data and using the right kernel.