In [4]:
#Example from: https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python
#Let's first load the required dataset you will use.
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
#Exploring Data

#After you have loaded the dataset, you might want to know a little bit more about it. You can check feature and target names.
# print the names of the 13 features
print("Features: ", cancer.feature_names)

# print the label type of cancer('malignant' 'benign')
print("Labels: ", cancer.target_names)
#Splitting Data

#To understand model performance, dividing the dataset into a training set and a test set is a good strategy.

#Split the dataset by using the function train_test_split(). you need to pass 3 parameters features, target, and test_set size. Additionally, you can use random_state to select records randomly.
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test
#Generating Model

#Let's build support vector machine model. First, import the SVM module and create support vector classifier object by passing argument kernel as the linear kernel in SVC() function.

#Then, fit your model on train set using fit() and perform prediction on the test set using predict().
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
#Evaluating the Model

#Let's estimate how accurately the classifier or model can predict the breast cancer of patients.

#Accuracy can be computed by comparing actual test set values and predicted values.
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#Well, you got a classification rate of 96.49%, considered as very good accuracy.

#For further evaluation, you can also check precision and recall of model.
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels:  ['malignant' 'benign']
Accuracy: 0.9649122807017544
Precision: 0.9811320754716981
Recall: 0.9629629629629629


In [None]:
#use a k-fold cross-validation and a test dataset. Report the final performance for the test dataset.


In [5]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import svm
from sklearn import datasets

# load the breast cancer dataset
cancer = datasets.load_breast_cancer()

# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=109)

# create a SVM classifier with the linear kernel
clf = svm.SVC(kernel='linear')

# use 5-fold cross-validation to evaluate the classifier
scores = cross_val_score(clf, X_train, y_train, cv=5)

# report the cross-validation scores
print('Cross-validation scores: ', scores)

# report the mean cross-validation score
print('Mean cross-validation score: ', scores.mean())

# train the classifier on the full training dataset
clf.fit(X_train, y_train)

# evaluate the classifier on the test dataset
y_pred = clf.predict(X_test)

# report the accuracy of the classifier on the test dataset
test_accuracy = clf.score(X_test, y_test)
print('Test accuracy: ', test_accuracy)

Cross-validation scores:  [0.975      0.9375     0.9125     0.96202532 0.93670886]
Mean cross-validation score:  0.944746835443038
Test accuracy:  0.9649122807017544


In [6]:
##1. use a k-fold cross-validation and a test dataset. Report the final performance for the test dataset.
##2. use gridsearch or randomsearch to tune the hyperparameters in your ML classifier;

In [7]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer

# Load the dataset
cancer = load_breast_cancer()

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# Create a support vector classifier object
svc = SVC()

# Define the grid of hyperparameters to search over
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by grid search
print("Best parameters: {}".format(grid_search.best_params_))

# Use the best hyperparameters to create the final classifier
svc_final = SVC(C=grid_search.best_params_['C'], gamma=grid_search.best_params_['gamma'])
svc_final.fit(X_train, y_train)

# Use k-fold cross-validation to estimate the performance of the final classifier
cv_scores = cross_val_score(svc_final, cancer.data, cancer.target, cv=5)
print("Cross-validation scores: {}".format(cv_scores))

# Report the final performance for the test dataset
test_score = svc_final.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_score))

Best parameters: {'C': 0.1, 'gamma': 0.1}
Cross-validation scores: [0.62280702 0.62280702 0.63157895 0.63157895 0.62831858]
Test set score: 0.63
