## Support vector machines


In [1]:
import numpy as np

# Read every array stored in the CIFAR-4 archive into a plain dict so the data
# stays available after the file handle is closed.
with np.load('cifar4-train.npz', allow_pickle=False) as npz_file:
    cifar4_data = {key: value for key, value in npz_file.items()}

# Show which arrays the archive contains.
print([*cifar4_data])

# OverFeat features are the model inputs; the labels are the targets.
X, y = cifar4_data['overfeat'], cifar4_data['labels']

print('X:', X.shape, X.dtype)
print('y:', y.shape, y.dtype)

['pixels', 'overfeat', 'labels', 'names', 'allow_pickle']
X: (5000, 4096) float32
y: (5000,) int64


In [2]:
from sklearn.model_selection import train_test_split

# Convert the input matrix to double precision.
# `np.float` was only an alias of the builtin `float` (i.e. float64); it was
# deprecated in NumPy 1.20 and removed in 1.24, so `np.float64` is spelled out.
X = X.astype(np.float64)

# Split the data into train/test sets. `stratify=y` keeps the class proportions
# the same in both sets and `random_state` makes the split reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, train_size=4000, test_size=1000, random_state=0, stratify=y)

print('Train set:', X_tr.shape, y_tr.shape)
print('Test set:', X_te.shape, y_te.shape)

Train set: (4000, 4096) (4000,)
Test set: (1000, 4096) (1000,)


### Create an SVM classifier with a linear kernel. Tune its C parameter.

In [3]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# List the hyperparameter names exposed by LinearSVC (candidates for the grid search).
LinearSVC().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [8]:
# Create the pipeline: standardize -> PCA -> linear SVM.
# PCA is used as a preprocessing step to reduce the number of dimensions and
# improve speed; 165 components is the number the author found to contain 90%
# of the proportion of variance explained (PVE).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    # Seeded because PCA may pick a randomized SVD solver for a matrix this size.
    ('pca', PCA(n_components=165, random_state=0)),
    # Seeded because the liblinear solver shuffles the data pseudo-randomly.
    ('linear_svc', LinearSVC(random_state=0))
])

# Tune C using grid search with cross-validation. With an integer `cv` and a
# classifier, GridSearchCV uses stratified folds (5 of them here).
grid = {'linear_svc__C': [0.001, 0.01, 0.1, 1, 10]}

grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Show which result arrays are available for reporting.
grid_cv.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_linear_svc__C', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [9]:
# Collect the results in a DataFrame with a column for the mean and the standard
# deviation of the accuracy values across all folds.
# For the linear kernel, the DataFrame has one row for each C value.

import pandas as pd

# `pd.DataFrame.from_items` was deprecated in pandas 0.23 and removed in 1.0.
# A plain dict keeps insertion order, so the columns come out as C, mean_score,
# std_score just like before.
df = pd.DataFrame({
    'C': grid_cv.cv_results_['param_linear_svc__C'],
    'mean_score': grid_cv.cv_results_['mean_test_score'],
    'std_score': grid_cv.cv_results_['std_test_score'],
})
df.sort_values(by='mean_score', ascending=False)

Unnamed: 0,C,mean_score,std_score
1,0.01,0.83075,0.013337
0,0.001,0.8305,0.015604
2,0.1,0.82675,0.015524
3,1.0,0.7795,0.01459
4,10.0,0.758,0.021889


In [10]:
# Find the C value with the best mean accuracy and print it.
#
# `np.argmax` on a Series followed by `series[idx]` mixes positional and
# label-based indexing (and triggered a warning in older pandas). `idxmax()`
# returns the index label of the best row, which is what `.loc` expects.
idx = df['mean_score'].idxmax()
top_accuracy = df.loc[idx, 'mean_score']
std_score = df.loc[idx, 'std_score']
c = df.loc[idx, 'C']
print('Linear SVM - top accuracy across folds: {:.2f} (std:{:.3f}) with C: {}'.format(top_accuracy, std_score,c))


Linear SVM - top accuracy across folds: 0.83 (std:0.013) with C: 0.01


  return getattr(obj, method)(*args, **kwds)


### Create an SVM classifier with an RBF kernel. Tune its C and γ parameters

In [4]:
# List the hyperparameter names exposed by SVC (candidates for the grid search).
SVC().get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [11]:
# Tune the C and γ parameters using grid search with cross-validation.

pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    # 60 components is the number the author found to contain 80% of the PVE
    # (the result is better than with a larger number of components).
    # Seeded because PCA may pick a randomized SVD solver for a matrix this size.
    ('pca', PCA(n_components=60, random_state=0)),
    ('svc', SVC())
])

# Every combination of C and gamma is evaluated; the kernel is fixed to RBF.
grid2 = {
    'svc__C': [0.01, 0.1, 1],
    'svc__gamma': [0.001, 0.01, 0.1],
    'svc__kernel': ['rbf']
}

# With an integer `cv` and a classifier, GridSearchCV uses stratified folds.
grid_cv2 = GridSearchCV(pipe2, grid2, cv=5)
grid_cv2.fit(X_tr, y_tr)

# Show which result arrays are available for reporting.
grid_cv2.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_svc__C', 'param_svc__gamma', 'param_svc__kernel', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [12]:
# Collect the results in a DataFrame with a column for the mean and the standard
# deviation of the accuracy values across all folds.
# The DataFrame for the RBF kernel has one row per (C, γ) combination, with an
# additional column for the γ values.

import pandas as pd

# `pd.DataFrame.from_items` was deprecated in pandas 0.23 and removed in 1.0.
# A plain dict keeps insertion order, so the column order is unchanged.
df2 = pd.DataFrame({
    'C': grid_cv2.cv_results_['param_svc__C'],
    'γ_values': grid_cv2.cv_results_['param_svc__gamma'],
    'mean_score': grid_cv2.cv_results_['mean_test_score'],
    'std_score': grid_cv2.cv_results_['std_test_score'],
})
df2.sort_values(by='mean_score', ascending=False)


Unnamed: 0,C,γ_values,mean_score,std_score
6,1.0,0.001,0.79875,0.006275
5,0.1,0.1,0.68725,0.005668
2,0.01,0.1,0.68675,0.005734
3,0.1,0.001,0.63375,0.017357
0,0.01,0.001,0.53325,0.012263
7,1.0,0.01,0.3055,0.004783
4,0.1,0.01,0.25175,0.001275
1,0.01,0.01,0.2515,0.000935
8,1.0,0.1,0.2505,0.001


In [13]:
# Find the combination of C and γ with the best mean accuracy and print it.
#
# `np.argmax` on a Series followed by `series[idx]` mixes positional and
# label-based indexing (and triggered a warning in older pandas). `idxmax()`
# returns the index label of the best row, which is what `.loc` expects.
idx2 = df2['mean_score'].idxmax()
top_accuracy2 = df2.loc[idx2, 'mean_score']
std_score2 = df2.loc[idx2, 'std_score']
c2 = df2.loc[idx2, 'C']
g = df2.loc[idx2, 'γ_values']

print('RBF SVM - top accuracy across folds: {:.2f} (std:{:.3f}) with C: {} and gamma:{}'.format(top_accuracy2, std_score2, c2, g))


RBF SVM - top accuracy across folds: 0.80 (std:0.006) with C: 1 and gamma:0.001


  return getattr(obj, method)(*args, **kwds)


### Evaluate and report the accuracy of your (tuned) estimators on the 1,000 points from the test set

In [14]:
# Score both tuned grid searches on the 1,000 held-out test points.
# `accuracy` belongs to the linear SVM search, `accuracy2` to the RBF one.
accuracy = grid_cv.score(X_te, y_te)
accuracy2 = grid_cv2.score(X_te, y_te)

# Report the two test accuracies, linear kernel first.
print('Linear SVM accuracy (test set): {:.3f}'.format(accuracy))
print('SVM accuracy (test set): {:.3f}'.format(accuracy2))

Linear SVM accuracy (test set): 0.810
SVM accuracy (test set): 0.790
