# Support Vector Classifier 

This notebook implements an SVM classifier with hyperparameter optimization using the [scikit-learn](https://scikit-learn.org/stable/index.html) library. The hyperparameter optimization is done using *grid search with 5-fold cross-validation*. The model is a pipeline of PCA followed by an RBF-kernel SVC, and the search grid covers the number of PCA components and three different settings for C (the penalty term).

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from load_data import loadVectors
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

The training and validation sets are merged, because cross-validation creates its own train/test splits.

In [3]:
# Load the 'AlexNet' vectors; the test split is returned without labels.
x_train, y_train, x_validation, y_validation, x_test = loadVectors(dataset='AlexNet')

# Stack the train and validation parts into one labeled set; the
# cross-validation folds are drawn from this combined data.
y = np.concatenate([y_train, y_validation])
x = pd.concat((x_train, x_validation))

In [7]:
# Display the merged input frame as a quick sanity check.
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.000000,5.7890,0.000000,0.566120,0.991310,4.57170,0.00000,5.17030,3.926600,3.49120,...,0.000000,0.00000,0.000000,0.0,1.53810,0.0,0.000000,0.00000,0.00000,0.00000
1,0.000000,3.9499,0.000000,0.290120,0.000000,3.16790,0.00000,5.59880,1.682400,0.00000,...,0.000000,0.00000,0.000000,0.0,2.25540,0.0,1.260000,0.33829,0.00000,0.00000
2,0.000000,3.8154,0.000000,0.000000,0.000000,2.85670,0.00000,4.15960,0.000000,3.31560,...,0.000000,0.00000,0.000000,0.0,2.79000,0.0,0.380050,0.00000,0.00000,1.49000
3,0.000000,4.7163,0.000000,2.667200,0.000000,3.92620,0.00000,5.61870,0.000000,0.00000,...,0.000000,0.00000,0.000000,0.0,2.22570,0.0,3.270600,0.00000,0.00000,0.00000
4,0.000000,5.4124,0.000000,3.093400,1.965900,3.38470,0.00000,4.32340,2.098400,0.00000,...,0.000000,0.00000,0.000000,0.0,2.00150,0.0,2.076700,0.00000,0.00000,0.00000
5,0.000000,0.0000,0.000000,1.202200,1.485400,3.64580,0.00000,2.90080,0.278280,1.91810,...,0.000000,0.00000,0.000000,0.0,0.00000,0.0,2.059400,0.42641,0.00000,0.00000
6,0.000000,5.6737,0.008881,0.000000,0.516980,4.05620,0.00000,5.46130,2.538300,1.78780,...,0.000000,0.00000,0.000000,0.0,2.51220,0.0,1.030900,0.00000,0.00000,2.63030
7,0.000000,3.2042,0.000000,2.395600,0.000000,3.28250,0.00000,5.23560,0.000000,3.98050,...,0.000000,0.00000,0.000000,0.0,1.78330,0.0,1.690600,0.00000,0.00000,0.00000
8,0.000000,2.9760,0.000000,0.858220,0.524980,0.91876,0.00000,2.99010,0.000000,1.06370,...,0.000000,0.00000,0.000000,0.0,0.00000,0.0,4.468100,0.00000,0.00000,0.00000
9,0.000000,0.0000,0.000000,0.000000,0.000000,2.17730,0.00000,4.04580,0.000000,1.51590,...,0.000000,0.00000,0.000000,0.0,2.85720,0.0,0.000000,0.00000,2.61480,0.00000


Finding the best classifier requires defining the SVM pipeline and the grid of hyperparameters to search over.

In [4]:
# RBF-kernel support vector classifier; C is tuned by the grid search below.
svc = SVC(gamma="scale", kernel='rbf')

# PCA is applied before the SVC. With the default svd_solver='auto', PCA may
# select the randomized solver, so the seed is fixed to make reruns reproduce
# the same projection (and therefore the same scores).
pca = PCA(random_state=0)

pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

# Parameters of pipeline steps are addressed as '<step name>__<parameter name>'.
param_grid = {
    'pca__n_components': [2200],
    'svc__C': [5, 10, 15],
}

# 5-fold cross-validated grid search over param_grid, using all available cores.
# NOTE: `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; drop the
# argument when running this notebook on a newer release.
clf = GridSearchCV(pipe, param_grid, iid=False, cv=5,
                   return_train_score=False, verbose=20, n_jobs=-1)

Training the classifier using the entire labeled dataset.

In [5]:
# Run the grid search on the merged labeled data (train + validation).
clf.fit(X=x, y=y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed: 20.1min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 20.2min remaining: 10.1min
[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed: 20.3min remaining:  7.4min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 20.3min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done  13 out of  15 | elapsed: 24.3min remaining:  3.7min
[Parallel(n_jobs=-1)]: 

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'pca__n_components': [2200], 'svc__C': [5, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=20)

In [6]:
# Report the best mean cross-validated score, then the parameters that achieved it.
print("Best parameter (CV score=%0.3f):" % clf.best_score_,
      clf.best_params_,
      sep="\n")

Best parameter (CV score=0.837):
{'pca__n_components': 2200, 'svc__C': 5}


In [None]:
# Fit the standalone PCA object on all labeled data so that its
# explained_variance_ratio_ is available for the spectrum plot further down.
# This cell only fits; it does not plot anything itself.
# NOTE(review): GridSearchCV is documented to fit clones of the estimator, so
# this `pca` object is presumably still unfitted at this point — confirm.
pca.fit(x)

## Grid Search Results

In [None]:
# Tabulate the cross-validation results recorded by the grid search.
Results = pd.DataFrame(data=clf.cv_results_)
Results

In [None]:
# Two stacked panels that share the x axis (number of PCA components).
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(12, 8))

# Top panel: explained-variance ratio per component of the fitted `pca`,
# with a vertical marker at the n_components value of the best estimator.
ax0.plot(pca.explained_variance_ratio_, linewidth=2)
ax0.set_ylabel('PCA explained variance')
ax0.axvline(clf.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
ax0.legend(prop=dict(size=12))

# For each number of components, keep the candidate with the best mean CV score.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# apply() operating on the grouping column is deprecated in pandas >= 2.2, and
# that column has to stay a regular column because it is the x axis below.
components_col = 'param_pca__n_components'
best_clfs = pd.concat(
    [group.nlargest(1, 'mean_test_score')
     for _, group in Results.groupby(components_col)])

# Bottom panel: best mean test score per n_components, with the standard
# deviation across folds as error bars.
best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
               legend=False, ax=ax1)
ax1.set_ylabel('Classification accuracy (val)')
ax1.set_xlabel('n_components')

plt.tight_layout()
plt.show()

The best hyperparameter settings found by the grid search are used to predict the classes of the unlabeled test set.

In [None]:
# Predict labels for the unlabeled test vectors with the fitted grid search.
predictions = clf.predict(X=x_test)

In [None]:
# Inspect the predicted labels before they are written to the submission file.
predictions

In [None]:
# Build the submission table: ids are 1-based and follow the order of the
# predictions. The id range is derived from the number of predictions instead
# of a hard-coded 3460, so the cell does not fail with a length mismatch if
# the size of the test set changes.
submission = pd.DataFrame({
    'id': np.arange(1, len(predictions) + 1),
    'label': predictions,
})
submission.to_csv("submissionSVM.csv", index=False)