# Support Vector Classifier 

This notebook implements a SVM classifier and hyperparameter optimization using the [SciKitLearn](https://scikit-learn.org/stable/index.html) library. The hyperparameter optimization is done using *Search Grid with 5-fold Cross Validation*. The search grid includes two different kernels; Linear and RBF, and four different setting for C (penalty term).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from load_data import loadVectors
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

The training and validation set are merged, as CV creates its' own train/test split.     

In [2]:
x_train, y_train, x_validation, y_validation, x_test = loadVectors(dataset='InceptionV3')

In [3]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5820,5821,5822,5823,5824,5825,5826,5827,5828,5829
0,1.371431,2.092545,1.093020,1.387524,1.438760,3.063610,1.591292,2.267667,2.857259,2.672630,...,1.367458,2.249361,1.064652,0.602162,1.747270,2.590123,1.138218,2.534963,1.644900,0.996088
1,1.502923,0.510090,0.929712,2.768782,0.965717,1.670165,0.803191,1.261082,1.090649,2.597242,...,1.700694,0.656707,2.706970,3.093016,2.711579,0.718074,1.482449,1.516199,1.961031,0.696918
2,0.887489,1.707803,1.957168,0.620260,2.430724,0.999203,0.415900,0.546068,1.828152,3.308660,...,1.067614,1.303069,1.990455,1.516998,2.904774,2.155441,5.318796,2.841984,1.229717,1.781883
3,5.226216,0.914659,2.783728,2.591378,1.230552,3.760301,2.365233,2.847474,2.204790,3.074935,...,1.331793,1.295771,1.285972,2.076959,0.587514,1.990263,0.678568,0.722305,0.645525,0.850026
4,1.765071,0.551044,1.837835,0.974528,1.127581,1.048469,0.979929,0.908846,0.750170,0.190195,...,1.072921,2.785437,1.254221,2.062121,1.681897,0.777145,1.250086,2.138139,1.384363,1.106558
5,4.107825,1.946057,2.327149,2.631435,2.018524,6.304446,2.702466,2.617238,2.334027,2.247442,...,2.924062,2.244819,2.815198,0.935240,1.395645,1.651482,0.747044,1.625701,1.205267,2.867030
6,2.626557,1.457486,2.153209,1.250985,3.381415,2.691927,1.607120,2.220424,1.873830,2.832218,...,2.386810,1.946781,2.117358,0.829164,2.038742,1.729182,3.416971,1.252684,1.540117,1.699632
7,0.448966,0.449818,3.027831,2.118118,3.981237,1.275095,1.544943,2.078724,1.354450,1.660231,...,0.427746,0.736098,0.742746,1.510058,1.990876,0.614202,1.594426,0.455506,0.547961,0.672731
8,3.137981,1.422489,1.757298,1.977100,1.443895,1.146972,2.088117,2.310195,2.482125,2.916892,...,0.482835,0.293675,3.072261,2.858102,1.149712,0.214998,1.545879,1.056745,1.030098,0.545902
9,0.744199,2.580198,2.231435,1.634035,3.355349,0.305408,0.627817,0.642131,2.709630,1.128094,...,0.564465,1.782054,5.405999,2.643258,0.683008,0.687777,1.718324,1.001517,2.209727,1.391773


In [4]:
x = pd.concat([x_train, x_validation], axis=1)
y = np.concatenate((y_train, y_validation))

In [5]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2288,2289,2290,2291,2292,2293,2294,2295,2296,2297
0,1.371431,2.092545,1.093020,1.387524,1.438760,3.063610,1.591292,2.267667,2.857259,2.672630,...,2.155714,1.302677,1.427715,2.808421,1.709521,1.066557,3.797564,1.063629,0.835364,1.792195
1,1.502923,0.510090,0.929712,2.768782,0.965717,1.670165,0.803191,1.261082,1.090649,2.597242,...,1.558514,1.707109,0.760898,1.912181,1.661762,1.267790,1.059502,0.697177,0.631462,1.564279
2,0.887489,1.707803,1.957168,0.620260,2.430724,0.999203,0.415900,0.546068,1.828152,3.308660,...,2.216427,0.885924,2.866622,1.127969,0.683133,3.808644,2.971708,2.867411,1.471288,2.241024
3,5.226216,0.914659,2.783728,2.591378,1.230552,3.760301,2.365233,2.847474,2.204790,3.074935,...,1.512526,0.751838,0.612303,2.738192,2.216061,1.109517,1.702540,1.274872,0.677820,1.332681
4,1.765071,0.551044,1.837835,0.974528,1.127581,1.048469,0.979929,0.908846,0.750170,0.190195,...,1.207686,1.540576,0.714643,1.463374,1.221905,0.113417,1.128181,1.117305,1.281804,2.044552
5,4.107825,1.946057,2.327149,2.631435,2.018524,6.304446,2.702466,2.617238,2.334027,2.247442,...,1.842758,0.780906,1.131464,2.484944,3.173844,2.105814,4.482261,3.190054,0.984614,2.040031
6,2.626557,1.457486,2.153209,1.250985,3.381415,2.691927,1.607120,2.220424,1.873830,2.832218,...,2.566863,1.211082,1.246376,3.317627,1.980058,1.594512,2.214745,1.114199,1.827001,0.946288
7,0.448966,0.449818,3.027831,2.118118,3.981237,1.275095,1.544943,2.078724,1.354450,1.660231,...,1.331672,1.077602,0.973955,2.027631,0.902070,0.852686,0.952020,1.398310,0.833094,1.588427
8,3.137981,1.422489,1.757298,1.977100,1.443895,1.146972,2.088117,2.310195,2.482125,2.916892,...,1.636316,0.427852,1.657695,2.177005,3.270567,2.545782,1.924869,0.381427,1.448429,0.913392
9,0.744199,2.580198,2.231435,1.634035,3.355349,0.305408,0.627817,0.642131,2.709630,1.128094,...,1.722844,0.963948,0.935739,-0.062635,2.679134,0.922713,0.324756,1.491180,1.017643,1.774796


Finding the best classifier requires setting up the hyperparameters and the wanted SVM classifier. 

In [None]:
svc = SVC()

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ('linear', 'rbf'),
    'gamma': [0.0001, 0.001]
}

clf = GridSearchCV(svc, param_grid, cv=5, return_train_score=False, verbose=30, n_jobs=-1)

clf.fit(x.T, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed: 22

In [28]:
# Support Vector Classifier
svc = SVC(gamma="scale")

pca = PCA()

pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'pca__n_components': [1000, 1500, 2000],
    'svc__C': [1000, 10000],
    'svc__kernel': ('linear', 'rbf')
}

# GridSearch returns the best classifier for the given hyperparameters
clf = GridSearchCV(pipe, param_grid, iid=False, cv=5,
                      return_train_score=False, verbose=20, n_jobs=-1)

Training the classifier using the entire labeled dataset.

In [29]:
clf.fit(x.T, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  9

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'pca__n_components': [1000, 1500, 2000], 'svc__C': [1000, 10000], 'svc__kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=20)

In [None]:
print("Best parameter (CV score=%0.3f):" % clf.best_score_)
print(clf.best_params_)

In [None]:
# Plot the PCA spectrum
pca.fit(x)

## Grid Search Results

In [None]:
Results = pd.DataFrame(clf.cv_results_)
Results

In [None]:
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(12, 8))
ax0.plot(pca.explained_variance_ratio_, linewidth=2)
ax0.set_ylabel('PCA explained variance')
ax0.axvline(clf.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
ax0.legend(prop=dict(size=12))

# For each number of components, find the best classifier results
components_col = 'param_pca__n_components'
best_clfs = Results.groupby(components_col).apply(
    lambda g: g.nlargest(1, 'mean_test_score'))

best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
               legend=False, ax=ax1)
ax1.set_ylabel('Classification accuracy (val)')
ax1.set_xlabel('n_components')

plt.tight_layout()
plt.show()

The found hyperparameter settings are used to predict the classes of the unlabeled test set.

In [None]:
predictions = clf.predict(x_test)

In [None]:
predictions

In [None]:
submission = pd.DataFrame({'id': np.arange(1,3461), 'label': predictions})
submission.to_csv("submissionSVM.csv",index=False)