# Support Vector Machine (SVM)

### Different strategies would be considered:
* Cross-validation with stratified k-fold of 5. We will apply a stratified approach for keeping the proportion of different classes during the process. Stratified kfolds performs the train/test split once while stratifiedshufflesplit will shuffle data each time before the split. The latter method would imply some overalp of the test sets. At this stage, we will focus on the stratified kfolds.

* PCA will be applied to optimize the computation time (c# of componenets explaining 80% and 95% of the variance will be tested)

* Standarscaler will be applied to optimize model perform, especially if SGDCClassifier will be performed

* In terms of models, we will perform first a SVC with linear kernel tuning C which will set a baseline. We will move to SVC with RBF kernel optimizing C and Gamma. 

* We would consider to perfom SGDClassifier depending on the results of the the first 2 approaches. It appears to be more adapted to large set of data but would require more effort for hyperparameter tuning

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid

from sklearn import datasets
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

import random
from sklearn import ensemble

from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# Activate Seaborn style
sns.set()

## Import data

In [2]:
# Load the npz file
with np.load('Cifar4-train.npz', allow_pickle=False) as cifar_df:
    # It's a dictionary-like object
    cifar4 = dict(cifar_df.items())

In [3]:
X=cifar4['overfeat']
y=cifar4['labels']
names=cifar4['names']

print(X.shape, X.dtype)
print(y.shape, y.dtype)
print(names.shape, names.dtype)

(5000, 4096) float32
(5000,) int64
(4,) <U8


### Split data in Train (4000) / Test (1000) data sets

In [4]:
# Convert the type of the input matrix to float
X = X.astype(np.float)

# Create train set
X_tr, X_te, y_tr, y_te = train_test_split(X,
    y,
    test_size=1000, random_state=0)

print('Train:', X_tr.shape, y_tr.shape)
print('Test:', X_te.shape, y_te.shape)

Train: (4000, 4096) (4000,)
Test: (1000, 4096) (1000,)


In [5]:
# Image distribution y_tr
pd.value_counts(y_tr, normalize=True)

1    0.25575
2    0.25075
3    0.24800
0    0.24550
dtype: float64

In [6]:
# Image distribution y_te
pd.value_counts(y_te, normalize=True)

0    0.268
3    0.258
2    0.247
1    0.227
dtype: float64

### **Create a SVM with linear kernel pipeline**

In [7]:
# Create SVM pipeline
pipe1 = Pipeline([
    ('scaler', StandardScaler()), # with standardization StandardScaler()
    ('PCA', PCA()),
    ('svc_linear', SVC(kernel='linear', random_state=0))  
])

In [8]:
pipe1.get_params()

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('PCA',
   PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
       svd_solver='auto', tol=0.0, whiten=False)),
  ('svc_linear', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
       decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
       kernel='linear', max_iter=-1, probability=False, random_state=0,
       shrinking=True, tol=0.001, verbose=False))],
 'verbose': False,
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'PCA': PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'svc_linear': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='linear', max_iter=-1, probability=False, random_state=0,
     shrinking=True, tol=0.001, verbose=False),
 'scale

### Define a cross-validation object with a grid of parameters for the SVC with linear kernel

In [9]:
# Create cross-validation object
grid_svc_linear = GridSearchCV(pipe1, [{
    'PCA__n_components':[175], # nb of components explaining 95% of the variance (previous run with 61 components; 80% explained has been tested)
    'svc_linear__C':[0.001,0.01,0.1], # range of C defining the moel complexity
    'svc_linear__decision_function_shape':['ovo', 'ovr'], # OnevsOne (ovo) and OnevsRest (ovr). The latter beiing more appropriate for multi-class strategy
}],return_train_score=True, cv=5)

# Fit estimator
grid_svc_linear.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('PCA',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc_linear',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_...
         

In [10]:
# Get the results with "cv_results_"
grid_svc_linear.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_PCA__n_components', 'param_svc_linear__C', 'param_svc_linear__decision_function_shape', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [11]:
# Collect results in a DataFrame
df_svc_linear = pd.DataFrame.from_items([
    ('PCA', grid_svc_linear.cv_results_['param_PCA__n_components']),
    ('C', grid_svc_linear.cv_results_['param_svc_linear__C']),
    ('Decision', grid_svc_linear.cv_results_['param_svc_linear__decision_function_shape']),
    ('mean_tr', grid_svc_linear.cv_results_['mean_train_score']),
    ('mean_te', grid_svc_linear.cv_results_['mean_test_score']),
    ('std_te_score', grid_svc_linear.cv_results_['std_test_score']),
    
])
df_svc_linear.sort_values(by='mean_te', ascending=False)

Unnamed: 0,PCA,C,Decision,mean_tr,mean_te,std_te_score
1,175,0.001,ovr,0.886064,0.833,0.014781
0,175,0.001,ovo,0.886814,0.83225,0.014909
3,175,0.01,ovr,0.91594,0.8205,0.014741
2,175,0.01,ovo,0.915564,0.8155,0.013574
5,175,0.1,ovr,0.936628,0.79725,0.012635
4,175,0.1,ovo,0.937502,0.79675,0.015706


### Find the best combination of parameters

In [12]:
# Find best combination
idx = np.argmax(df_svc_linear['mean_te'])
print('Top test accuracy on set {:}'.format(idx))

print('Accuracy on test set {:.3f}'.format(
     df_svc_linear.mean_te[idx]
     ))



Top test accuracy on set 1
Accuracy on test set 0.833


### Save SVM Linear results

In [13]:
SVC_Linear=0.829
%store SVC_Linear

Stored 'SVC_Linear' (float)


### **Create a SVM with RBF kernel pipeline**

In [14]:
# Create SVM pipeline
pipe2 = Pipeline([
    ('scaler', StandardScaler()), # with standardization StandardScaler()
    ('PCA', PCA(n_components=175)), # 175 components to explain 95% of the variance (ref. Data Exploration section)
    ('svc_RBF', SVC(kernel='rbf', random_state=0))  
])

In [15]:
pipe2.get_params()

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('PCA',
   PCA(copy=True, iterated_power='auto', n_components=175, random_state=None,
       svd_solver='auto', tol=0.0, whiten=False)),
  ('svc_RBF', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
       decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
       kernel='rbf', max_iter=-1, probability=False, random_state=0,
       shrinking=True, tol=0.001, verbose=False))],
 'verbose': False,
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'PCA': PCA(copy=True, iterated_power='auto', n_components=175, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'svc_RBF': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='rbf', max_iter=-1, probability=False, random_state=0,
     shrinking=True, tol=0.001, verbose=False),
 'scaler__copy': True

### Define a cross-validation object with a grid of parameters for the SVC with RBF kernel

In [16]:
# Create cross-validation object
grid_svc_rbf = GridSearchCV(pipe2, [{
    'PCA__n_components':[175], # nb of components explaining 95% of the variance (previous run with 61 components; 80% explained has been tested)
    'svc_RBF__C':[10, 50], # range of C defining the model complexity (tested but not good: 1, 0.1)
    'svc_RBF__gamma':[0.001, 0.0001,0.00001], # try lower than 0.001
    'svc_RBF__decision_function_shape':['ovr', 'ovo'], # OnevsRest (ovr)
}],cv=5)

# Fit estimator
grid_svc_rbf.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('PCA',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=175, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc_RBF',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_func...
                                            kernel='r

In [18]:
# Get the results with "cv_results_"
grid_svc_rbf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_PCA__n_components', 'param_svc_RBF__C', 'param_svc_RBF__decision_function_shape', 'param_svc_RBF__gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

### Collect results in a dataframe

In [19]:
# Collect results in a DataFrame
df_svc_rbf = pd.DataFrame.from_items([
    ('PCA', grid_svc_rbf.cv_results_['param_PCA__n_components']),
    ('C', grid_svc_rbf.cv_results_['param_svc_RBF__gamma']),
    ('Gamma', grid_svc_rbf.cv_results_['param_svc_RBF__C']),
    ('Decision', grid_svc_rbf.cv_results_['param_svc_RBF__decision_function_shape']),
    ('mean_te', grid_svc_rbf.cv_results_['mean_test_score']),
    ('std_te_score', grid_svc_rbf.cv_results_['std_test_score']),
    
])
df_svc_rbf.sort_values(by='mean_te', ascending=False)

Unnamed: 0,PCA,C,Gamma,Decision,mean_te,std_te_score
4,175,0.0001,10,ovo,0.8425,0.01228
1,175,0.0001,10,ovr,0.83875,0.01209
5,175,1e-05,10,ovo,0.83425,0.005819
8,175,1e-05,50,ovr,0.83325,0.010828
7,175,0.0001,50,ovr,0.83175,0.010381
10,175,0.0001,50,ovo,0.83175,0.009067
2,175,1e-05,10,ovr,0.83125,0.006849
11,175,1e-05,50,ovo,0.82975,0.013941
3,175,0.001,10,ovo,0.78575,0.006465
9,175,0.001,50,ovo,0.78575,0.00693


### Save SVC RBF results

In [1]:
SVC_RBF=0.843
%store SVC_RBF

Stored 'SVC_RBF' (float)
