# 02 (Cont'd)

## 02.01 Importing modules and styles

In [1]:
import pandas as pd #used
import numpy as np #used
import pickle
import os
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline #used
from sklearn.preprocessing import StandardScaler #used
from sklearn.decomposition import PCA #used
from sklearn.model_selection import StratifiedShuffleSplit #used
from sklearn.svm import SVC #used
from sklearn.model_selection import GridSearchCV #used

## 02.02 Importing data


In [2]:
# Load train data
with np.load(os.path.join('data', 'cifar4-train.npz'), allow_pickle=False) as npz_file:
    cifar_tr = dict(npz_file.items())
with np.load(os.path.join('data', 'cifar4-test.npz'), allow_pickle=False) as npz_file:
    cifar_te = dict(npz_file.items())
X_cifar_pixels_tr=cifar_tr['pixels']
X_cifar_overfeat_tr=cifar_tr['overfeat']
y_cifar_tr=cifar_tr['labels']
X_cifar_pixels_te=cifar_te['pixels']
X_cifar_overfeat_te=cifar_te['overfeat']

## 02.03 Splitting data

In [3]:
# Creating a test set
ssplit = StratifiedShuffleSplit(n_splits=500, test_size=1000, random_state=0)

for train_index, test_index in ssplit.split(X_cifar_pixels_tr, y_cifar_tr):
    X_pixels_tr, X_pixels_te = X_cifar_pixels_tr[train_index], X_cifar_pixels_tr[test_index]
    X_overfeat_tr, X_overfeat_te = X_cifar_overfeat_tr[train_index], X_cifar_overfeat_tr[test_index]
    y_tr, y_te = y_cifar_tr[train_index], y_cifar_tr[test_index]

In [4]:
# Verifying shapes
print('Train sets shapes')
print('Pixels:',X_pixels_tr.shape)
print('Overfeat features:',X_overfeat_tr.shape)
print('Labels:',y_tr.shape)
print()
print('Test sets shapes')
print('Pixels:',X_pixels_te.shape)
print('Overfeat features:',X_overfeat_te.shape)
print('Labels:',y_te.shape)

Train sets shapes
Pixels: (4000, 3072)
Overfeat features: (4000, 4096)
Labels: (4000,)

Test sets shapes
Pixels: (1000, 3072)
Overfeat features: (1000, 4096)
Labels: (1000,)


# 04 Support vector machines

## 04.01 PCA preprocessing

Due to the limited processing capabilities available, and the heavy computational requirement of SVM, which adds space dimensions, I will use again a PCA of 175 components explaining 90% of the variance.

In [5]:
# Reducing the Overfeat features with the PCA using 175 components
scaler=StandardScaler()
X_Of_tr_r=scaler.fit_transform(X_overfeat_tr) #standardising features
pca175=PCA(n_components=175)
X_train=pca175.fit_transform(X_Of_tr_r) #WARNING: X_train here corresponds to X_Of_tr_175r in previous model tasks
pve = pca175.explained_variance_ratio_ # Variance explained by components
print('Shape of preprocessed Overfeat matrix:',X_train.shape)
print('Percentage of variance explained: {:.2f}'.format(np.max(np.cumsum(pve))))

Shape of preprocessed Overfeat matrix: (4000, 175)
Percentage of variance explained: 0.90


## 04.02 SVM with linear kernel and GridSearch CV tuning

In [6]:
# Defining the grid

n_jobs=-1

pipe=Pipeline([('scaler',None),('svc',SVC())]) # Pipeline with Support vector machine

grid_cv_lr=GridSearchCV(pipe,[{'svc__kernel':['linear'], # setting the linear model in the gridsearch
                            'svc__C':[0.001,0.005,0.01], # settiung parameters for C constant
                            }],
                     verbose=10,
                     cv=5) #5 folds

After a first run attempt with C=[0.1, 1, 10], which revealed eternal, I stopped the process and reset smaller values instead.

In [7]:
# Fitting the grid

grid_cv_lr.fit(X_train,y_tr)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] svc__C=0.001, svc__kernel=linear ................................
[CV] .. svc__C=0.001, svc__kernel=linear, score=0.82875, total=   1.2s
[CV] svc__C=0.001, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV] .. svc__C=0.001, svc__kernel=linear, score=0.81625, total=   1.1s
[CV] svc__C=0.001, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s


[CV] .. svc__C=0.001, svc__kernel=linear, score=0.83875, total=   1.1s
[CV] svc__C=0.001, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.4s remaining:    0.0s


[CV] .. svc__C=0.001, svc__kernel=linear, score=0.83625, total=   1.2s
[CV] svc__C=0.001, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.5s remaining:    0.0s


[CV] .. svc__C=0.001, svc__kernel=linear, score=0.81125, total=   1.1s
[CV] svc__C=0.005, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.6s remaining:    0.0s


[CV] ... svc__C=0.005, svc__kernel=linear, score=0.8125, total=   1.2s
[CV] svc__C=0.005, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.8s remaining:    0.0s


[CV] ... svc__C=0.005, svc__kernel=linear, score=0.8125, total=   1.2s
[CV] svc__C=0.005, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   14.9s remaining:    0.0s


[CV] .. svc__C=0.005, svc__kernel=linear, score=0.82875, total=   1.2s
[CV] svc__C=0.005, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.0s remaining:    0.0s


[CV] .. svc__C=0.005, svc__kernel=linear, score=0.82625, total=   1.2s
[CV] svc__C=0.005, svc__kernel=linear ................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   19.1s remaining:    0.0s


[CV] ... svc__C=0.005, svc__kernel=linear, score=0.8125, total=   1.2s
[CV] svc__C=0.01, svc__kernel=linear .................................
[CV] ... svc__C=0.01, svc__kernel=linear, score=0.80875, total=   1.4s
[CV] svc__C=0.01, svc__kernel=linear .................................
[CV] ... svc__C=0.01, svc__kernel=linear, score=0.81125, total=   1.4s
[CV] svc__C=0.01, svc__kernel=linear .................................
[CV] ... svc__C=0.01, svc__kernel=linear, score=0.81875, total=   1.5s
[CV] svc__C=0.01, svc__kernel=linear .................................
[CV] ... svc__C=0.01, svc__kernel=linear, score=0.82125, total=   1.5s
[CV] svc__C=0.01, svc__kernel=linear .................................
[CV] ..... svc__C=0.01, svc__kernel=linear, score=0.795, total=   1.4s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   32.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', None), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'svc__kernel': ['linear'], 'svc__C': [0.001, 0.005, 0.01]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [8]:
# Creating the DataFrame with the grid values

lr_df=pd.DataFrame.from_items([('c',grid_cv_lr.cv_results_['param_svc__C']),
                           ('mean_te',grid_cv_lr.cv_results_['mean_test_score']), #mean accuracy
                           ('std_te',grid_cv_lr.cv_results_['std_test_score']), #stdv of set folds accuracy
                           ])

In [9]:
lr_df.sort_values(by='mean_te',ascending=False)

Unnamed: 0,c,mean_te,std_te
0,0.001,0.82625,0.01084
1,0.005,0.8185,0.007391
2,0.01,0.811,0.009233


The accuracy uis relatively high with a low regularization.

## 04.03 SVM with RBF kernel and GridSearch CV tuning

In [10]:
# Defining the grid
n_jobs=-1

pipe=Pipeline([('scaler',None),('svc',SVC())])

grid_cv_rbf=GridSearchCV(pipe,[{'svc__kernel':['rbf'], #setting the RBF kernel trick
                            'svc__gamma':[0.0001,100],
                            'svc__C':[0.001,1],
                            }],
                     verbose=10,
                     cv=5)

In [11]:
# Fitting the grid onto the train set

grid_cv_rbf.fit(X_train,y_tr)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf ................
[CV]  svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf, score=0.6975, total=   4.6s
[CV] svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf, score=0.66875, total=   4.6s
[CV] svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.2s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf, score=0.68625, total=   4.6s
[CV] svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf ................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   21.3s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf, score=0.70125, total=   4.7s
[CV] svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf ................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   28.4s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=0.0001, svc__kernel=rbf, score=0.68375, total=   4.6s
[CV] svc__C=0.001, svc__gamma=100, svc__kernel=rbf ...................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.5s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=0.001, svc__gamma=100, svc__kernel=rbf ...................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   44.0s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=0.001, svc__gamma=100, svc__kernel=rbf ...................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   52.5s remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=0.001, svc__gamma=100, svc__kernel=rbf ...................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.0min remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=0.001, svc__gamma=100, svc__kernel=rbf ...................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.2min remaining:    0.0s


[CV]  svc__C=0.001, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=1, svc__gamma=0.0001, svc__kernel=rbf ....................
[CV]  svc__C=1, svc__gamma=0.0001, svc__kernel=rbf, score=0.82625, total=   1.8s
[CV] svc__C=1, svc__gamma=0.0001, svc__kernel=rbf ....................
[CV]  svc__C=1, svc__gamma=0.0001, svc__kernel=rbf, score=0.82375, total=   1.8s
[CV] svc__C=1, svc__gamma=0.0001, svc__kernel=rbf ....................
[CV]  svc__C=1, svc__gamma=0.0001, svc__kernel=rbf, score=0.8375, total=   1.8s
[CV] svc__C=1, svc__gamma=0.0001, svc__kernel=rbf ....................
[CV]  svc__C=1, svc__gamma=0.0001, svc__kernel=rbf, score=0.82875, total=   1.8s
[CV] svc__C=1, svc__gamma=0.0001, svc__kernel=rbf ....................
[CV]  svc__C=1, svc__gamma=0.0001, svc__kernel=rbf, score=0.8175, total=   1.8s
[CV] svc__C=1, svc__gamma=100, svc__kernel=rbf .......................
[CV]  svc__C=1, svc__gamma=100, svc__kernel=rbf, score=0.25, total=   5.5s
[CV] svc__C=1, sv

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', None), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'svc__kernel': ['rbf'], 'svc__gamma': [0.0001, 100], 'svc__C': [0.001, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [12]:
# Creating the DataFrame with the grid values
rbf_df=pd.DataFrame.from_items([('c',grid_cv_rbf.cv_results_['param_svc__C']),
                            ('gamma',grid_cv_rbf.cv_results_['param_svc__gamma']),
                            ('mean_te',grid_cv_rbf.cv_results_['mean_test_score']),
                            ('std_te',grid_cv_rbf.cv_results_['std_test_score'])
                           ])

In [13]:
rbf_df.sort_values(by='mean_te',ascending=False)

Unnamed: 0,c,gamma,mean_te,std_te
2,1.0,0.0001,0.82675,0.006548
0,0.001,0.0001,0.6875,0.011456
1,0.001,100.0,0.25,0.0
3,1.0,100.0,0.25,0.0


In [18]:
# Extracting the best SVM parameters from the DataFrames
c_best_rbf=rbf_df.loc[rbf_df['mean_te'].argmax(),'c']
gamma_best_rbf=rbf_df.loc[rbf_df['mean_te'].argmax(),'gamma']
c_best_lr=lr_df.loc[lr_df['mean_te'].argmax(),'c']
mean_te_best_lr=lr_df.loc[lr_df['mean_te'].argmax(),'mean_te']
mean_te_best_rbf=rbf_df.loc[rbf_df['mean_te'].argmax(),'mean_te']
std_te_best_lr=lr_df.loc[lr_df['mean_te'].argmax(),'std_te']
std_te_best_rbf=rbf_df.loc[rbf_df['mean_te'].argmax(),'std_te']

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  


In [19]:
# Printing the best SVM parameters
print ('Best parameters')
print ('Linear SVM - top accuracy accross folds:{:.2f}'.format(mean_te_best_lr),'(std:{:.3f}'.format(std_te_best_lr),') with C:',c_best_lr)
print ('RBF SVM    - top accuracy accross folds:{:.2f}'.format(mean_te_best_rbf),'(std:{:.3f}'.format(std_te_best_rbf),') with C:',c_best_rbf,'     and Gamma:',gamma_best_rbf)

Best parameters
Linear SVM - top accuracy accross folds:0.83 (std:0.011 ) with C: 0.001
RBF SVM    - top accuracy accross folds:0.83 (std:0.007 ) with C: 1      and Gamma: 0.0001


Note that we obtain the best score of the RBF method with a large regularization and a very small gamma.
With these parameters the RBF method behaves like a linear method. Actually the accuracies of the linear and the RBF
methods are similar.

## 04.04 Evaluating the tuned estimators on the test set

In [20]:
# Reducing the Overfeat features of the test set with the PCA using 175 components tuned on the train set
X_Of_te_r=scaler.fit_transform(X_overfeat_te) #standardising features
X_test=pca175.fit_transform(X_Of_te_r) #WARNING: X_test here corresponds to X_Of_te_175r in previous model tasks
pve = pca175.explained_variance_ratio_ # Variance explained by components
print('Shape of preprocessed Overfeat matrix:',X_test.shape)
print('Percentage of variance explained: {:.2f}'.format(np.max(np.cumsum(pve))))

Shape of preprocessed Overfeat matrix: (1000, 175)
Percentage of variance explained: 0.92


In [22]:
svc=SVC(kernel='linear',C=c_best_lr)
svc.fit(X_train,y_tr)
print('Linear SVM - top accuracy (test set):',svc.score(X_test,y_te))

rbf=SVC(kernel='rbf',C=c_best_rbf,gamma=gamma_best_rbf)
rbf.fit(X_train,y_tr)
print('RBF SVM - top accuracy (test set):',rbf.score(X_test,y_te))


Linear SVM - top accuracy (test set): 0.345
RBF SVM - top accuracy (test set): 0.358
