In [3]:
import math
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import csv
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import polynomial_kernel
from numpy.linalg import inv, norm
from mpl_toolkits.mplot3d import axes3d
from numpy import mean
%matplotlib inline

### Import data from csv
Store the data as a numpy array. \
Shape of the full dataset: 
9298 observations (rows), 257 cols, where column 1 corresponds to the true label of the digit, while the cols 2:257 store greyscale values.


In [4]:
# Import to df
data_df = pd.read_csv("zipcombo2.csv")

# Convert to array to facilitate vectorization later
data = data_df.to_numpy()
X = data[:, 1:]
y = data[:, 0]

### SVC IMPLEMENTATION

In [67]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

### Time complexity

In [66]:
times = []
for i in range(20):
    startTime = time.perf_counter()

    clf = SVC(C = c_, gamma='auto')
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error

    t = time.perf_counter() - startTime
    print("Run: " + str(i+1) + "t: " + str(t))
    times.append(t)

In [69]:
print("Mean computational time: " + str(np.mean(times)) +"+/-" + str(np.std(times)) + " seconds.")

Mean computational time: 14.084167625001282+/-1.043109053152576 seconds.


## Experimenting with different parameters

We will start our model selection by selecting whether we will use the polynomial kernel or the gaussian kernel. To make this decision, we train the model on both, gaussian and polynomial kernel for a few different parameters of gamma and c.

In [79]:
## Code used to compare gaussian and polynomial kernel

### RBF Kernel
train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [0.000001, 0.00001, 0.001, 0.1 ]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = c_, gamma='auto', kernel = 'rbf')
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    print("Run: " + str(i) + " Train Error " + str(train_error)+ " Test Error: " + str(test_error))
 

Run: 0 Train Error 83.51707448238774 Test Error: 82.41935483870968
Run: 1 Train Error 83.51707448238774 Test Error: 82.41935483870968
Run: 2 Train Error 83.51707448238774 Test Error: 82.41935483870968
Run: 3 Train Error 5.041677870395262 Test Error: 6.075268817204304


In [78]:
### Polynomial Kernel
train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [0.000001, 0.00001, 0.001, 0.1 ]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = c_, gamma='auto', kernel = 'poly')
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    print("Run: " + str(i) + " Train Error " + str(train_error)+ " Test Error: " + str(test_error))

Run: 0 Train Error 83.14062920139823 Test Error: 83.9247311827957
Run: 1 Train Error 83.14062920139823 Test Error: 83.9247311827957
Run: 2 Train Error 83.14062920139823 Test Error: 83.9247311827957
Run: 3 Train Error 5.754235009411133 Test Error: 6.72043010752688


In [19]:
## Code used to explore heuristic ranges of c

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = c_, gamma=0.1, kernel = 'rbf')
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)

In [20]:
test_errs

[82.95698924731182,
 82.95698924731182,
 24.408602150537632,
 6.344086021505376,
 3.225806451612897,
 2.4731182795699027,
 2.4731182795699027]

In [27]:
## Code used to explore heuristic ranges of gamma

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 10^3]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = 10, gamma=c_)
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    
    print("Run: " + str(i) + " Train Error " + str(train_error) + " Test Error: " + str(test_error))
 

Run: 0 Train Error 4.127453616563599 Test Error: 5.430107526881727
Run: 1 Train Error 1.1158913686474818 Test Error: 3.4408602150537604
Run: 2 Train Error 0.026888948642110222 Test Error: 2.4193548387096797
Run: 3 Train Error 0.013444474321062216 Test Error: 52.47311827956989
Run: 4 Train Error 0.0 Test Error: 75.75268817204301
Run: 5 Train Error 0.0 Test Error: 82.41935483870968
Run: 6 Train Error 0.0 Test Error: 82.41935483870968


In [29]:
## Narrowing further the range of gamma

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [ 0.001, 0.005, 0.01, 0.025, 0.05]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = 10, gamma=c_)
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    
    print("Run: " + str(i) + " gamma: "+ str(c_) + " Train Error " + str(train_error) + " Test Error: " + str(test_error))
 

Run: 0 gamma: 0.001 Train Error 1.3847808550685698 Test Error: 2.741935483870975
Run: 1 gamma: 0.005 Train Error 0.026888948642110222 Test Error: 2.1505376344086073
Run: 2 gamma: 0.01 Train Error 0.0 Test Error: 1.8817204301075208
Run: 3 gamma: 0.025 Train Error 0.0 Test Error: 3.0645161290322562
Run: 4 gamma: 0.05 Train Error 0.0 Test Error: 15.752688172043008


In [58]:
## Picking the optimum gamma value

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [0.009, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015, 0.017, 0.018]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = 10, gamma=c_)
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    
    print("Run: " + str(i) + " gamma: "+ str(c_) + " Train Error " + str(train_error) + " Test Error: " + str(test_error))
 

Run: 0 gamma: 0.009 Train Error 0.026888948642110222 Test Error: 1.7204301075268802
Run: 1 gamma: 0.01 Train Error 0.026888948642110222 Test Error: 1.827956989247312
Run: 2 gamma: 0.011 Train Error 0.026888948642110222 Test Error: 1.774193548387089
Run: 3 gamma: 0.012 Train Error 0.026888948642110222 Test Error: 1.774193548387089
Run: 4 gamma: 0.013 Train Error 0.026888948642110222 Test Error: 1.8817204301075208
Run: 5 gamma: 0.014 Train Error 0.026888948642110222 Test Error: 1.9354838709677438
Run: 6 gamma: 0.015 Train Error 0.026888948642110222 Test Error: 2.0430107526881613
Run: 7 gamma: 0.017 Train Error 0.026888948642110222 Test Error: 2.0967741935483843
Run: 8 gamma: 0.018 Train Error 0.026888948642110222 Test Error: 2.0967741935483843


In [59]:
## Code used to explore heuristic ranges of c

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [1000, 10**4,10**7, 10**15]

for i, c_ in enumerate(trial_c):
    
    clf = SVC(C = c_, gamma=0.01)
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
    train_errs.append(train_error)
    test_errs.append(test_error)
    
    print("Run: " + str(i) + " c: "+ str(c_) + " Train Error " + str(train_error) + " Test Error: " + str(test_error))
 

Run: 0 c: 1000 Train Error 0.0 Test Error: 2.0967741935483843
Run: 1 c: 10000 Train Error 0.0 Test Error: 2.0967741935483843
Run: 2 c: 10000000 Train Error 0.0 Test Error: 2.0967741935483843
Run: 3 c: 1000000000000000 Train Error 0.0 Test Error: 2.0967741935483843


In [42]:
## Rough grid search

train_errs = []
test_errs = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

trial_c = [10, 100, 1000]
trial_gamma = [0.009, 0.01, 0.011]

for i, c_ in enumerate(trial_c):
    for gamma_ in trial_gamma:
        clf = SVC(C = c_, gamma = gamma_)
        clf.fit(X_train, y_train)
        train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
        test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error
        train_errs.append(train_error)
        test_errs.append(test_error)

        print("Run: " + str(i) + " c: "+ str(c_) + " gamma : " + str(gamma_) + " Train Error " + str(train_error) + " Test Error: " + str(test_error))


Run: 0 c: 10 gamma : 0.009 Train Error 0.026888948642110222 Test Error: 2.6344086021505433
Run: 0 c: 10 gamma : 0.01 Train Error 0.026888948642110222 Test Error: 2.6344086021505433
Run: 0 c: 10 gamma : 0.011 Train Error 0.026888948642110222 Test Error: 2.6344086021505433
Run: 1 c: 100 gamma : 0.009 Train Error 0.013444474321062216 Test Error: 2.6344086021505433
Run: 1 c: 100 gamma : 0.01 Train Error 0.013444474321062216 Test Error: 2.6344086021505433
Run: 1 c: 100 gamma : 0.011 Train Error 0.013444474321062216 Test Error: 2.6344086021505433
Run: 2 c: 1000 gamma : 0.009 Train Error 0.0 Test Error: 2.688172043010752
Run: 2 c: 1000 gamma : 0.01 Train Error 0.0 Test Error: 2.688172043010752
Run: 2 c: 1000 gamma : 0.011 Train Error 0.0 Test Error: 2.688172043010752


In [None]:
# Hence we choose a gaussian kernel with gamma = 0.01 and iterate over the c-values

### 1.1 Basic Results

In [None]:
### using gamma = 0.017

In [60]:
## 20 runs for d = {1,..,8}

train_errors = []
train_sd = []
test_errors = []
test_sd = []

for c_ in [1,10,20,30,40,50,60,70,80]:
    
    run_train_errors = []
    run_test_errors = []
    
    for run in range(20):
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
        
        clf = SVC(C = c_, gamma = 0.017)
        clf.fit(X_train, y_train)
        train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
        test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error

        run_train_errors.append(train_error)
        run_test_errors.append(test_error)
        
    train_errors.append(np.mean(run_train_errors))
    print("C-value: " + str(c_) + " Average % train error: "+ str(np.mean(run_train_errors)))
    test_errors.append(np.mean(run_test_errors))
    test_sd.append(np.std(run_test_errors))
    train_sd.append(np.std(run_train_errors))

C-value: 1 Average % train error: 0.15595590212422863
C-value: 10 Average % train error: 0.02218338262974271
C-value: 20 Average % train error: 0.010755579456849773
C-value: 30 Average % train error: 0.010083355740796662
C-value: 40 Average % train error: 0.010755579456849773
C-value: 50 Average % train error: 0.010755579456849773
C-value: 60 Average % train error: 0.010755579456849773
C-value: 70 Average % train error: 0.008738908308690441
C-value: 80 Average % train error: 0.008738908308690441


In [61]:
# Create a df to store the error results
errors_df = pd.DataFrame()
errors_df['d'] = [1,10,20,30,40,50,60,70,80]
errors_df['Train Error %'] = train_errors
errors_df['+/- Train %'] = train_sd
errors_df['Test Error %'] = test_errors
errors_df['+/- Test %'] = test_sd

In [62]:
errors_df

Unnamed: 0,d,Train Error %,+/- Train %,Test Error %,+/- Test %
0,1,0.155956,0.030183,2.497312,0.269072
1,10,0.022183,0.007694,2.209677,0.241577
2,20,0.010756,0.005378,2.376344,0.356059
3,30,0.010083,0.005822,2.276882,0.368299
4,40,0.010756,0.005378,2.370968,0.304559
5,50,0.010756,0.005378,2.311828,0.327913
6,60,0.010756,0.005378,2.395161,0.292245
7,70,0.008739,0.006413,2.241935,0.286565
8,80,0.008739,0.006413,2.486559,0.260003


In [53]:
# Export to csv
errors_df.to_csv('svd_basic_errors_df.csv',index=False)

### 1.2 Cross-Validation

In [63]:
d_stars = []
test_errors = []

for run in range(20):

    # Split the data into 80% training, 20% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    
    # Initialise
    best_error = 500
    best_d = 0

    for d_ in [1,10,30,50,70,100]:
       
        error = 0 
        
        # Implement cross-validation
        kfold = KFold(5, True, 1)

        for train_index, test_index in kfold.split(X_train):
            Xtrain, Xtest = X_train[train_index], X_train[test_index]
            ytrain, ytest = y_train[train_index], y_train[test_index]
            clf = SVC(C = d_, gamma = 0.017)
            clf.fit(Xtrain, ytrain)
            train_error = 100 - clf.score(Xtrain, ytrain)*100 # the function 'score' gives accuracy: 1 - test_error
            test_error = 100 - clf.score(Xtest, ytest)*100 # the function 'score' gives accuracy: 1 - test_error

            error += test_error
        
        if error/5 < best_error:
            best_error = error/5
            best_d = d_
            
    # Once all the polynomial orders considered, retrain on full 80% using d*
    clf = SVC(C = best_d, gamma = 0.01)
    clf.fit(X_train, y_train)
    train_error = 100 - clf.score(X_train, y_train)*100 # the function 'score' gives accuracy: 1 - test_error
    test_error = 100 - clf.score(X_test, y_test)*100 # the function 'score' gives accuracy: 1 - test_error

    print("Run: " + str(run) + " Test Error: " + str(test_error) + " d*:  " + str(best_d))
    test_errors.append(test_error)
    d_stars.append(best_d)

Run: 0 Test Error: 2.0430107526881613 d*:  10
Run: 1 Test Error: 2.8494623655913927 d*:  10
Run: 2 Test Error: 1.6666666666666714 d*:  10
Run: 3 Test Error: 1.6666666666666714 d*:  10
Run: 4 Test Error: 2.0430107526881613 d*:  30
Run: 5 Test Error: 2.6344086021505433 d*:  10
Run: 6 Test Error: 2.204301075268816 d*:  10
Run: 7 Test Error: 2.4731182795699027 d*:  10
Run: 8 Test Error: 2.204301075268816 d*:  10
Run: 9 Test Error: 1.8817204301075208 d*:  10
Run: 10 Test Error: 2.4193548387096797 d*:  10
Run: 11 Test Error: 2.4731182795699027 d*:  10
Run: 12 Test Error: 2.311827956989248 d*:  10
Run: 13 Test Error: 1.8817204301075208 d*:  10
Run: 14 Test Error: 2.0967741935483843 d*:  10
Run: 15 Test Error: 2.204301075268816 d*:  10
Run: 16 Test Error: 1.827956989247312 d*:  10
Run: 17 Test Error: 2.258064516129039 d*:  10
Run: 18 Test Error: 1.827956989247312 d*:  10
Run: 19 Test Error: 2.5806451612903203 d*:  50


In [64]:
print("Mean test error: " + str(np.mean(test_errors)) + " +/- " + str(np.std(test_errors)))
print("Mean d*: " + str(np.mean(d_stars)) + " +/- " + str(np.std(d_stars)))

Mean test error: 2.1774193548387095 +/- 0.32145862212638276
Mean d*: 13.0 +/- 9.539392014169456
