In [4]:
#1 Python Configuration and Data Loading
import sys

if sys.version_info[0] < 3:
    raise Exception("Python 3 not detected.")
    
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from scipy import io
# import pandas as pd

for data_name in ["mnist", "spam", "cifar10"]:
    data = io.loadmat("data/%s_data.mat" % data_name)
    print("\nloaded %s data!" % data_name)
    fields = "test_data", "training_data", "training_labels"
    for field in fields:
        print(field, data[field].shape)


loaded mnist data!
test_data (10000, 784)
training_data (60000, 784)
training_labels (60000, 1)

loaded spam data!
test_data (5857, 32)
training_data (5172, 32)
training_labels (5172, 1)

loaded cifar10 data!
test_data (10000, 3072)
training_data (50000, 3072)
training_labels (50000, 1)


In [5]:
#2 Data Partitioning
np.random.seed(1)

def shuffle_train_val_split(name, val_amt=0, percent=0):
    data = io.loadmat("data/%s_data.mat" % name)
    total_num_ex = data["training_data"].shape[0]
    shuffle = np.random.permutation(total_num_ex) 
    #shuffle before split
    data_xtrain, data_ytrain = data["training_data"][shuffle], data["training_labels"][shuffle]
    #split
    if not val_amt:
        #spam
        val_amt = int(percent * total_num_ex)
    data_xval, data_yval = data_xtrain[:val_amt,:], data_ytrain[:val_amt,:]
    data_xtrain, data_ytrain = data_xtrain[val_amt:,:], data_ytrain[val_amt:,:]
    return data_xtrain, data_ytrain, data_xval, data_yval

In [6]:
#2 Data Partitioning
mnist_xtrain, mnist_ytrain, mnist_xval, mnist_yval = shuffle_train_val_split("mnist", 10000)
spam_xtrain, spam_ytrain, spam_xval, spam_yval = shuffle_train_val_split("spam", percent=0.2)
cifar10_xtrain, cifar10_ytrain, cifar10_xval, cifar10_yval = shuffle_train_val_split("cifar10", 5000)

In [8]:
#3 Support Vector Machines: Coding

#preprocessing
mnist_xtrain, mnist_xval = (mnist_xtrain-np.mean(mnist_xtrain))/np.std(mnist_xtrain), (mnist_xval-np.mean(mnist_xval))/np.std(mnist_xval)
#remember to normalize test set as well

def train(data_xtrain, data_ytrain, data_xval, data_yval, num_ex, kernel):
    data_ytrain = data_ytrain.reshape(-1,)
    data_yval = data_yval.reshape(-1,)
    model = svm.SVC(gamma=0.05, kernel=kernel, cache_size=2000)
    model.fit(data_xtrain[:num_ex],data_ytrain[:num_ex])
    return 1-accuracy_score(data_ytrain, model.predict(data_xtrain)), 1-accuracy_score(data_yval,model.predict(data_xval))


In [None]:
#3a, MNIST
num_exs = [100, 200, 500, 1000, 2000, 5000, 10000]
mnist_train_err = []
mnist_val_err = []
for num_ex in num_exs:
    train_err, val_err = train(mnist_xtrain, mnist_ytrain, mnist_xval, mnist_yval, num_ex, "linear")
    print(train_err,val_err,num_ex)
    mnist_train_err.append(train_err)
    mnist_val_err.append(mnist_val_err)
    
plt.plot(mnist_train_err, mnist_val_err, 'ro')
plt.axis(num_exs)
plt.show()

In [None]:
#3b, spam
num_exs = [100, 200, 500, 1000, 2000, spam_xtrain.shape[0]]
spam_train_err = []
spam_val_err = []
for num_ex in num_exs:
    train_err, val_err = train(spam_xtrain, spam_ytrain, spam_xval, spam_yval, num_ex, "linear")
    print(train_err,val_err,num_ex)
    spam_train_err.append(train_err)
    spam_val_err.append(spam_val_err)
    
plt.plot(spam_train_err, spam_val_err, 'ro')
plt.axis(num_exs)
plt.show()

0.221604639922668 0.21179883945841393 100
0.20807153214113094 0.20889748549323017 200
0.20082165297245047 0.18471953578336553 500
0.20372160463992262 0.17988394584139267 1000
0.20372160463992262 0.17988394584139267 2000


In [None]:
#3c, cifar10
num_exs = [100, 200, 500, 1000, 2000, 5000]
cifar10_train_err = []
cifar10_val_err = []
for num_ex in num_exs:
    train_err, val_err = train(cifar10_xtrain, cifar10_ytrain, cifar10_xval, cifar10_yval, num_ex, "linear")
    print(train_err,val_err,num_ex)
    cifar10_train_err.append(cifar10_err)
    cifar10_val_err.append(cifar10_val_err)
    
plt.plot(cifar10_train_err, cifar10_val_err, 'ro')
plt.axis(num_exs)
plt.show()

In [None]:
#4 Hyperparameter Tuning
def train_hyper(data_xtrain, data_ytrain, data_xval, data_yval, num_ex, kernel, C):
    data_ytrain = data_ytrain.reshape(-1,)
    data_yval = data_yval.reshape(-1,)
    model = svm.SVC(gamma=0.001, kernel=kernel, C=C)
    model.fit(data_xtrain[:num_ex],data_ytrain[:num_ex])
    return 1-accuracy_score(data_yval,model.predict(data_xval))

num_ex = 10000
C = [0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1]
mnist_val_err = []
for c in C:
    val_err = train_hyper(mnist_xtrain, mnist_ytrain, mnist_xval, mnist_yval, num_ex, "linear", c)
    print(val_err,c)
    mnist_val_err.append(mnist_val_err)
print("best C value is ", C[mnist_val_err.index(min(mnist_val_err))])
plt.plot(mnist_val_err, 'ro')
plt.axis(C)
plt.show()