In [4]:
import numpy
from gzip import GzipFile
import struct
import numpy as np
import pickle
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV

# http://yann.lecun.com/exdb/mnist/

<center> <h1> TRAINING SET IMAGE FILE</h1> </center>

|[offset] | [type]       |   [value]   |       [description] |
|---------|--------------|-------------|---------------------|
|0000     |32 bit integer  | 0x00000803(2051)| magic number 
|0004     |32 bit integer  | 60000           | number of images 
|0008     |32 bit integer  |28               |number of rows 
|0012     |32 bit integer  |28               |number of columns 
|0016     |unsigned byte   |??               |pixel 
|0017     |unsigned byte   |??               |pixel 

In [24]:
def load_mnist_features(path):
    """Load a gzip-compressed MNIST IDX3 image file as a samples x features matrix.

    Parameters
    ----------
    path : str
        Path to the ``*-images-idx3-ubyte.gz`` file.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_rows * n_columns)`` and dtype float64,
        with every pixel scaled from ``0..255`` to ``0.0..1.0``.

    Raises
    ------
    AssertionError
        If the magic number in the header is not 2051.
    ValueError
        If the file holds fewer pixel bytes than the header announces.
    """
    # The `with` block guarantees the gzip handle is closed, even on error.
    with GzipFile(path, 'rb') as t:
        # 16 first bytes, MSB first (big-endian), 4 integers.
        magic, n_samples, n_rows, n_columns = struct.unpack(">iiii", t.read(16))
        assert magic == 2051, "not an MNIST image file (magic=%d)" % magic

        print(magic, n_samples, n_rows, n_columns)

        # samples x features matrix
        # samples are the images, features are the pixel values of one image
        n_features = n_rows * n_columns
        expected = n_samples * n_features
        # One bulk read instead of one read per image.
        pixel_data = t.read(expected)

    if len(pixel_data) != expected:
        raise ValueError(
            "truncated MNIST image file: expected %d pixel bytes, got %d"
            % (expected, len(pixel_data))
        )

    m = np.frombuffer(pixel_data, dtype=np.uint8).reshape(n_samples, n_features)
    # np.float64 replaces the removed `np.float` alias (same type as builtin float).
    # astype() copies, so the result is writable and independent of the buffer.
    return m.astype(np.float64) / 255.0

<center> <h1> TRAINING SET LABEL FILE </h1> </center>

|[offset] |[type]        |    [value]      |    [description] 
|---------|--------------|-----------------|---------------------|
|0000     |32 bit integer | 0x00000801(2049)| magic number (MSB first) 
|0004     |32 bit integer  |60000            |number of items 
|0008     |unsigned byte   |??               |label 
|0009     |unsigned byte   |??               |label 

In [25]:
#integers are in big-endian, so we need to use >
def load_mnist_labels(path):
    """Load a gzip-compressed MNIST IDX1 label file as a 1-D label vector.

    Parameters
    ----------
    path : str
        Path to the ``*-labels-idx1-ubyte.gz`` file.

    Returns
    -------
    numpy.ndarray
        Writable array of shape ``(n_samples,)`` and dtype uint8, one label
        per sample.

    Raises
    ------
    AssertionError
        If the magic number in the header is not 2049.
    ValueError
        If the file holds fewer label bytes than the header announces.
    """
    # The `with` block guarantees the gzip handle is closed, even on error.
    with GzipFile(path, 'rb') as t:
        # 8 first bytes: two big-endian 32-bit integers (magic, item count).
        magic, n_samples = struct.unpack(">ii", t.read(8))
        assert magic == 2049, "not an MNIST label file (magic=%d)" % magic

        print(magic, n_samples)

        # each label is an unsigned byte, the number of labels is n_samples
        label_data = t.read(n_samples)

    # Explicit length check: assigning a 1-byte buffer into a preallocated
    # array would silently broadcast that single label to every sample.
    if len(label_data) != n_samples:
        raise ValueError(
            "truncated MNIST label file: expected %d labels, got %d"
            % (n_samples, len(label_data))
        )

    # frombuffer() yields a read-only view; copy() returns an owned, writable array.
    return np.frombuffer(label_data, dtype=np.uint8).copy()

In [5]:
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz

--2019-03-22 19:58:54--  http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9912422 (9.5M) [application/x-gzip]
Saving to: ‘train-images-idx3-ubyte.gz’


2019-03-22 19:59:15 (449 KB/s) - ‘train-images-idx3-ubyte.gz’ saved [9912422/9912422]



In [6]:
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz

--2019-03-22 19:59:23--  http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28881 (28K) [application/x-gzip]
Saving to: ‘train-labels-idx1-ubyte.gz’


2019-03-22 19:59:23 (200 KB/s) - ‘train-labels-idx1-ubyte.gz’ saved [28881/28881]



In [7]:
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz

--2019-03-22 19:59:51--  http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1648877 (1.6M) [application/x-gzip]
Saving to: ‘t10k-images-idx3-ubyte.gz’


2019-03-22 19:59:56 (397 KB/s) - ‘t10k-images-idx3-ubyte.gz’ saved [1648877/1648877]



In [8]:
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz

--2019-03-22 20:00:19--  http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4542 (4.4K) [application/x-gzip]
Saving to: ‘t10k-labels-idx1-ubyte.gz’


2019-03-22 20:00:19 (74.7 MB/s) - ‘t10k-labels-idx1-ubyte.gz’ saved [4542/4542]



In [12]:
# Load the four MNIST archives fetched by the wget cells into
# samples x features matrices and label vectors.
train_features = load_mnist_features("train-images-idx3-ubyte.gz")
train_labels   = load_mnist_labels  ("train-labels-idx1-ubyte.gz")
test_features  = load_mnist_features("t10k-images-idx3-ubyte.gz")
test_labels    = load_mnist_labels  ("t10k-labels-idx1-ubyte.gz")

# DO_TRAIN=True refits the baseline SVC and overwrites svm_model_basic.pkl;
# DO_TRAIN=False reuses the model pickled by an earlier run.
DO_TRAIN=False

# DO_RUN_BASIC toggles the accuracy report for the baseline model.
DO_RUN_BASIC=True

if DO_TRAIN:
    clf = svm.SVC(kernel='rbf')
    clf.fit(train_features, train_labels)
    with open("svm_model_basic.pkl", 'wb') as file:
        pickle.dump(clf, file)
else:
    # NOTE: unpickling executes arbitrary code; only load model files that
    # this notebook produced itself.
    # The `with` block closes the handle that a bare open() would leak.
    with open("svm_model_basic.pkl", 'rb') as file:
        clf = pickle.load(file)

if DO_RUN_BASIC:
    print("====== Basic SVC ======")
    y_train = clf.predict(train_features)
    print("Training Accuracy:", metrics.accuracy_score(train_labels, y_train))

    y_test = clf.predict(test_features)
    print("Test Accuracy:", metrics.accuracy_score(test_labels, y_test))

# using grid-search to find best meta-parameters

2051 60000 28 28
2049 60000
2051 10000 28 28
2049 10000
Training Accuracy: 0.943
Test Accuracy: 0.9446


### Parameter Selection
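* As we increase C, margin violations are penalized more heavily, so the SVM settles for a narrower margin that fits the training data more closely; a smaller C gives a wider, more tolerant margin.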

* A "narrow cushion translates into little / no mistakes" and a "wide cushion, quite a few mistakes"

* A large gamma can cause the model to overfit and be prone to low bias/high variance. 

* A small gamma means the model is less prone to overfitting, but you may risk not learning a decision boundary that captures your data.

* If gamma='scale' is passed then it uses 1 / (n_features * X.var()) as value of gamma

In [16]:
def svc_param_selection(X, y, nfolds):
    """Cross-validated grid search over C for an RBF-kernel SVC.

    Tries C in {10, 100, 1000} with ``gamma='scale'``, using ``nfolds``
    cross-validation folds and 4 parallel jobs, prints the best parameter
    combination and returns the fitted ``GridSearchCV`` object.
    """
    candidate_grid = {'C': [10, 100, 1000]}
    base_estimator = svm.SVC(kernel='rbf', gamma='scale')
    searcher = GridSearchCV(
        base_estimator,
        candidate_grid,
        cv=nfolds,
        n_jobs=4,
        verbose=True,
    )
    searcher.fit(X, y)
    print(searcher.best_params_)
    return searcher

In [17]:
# Run the 2-fold grid search on the full training set. Note that `best_clf`
# holds the whole fitted search object returned by svc_param_selection,
# and that is what gets pickled for later cells.
best_clf = svc_param_selection(train_features, train_labels, 2)
with open("svm_model_best.pkl", 'wb') as model_out:
    pickle.dump(best_clf, model_out)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 20.3min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 20.3min finished


{'C': 10}


In [18]:
# Reload the grid-search result saved by the previous cell.
# NOTE: unpickling executes arbitrary code; only load model files that this
# notebook produced itself.
# The `with` block closes the handle that a bare open() would leak.
with open("svm_model_best.pkl", 'rb') as model_in:
    best_clf = pickle.load(model_in)

print("====== Grid-Search SVC ======")
# Test accuracy is reported first, then training accuracy.
y_test = best_clf.predict(test_features)
print("Test Accuracy:", metrics.accuracy_score(test_labels, y_test))

y_train = best_clf.predict(train_features)
print("Training Accuracy:", metrics.accuracy_score(train_labels, y_train))

Test Accuracy: 0.9837
Training Accuracy: 0.9999
