# Softmax Classifier 


## Source Data Input

In [91]:
import h5py
import numpy as np
import scipy as sp
import time
import pandas as pd


with h5py.File('./data/train/images_training.h5','r') as H:
    data_train = np.copy(H['datatrain'])
with h5py.File('./data/train/labels_training.h5','r') as H:
    label_train = np.copy(H['labeltrain'])

# using H['datatest'], H['labeltest'] for test dataset.
print(data_train.shape,label_train.shape)

(30000, 784) (30000,)


## Pre-Processing

First we seperate 5000 records from the train data set to use as a validation set. We verify that the last 5000 records of the training set are a representative sample of the 10 classes in Fashion MNIST

In [92]:
unique, counts = np.unique(label_train[25000:,], return_counts=True)
print(dict(zip(unique, counts))) #display counts of classes in candidate validation set

#separate training and validation set 
data_val = data_train[25000:,]
data_partial_train = data_train[:25000,]

{0: 507, 1: 478, 2: 523, 3: 511, 4: 467, 5: 508, 6: 499, 7: 514, 8: 490, 9: 503}


Next we perform the same train/val split on the labels training set. We then one hot encode the labels vector, changing its shape from size (samples,) to (samples, classes). 

In [93]:
#one hot encode y for softmax output 
def oneHot(y):
    zeroesY = np.zeros((y.size, y.max() + 1)) #generate matrix of zeroes shape (samples, classes)
    zeroesY[np.arange(y.size), y] = 1 #insert value 1 at label y's scalar class value
    return zeroesY

label_val = oneHot(label_train[25000:,]) 
label_partial_train = oneHot(label_train[: 25000,])

print(data_partial_train.shape)

(25000, 784)


We then reshape the train and val data tensors of shape (samples,28,28) to matrices of shape (samples, 784)

In [94]:
print(data_partial_train.shape)

data_partial_train = data_partial_train.reshape(data_partial_train.shape[0], 28 * 28)
data_val = data_val.reshape(data_val.shape[0], 28 * 28)

print(data_partial_train.shape, data_val.shape)


(25000, 784)
(25000, 784) (5000, 784)


Following this we define the SVD fitting function. This takes in two parameters (data matrix, number of components), performs SVD decomposition and returns the right singular value V of shape (m,k) where m is the dimension of features and k is the choice of number of leading components from the SVD decomposition. When k < m dimension reduction has been performed. 
We perform this fitting procedure on the training set only (taking 150 components) and use the same V for the validation and test set.

In [95]:
def svd_fit(A, comps):   
    U, s, Vt = np.linalg.svd(A, full_matrices=False)
    #the matrix XV = UD therefore only need right singular value for decomposition
    V_tilde = Vt.T[:,0:comps] #create parameter to pick number of leading components to take from V
    return V_tilde

v = svd_fit(flat_data_partial_train, 150)

print(v.shape)

#take dot product of train and val set with V to perform dimension reduction
dim_partial_train = data_partial_train.dot(v)
dim_data_val = data_val.dot(v)

print( dim_partial_train.shape, dim_data_val.shape)

(784, 150)
(25000, 150) (5000, 150)


## Classifier 

We then move onto the building blocks of the softmax classifer. <br>
<br>
First we define the softmax function. it takes in the matrix X.W which has shape (samples,classes) and produces a matrix of shape (samples, classes) that sums to one per sample. 

In [96]:
def softmax(X): 
    eX = np.exp(X)
    A = eX / eX.sum(axis = 1, keepdims = True) #here we sum along classes per sample and broadcast this as the denominator
    return A

Next we define the loss function taking in the paramaters (data matrix X, one hot encoded labels y, weight matrix W, regulariser penalty l): 

$$
L(\boldsymbol{w}, l)=-\sum_{n} \sum_{k} y_{n k} \log \left(\sigma_{\boldsymbol{w}}(\boldsymbol{x})\right)+\frac{l}{2}\|\boldsymbol{w}\|_{2}^{2}
$$

Where sigma denotes the softmax transformation. The sum over all samples and all classes is captured in np.sum() over the matrix with shape (samples,classes)

In [97]:
#define softmax loss 
def softmax_loss(X, Y, W, l):
    A = softmax(X.dot(W))
    n = len(X)
    snorm = np.linalg.norm(W)**2
    return (-1/n) * np.sum(Y * np.log(A)) + (l/2)*snorm #np.sum over the matrix captures summing over both classes and samples

We define the direct gradient of the above loss function by creating a function that takes in the same paramaters:
$$
\frac{1}{n} \sum_{n}\left(\sigma_{w}\left(\boldsymbol{x}_{n}\right)-y_{n}\right) \boldsymbol{x}^{T}+l \boldsymbol{w}
$$

The output takes the shape (features, classes), preparing to be minused from the identical shaped weight vector W.
We slightly change the above formulation to compute the gradient over the whole data matrix with shape (samples, features). This requires a dot product with the matrix (A-Y) that has shape (samples, classes). 

In [98]:
#define softmax gradient 
def softmax_grad(X, Y, W, l):
    A = softmax(X.dot(W))  
    n = len(X) 
    return  (np.dot(X.T,(A - Y))/n) + l*W

With the softmax loss and gradients defined we can now define the fitting procedure. Here we employ batch gradient descent. This procedure involves choosing a randomly sampled without replacement batch size iteratively to perform gradient operations on. This value times a learning rate alpha is then minused from a weight vector. This procedure is then repeated over the dataset for a certain number of epochs. 

This function takes in a data matrix, the one hot encoded labels, a vector to optimise, a ridge penalty l, a number of epochs to optmise over and a number of batches per epoch. 

In [99]:

def softmax_fit(X, Y, W, l, alpha, epoch,  batch):
    n = len(X) #sample size
    loss_hist = [softmax_loss(X, Y, W, l)] #generate intital loss history
    steps = int(np.ceil(n/batch)) #define the number of steps per epoch, determined by how many batches fit in the sample
    for ep in range(epoch): 
        p_ids = np.random.permutation(n) #generate a set of shuffled ids the size of the sample
        shuffle_X = X[p_ids] #apply this shuffled id to data matrix
        shuffle_Y = Y[p_ids] #apply this shuffled id to label matrix
        for i in range(steps): 
            # get the i-th batch
            X_batch = shuffle_X[i:i + batch, :] #subset data matrix to size batch starting at step point i
            Y_batch = shuffle_Y[i:i + batch] #perform same operation to label matrix
            W -=  alpha * softmax_grad(X_batch, Y_batch, W, l) #minus gradient for current w times learning rate from next w
        loss_hist.append(softmax_loss(X, Y, W, l)) #persist loss over runs
    return W, loss_hist

Having defined the softmax fitting algorithm we now generate initial values for the weight vector with shape (features, classes) and run the fitting algorithm on our chosen hyperparameters

In [100]:
W_rand = np.random.randn(dim_partial_train.shape[1], len(label_partial_train[1]))
W, loss_hist = softmax_fit(dim_partial_train, 
                                      label_partial_train , 
                                      W_rand, 
                                      epoch = 1500, 
                                      batch = 1000,
                                      alpha = 0.1, 
                                      l = 0)

In [107]:
# predict fuction
def pred(W, X):
    A = softmax(X.dot(W))
    return np.argmax(A, axis = 1)

#accuracy
def accuracy(y_pred,y):
    results = pd.DataFrame({'label': y, 'answer': y == y_pred})
    return results['answer'].sum()/len(results)


In [108]:
y_pre = pred(W,dim_data_val)
print("Accurancy of model on test set:",accuracy(y_pre,label_train[25000:,]))

Accurancy of model on test set: 0.841


## Generate Test Result

In [112]:
with h5py.File('./data/test/images_testing.h5','r') as H:
    data_test = np.copy(H['datatest'])
with h5py.File('./data/test/labels_testing_2000.h5','r') as H:
    label_test = np.copy(H['labeltest'])

In [113]:
print(data_test.shape)

#subset to labelled set
data_test_label = data_test[:2000,:]

data_test_label = data_test_label.reshape(data_test_label.shape[0], 28 * 28)

dim_data_test = data_test.dot(v)


(10000, 784)


In [None]:
pred(W, dim_data_test )