## Building the classifier

Import the packages and load all of the training and testing data into separate NumPy matrices.<br/>
Count the number of distinct labels and store that number in the variable C.

In [None]:
import h5py
import time
import numpy as np
import matplotlib.pyplot as plt

# Read each HDF5 dataset fully into memory as an independent NumPy array,
# so the data stays usable after the file handles are closed.
with h5py.File('./data/train/images_training.h5', 'r') as H:
    data_train = np.array(H['datatrain'])
with h5py.File('./data/train/labels_training.h5', 'r') as H:
    label_train = np.array(H['labeltrain'])
with h5py.File('./data/test/images_testing.h5', 'r') as T:
    data_test = np.array(T['datatest'])
with h5py.File('./data/test/labels_testing_2000.h5', 'r') as T:
    label_test = np.array(T['labeltest'])

# Distinct labels present in the training set, and how many there are.
labels = np.unique(label_train)
C = labels.size

Preprocess the input data by prepending a column of ones to the input matrix X before running the fit method.

In [None]:
# Preprocess the input data.
def preprocess(X):
    """Return a copy of X with a column of ones prepended.

    The extra leading column lets the first row of the weight matrix act
    as the bias (intercept) term.
    """
    bias_column = np.ones((X.shape[0], 1))
    return np.column_stack((bias_column, X))

Softmax function.
$$Softmax = \sigma(z)_j = \frac{e^{z_j}}{\displaystyle\sum^{C}_{k=1}{e^{z_k}}}, \quad z = {W^T}x$$

In [None]:
# SoftMax function.
def softmax(wx):
    """Return the row-wise softmax of a 2-D score matrix.

    Parameters
    ----------
    wx : array-like, 2-D
        Raw class scores, one row per sample and one column per class.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    wx = np.asarray(wx, dtype=float)
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (inf / inf = NaN)
    # for large scores, and from underflowing every entry to 0 (0 / 0 = NaN)
    # for very negative ones.
    shifted = wx - wx.max(axis=1, keepdims=True)
    e_wx = np.exp(shifted)
    return e_wx / e_wx.sum(axis=1, keepdims=True)

Calculate the Cross-Entropy loss with regularization parameter.
$$CrossEntropy = - \frac{1}{N}\displaystyle \sum^{N}_{i=1}\sum^{C}_{c=1}{y_{ic}\log\big(\sigma({W^T}x_i)_c\big)} + \frac{\lambda}{2N}\displaystyle\sum_{j \geq 1}\sum^{C}_{c=1}{W_{jc}^2}$$

In [None]:
# Cross-Entropy loss function with regularization term.
def cross_entropy(X, y, W, lmd=0.01):
    """Mean softmax cross-entropy loss plus an L2 penalty on the weights.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Input rows; ``X.dot(W)`` must yield one score per class per row.
    y : array-like of int
        Class index of every row of X (used to index the score columns).
    W : numpy.ndarray, 2-D
        Weight matrix. Its first row is treated as the bias and is left
        out of the penalty.
    lmd : float
        Regularization strength (lambda).

    Returns
    -------
    float
        ``-(1/N) * sum_i log p(y_i | x_i) + lmd / (2N) * sum(W[1:] ** 2)``.
    """
    scores = X.dot(W)  # N x C
    N = scores.shape[0]
    targets = np.asarray(y).ravel()

    # Log-softmax through the log-sum-exp trick: taking log(softmax(...))
    # directly yields -inf / NaN once a probability under- or overflows.
    shifted = scores - scores.max(axis=1, keepdims=True)
    log_prob = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

    # With one-hot targets only the true-class log-probability survives the
    # sum, so pick that entry instead of building a dense N x C matrix.
    data_loss = -log_prob[np.arange(N), targets].sum() / N

    # Regularization term. The parentheses matter: `lmd / 2*N` would
    # evaluate to lmd * N / 2 rather than lmd / (2N).
    R = (lmd / (2 * N)) * np.sum(np.square(W[1:]))
    return data_loss + R

Calculate the gradient with regularization parameter.
$$gradient = \bigtriangledown = \frac{1}{N} {X^T}\big( \sigma(XW)-Y \big) + \frac{\lambda}{N}\tilde{W}$$
where $\tilde{W}$ is $W$ with its first (bias) row set to zero.

In [None]:
# Calculate gradient of loss function with regularization term.
def gradient(X, y, W, lmd=0.01):
    """Gradient of the regularized softmax cross-entropy loss w.r.t. W.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Input rows (N x K); ``X.dot(W)`` must be a valid product.
    y : array-like of int
        Class index of every row of X.
    W : numpy.ndarray, 2-D
        Weight matrix (K x C). Its first row is treated as the bias and
        receives no regularization gradient.
    lmd : float
        Regularization strength (lambda).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as W.

    Raises
    ------
    ValueError
        If X and W cannot be multiplied. Previously this was printed and
        the function returned None, which only surfaced later as an
        unrelated TypeError in the caller.
    """
    # Calculate the conditional probability using softmax function.
    try:
        P = softmax(X.dot(W))  # N x C
    except ValueError as e:
        raise ValueError('Please check the input data: {}'.format(e)) from e

    N = P.shape[0]
    targets = np.asarray(y).ravel()

    # One-hot encode the labels. The width comes from P itself, so the
    # function no longer depends on a module-level class count.
    Y = np.zeros_like(P)  # N x C
    Y[np.arange(N), targets] = 1

    grad = X.T.dot(P - Y) / N  # K x C, same shape as W.

    # Regularization term; the bias row (row 0) is not penalized.
    R = (lmd / N) * W
    R[0] = 0
    return grad + R

Use mini-batch gradient descent to update the weight parameters.
$$W_{i+1} = W_i - \eta\bigtriangledown $$

In [None]:
# Mini-Batch Gradient Descent method.
def mini_batch_GD(X, y, W, lr=0.005, iterlimit=100, batch_size=10, lmd=0.01, tol=1e-5):
    """Fit the weight matrix with mini-batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Training inputs, indexed by row.
    y : numpy.ndarray
        Training labels, indexed in step with the rows of X.
    W : numpy.ndarray
        Initial weights. The array passed in is NOT modified; training
        runs on a private copy.
    lr : float
        Learning rate.
    iterlimit : int
        Maximum number of passes (epochs) over the data.
    batch_size : int
        Number of rows per mini-batch; the last batch may be smaller.
    lmd : float
        Regularization strength forwarded to the loss and gradient.
    tol : float
        Convergence threshold, compared against the square root of the
        norm of the weight change over one epoch.

    Returns
    -------
    tuple
        ``(W, loss_hist)``: the trained weights and the list of full-data
        losses, one entry before training plus one per completed epoch.
    """
    # Work on a copy so the caller's initial weights survive the in-place
    # `W -= ...` updates below.
    W = np.array(W, dtype=float)
    W_old = W.copy()
    itercount = 0
    N = X.shape[0]

    # Record loss history.
    loss_hist = [cross_entropy(X, y, W, lmd)]

    # One iteration of this loop is one epoch over the shuffled data.
    while itercount < iterlimit:
        itercount += 1
        mix_ids = np.random.permutation(N)  # Fresh row order every epoch.

        # Walk the shuffled indices in consecutive chunks of batch_size.
        for start in range(0, N, batch_size):
            batch_ids = mix_ids[start:start + batch_size]
            W -= lr * gradient(X[batch_ids], y[batch_ids], W, lmd)

        # Record the loss of every iteration.
        loss_hist.append(cross_entropy(X, y, W, lmd))

        # Jump out of loop when weight parameters converge.
        delta = np.linalg.norm(W - W_old)
        if np.sqrt(delta) < tol:
            print('Converged.\n')
            break
        W_old = W.copy()
    return W, loss_hist

Predict the labels and calculate the accuracy.

In [None]:
# Predict the labels.
def predict(W, X):
    """Return the index of the highest-scoring class for every row of X.

    Softmax is strictly increasing in each score, so the arg-max of the raw
    scores ``X.dot(W)`` is the arg-max of the softmax probabilities. Using
    the raw scores avoids exponentiating them, which for large values
    overflowed to NaN and made ``argmax`` return a wrong class.
    """
    scores = X.dot(W)
    return np.argmax(scores, axis=1)

# Calculate total accuracy.
def accuracy(y_pred, y_ture):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_ture : array-like
        True labels (parameter name kept as-is for existing callers).
        Both inputs are flattened, so a column vector of labels is
        compared element by element instead of being broadcast.

    Returns
    -------
    numpy.float64
        Share of positions where the two inputs agree, between 0 and 1.

    Raises
    ------
    ValueError
        If the inputs are empty or hold a different number of labels.
        Such input used to end in a printed message and a ``None`` result.
    """
    pred = np.asarray(y_pred).ravel()
    true = np.asarray(y_ture).ravel()
    if pred.size != true.size:
        raise ValueError(
            'y_pred has {} labels but y_ture has {}.'.format(pred.size, true.size)
        )
    if pred.size == 0:
        raise ValueError('Cannot compute accuracy of empty inputs.')
    return np.mean(pred == true)

## Tuning the hyperparameter

Combine the fit and predict steps into a single module.

In [None]:
def run(para):
    """Train with regularization strength `para` and evaluate on the test set.

    Reads the module-level `data_train`, `label_train`, `data_test`,
    `label_test` and `C`. Prints the test accuracy and the training time.

    Returns
    -------
    tuple
        ``(loss_avg, acc)``: the mean of the last five recorded training
        losses and the accuracy on the labelled test rows.
    """
    # Labels are used for at most the first 2000 test rows, so predict only
    # for the rows that have a label; otherwise predictions and labels
    # differ in length and cannot be compared.
    y_true = label_test[:2000]
    data_train_ = preprocess(data_train)
    data_test_ = preprocess(data_test[:len(y_true)])

    # Record the training time.
    time_start = time.time()
    # One weight column per class found in the training labels.
    W_init = np.random.randn(data_train_.shape[1], C)
    W, loss_hist = mini_batch_GD(data_train_, label_train, W_init, batch_size=20, iterlimit=800, lr=0.005, lmd=para)
    time_end = time.time()

    label_predict = predict(W, data_test_)
    acc = accuracy(label_predict, y_true)
    print("Accuracy of model on test set: {:.2%}".format(acc))
    print("Time: {:.3f} s.".format(time_end - time_start))

    # Average the LAST FIVE losses; `loss_hist[-5]` picked a single entry.
    loss_avg = np.mean(loss_hist[-5:])

    return loss_avg, acc

Get the line chart of accuracy and loss

In [None]:
def tune():
    """Sweep the regularization strength and plot loss and accuracy.

    Calls `run` once per candidate lambda, then shows two line charts:
    lambda against the averaged training loss, and lambda against the test
    accuracy.
    """
    paralist = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]
    losslist = []
    acclist = []

    for para in paralist:
        l, a = run(para)
        losslist.append(l)
        acclist.append(a)

    # Draw one line chart per metric. The swept value is passed to `run` as
    # the regularization strength, so the x axis is lambda, not the
    # learning rate.
    for values, ylabel in ((losslist, 'loss'), (acclist, 'accuracy')):
        plt.plot(paralist, values)
        plt.xlabel('lambda', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.tick_params(axis='both', which='major', labelsize=12)
        plt.show()

## Run classifier on test data.

Launch the code and write predicted labels to a file.

In [None]:
# Preprocess the input data.
data_train_ = preprocess(data_train)
data_test_ = preprocess(data_test[:5000])

# Record the training time.
time_start = time.time()
# One weight column per class found in the training labels.
W_init = np.random.randn(data_train_.shape[1], C)
W, _losshistory = mini_batch_GD(data_train_, label_train, W_init, batch_size=20, iterlimit=800, lr=0.005, lmd=0.01)
time_end = time.time()

# Make predictions.
label_predict = predict(W, data_test_)

# Write prediction into file. The context manager closes the file, so the
# dataset is flushed to disk and the handle is not leaked.
with h5py.File('./Output/predicted_labels.h5', 'w') as h5file:
    h5file.create_dataset('output', data=label_predict)

## Code performance.
Predicting the 5000 rows of test_data.
- Runtime: 302s

Accuracy on the first 2000 rows of test_data.
- Accuracy: 85%