In [1]:
import numpy as np
import theano
import theano.tensor as T
from theano import function
import cPickle as pickle
import operator
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# TODO 
1. Changed the categorical cross entropy to binary cross entropy
2. Save the best model

In [2]:
class layer_rnn:
    def __init__(self, n_steps, input_dim, output_dim, hidden_dim, preload_model=None):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_steps = np.float32(n_steps)
        self.preload_model = preload_model
        self.test_acc = None
        self.conf_matrix = None
        np.random.seed(12345)
        if preload_model is None:
            #Wxh
            U = np.random.uniform(-np.sqrt(1./input_dim), np.sqrt(1./input_dim), (input_dim, hidden_dim))
            #Why
            V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, output_dim))
            #Whh
            W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        else:
            U,V,W = self.load_model()
        self.U = theano.shared(name="U", value=U.astype("f"))
        self.V = theano.shared(name="V", value=V.astype("f"))
        self.W = theano.shared(name="W", value=W.astype("f"))
        
        self.define_network()
    
    def step(self, x_t, s_t_prev, act=T.nnet.softmax):
        s_t = T.tanh(T.dot(x_t, self.U) + T.dot(s_t_prev, self.W))
        o_t = act(T.dot(s_t, self.V))
        return o_t, s_t
    
    def define_network(self):
        U, V, W = self.U, self.V, self.W
        x = T.ftensor3("input")
        onehot_y = T.fmatrix("onehot_labels")
        step_idx = 0
        
        sum_states = T.zeros((x.shape[0], self.output_dim), dtype="f")
        states = T.zeros((x.shape[0], self.hidden_dim), dtype="f")
        for step_idx in range(0, self.n_steps):
            o_t, states = self.step(x[:, step_idx, :], states)
            sum_states += o_t
        mean_states = sum_states/self.n_steps
        
        prediction = T.argmax(mean_states, axis=1)
        accuracy = T.mean(T.eq(T.argmax(mean_states, axis=1), T.argmax(onehot_y, axis=1)))
        to_label = T.argmax(onehot_y, axis=1)
        cost = T.mean(T.nnet.categorical_crossentropy(mean_states, onehot_y))
        
        self.get_accuracy = function([x, onehot_y, states], accuracy, on_unused_input='warn')
        self.get_cost = function([x, onehot_y, states], cost, on_unused_input="warn")
        self.get_prediction = function([x], prediction)
        self.from_onehot_to_label = function([onehot_y], to_label)
        
        #Gradients
        dU = T.grad(cost, U)
        dV = T.grad(cost, V)
        dW = T.grad(cost, W)
        
        learning_rate = T.scalar("learning_rate", dtype="float32")
        self.sgd_step = theano.function([x, onehot_y, learning_rate],[cost], updates=[
                (U, U - learning_rate*dU),
                (V, V - learning_rate*dV),
                (W, W - learning_rate*dW)])
        self.states = states
        self.sum_states = sum_states
    
    def train(self, X, Y, mini_batch, learning_rate, num_epochs, 
              evaluation_log="train.log", dump_model_name=None,Xv=[],Yv=[],Xt=[],Yt=[]):
        n_train = X.shape[0]
        n_batch = np.int(n_train/mini_batch)
        tr_acc = []
        val_acc = []
        test_acc = None
        tr_loss = []
        for epoch_idx in range(num_epochs):
            perm = np.random.permutation(len(X))
            X = X[perm]
            Y = Y[perm]
            for batch_idx in range(n_batch):
                start = batch_idx * mini_batch
                end = min(start + mini_batch, n_train)
                self.sgd_step(X[start:end], Y[start:end], learning_rate)
            tr_loss.append(self.calc_loss(X, Y).item(0))
            tr_acc.append(self.calc_accuracy(X, Y).item(0))
            print("Epoch:{}\tLoss:{}".format(epoch_idx, tr_loss[-1]))
            if len(Xv) > 0:
                val_acc.append(self.calc_accuracy(Xv, Yv).item(0))
                print('Train_Acc:{}\tVal_acc:{})'.format(tr_acc[-1], val_acc[-1]))
        
        if len(Xt) > 0:
            self.test_acc = self.calc_accuracy(Xt, Yt).item(0)
            print("Test accuracy:{}".format(self.test_acc))
            pred_test_y = self.get_prediction(Xt)
            self.conf_matrix = confusion_matrix(self.from_onehot_to_label(Yt), pred_test_y)
        
        self.tr_acc = tr_acc
        self.val_acc = val_acc
        self.dump_model(dump_model_name)
        self.dump_training_log(evaluation_log)
        
        print("Finished Training. Model dumped in {}".format(dump_model_name))
        print("Training log dumped in {}".format(evaluation_log))
    
    def calc_accuracy(self, X, Y):
        return self.get_accuracy(X, Y, np.zeros((len(X), self.hidden_dim), dtype="f"))
    
    def calc_loss(self, X, Y):
        return self.get_cost(X, Y, np.zeros((len(X), self.hidden_dim), dtype="f"))
    
    def dump_training_log(self, filename):
        with open(filename, 'wb') as outfile:
            if len(self.tr_acc) > 0:
                pickle.dump(self.tr_acc, outfile, protocol=pickle.HIGHEST_PROTOCOL)
            if len(self.val_acc) > 0:
                pickle.dump(self.val_acc, outfile, protocol=pickle.HIGHEST_PROTOCOL)
            if self.test_acc is not None:
                pickle.dump(self.test_acc, outfile, protocol=pickle.HIGHEST_PROTOCOL)
            if self.conf_matrix is not None:
                pickle.dump(self.conf_matrix, outfile, protocol=pickle.HIGHEST_PROTOCOL)

    def dump_model(self, filename):
        with open(filename, 'wb') as outfile:
            pickle.dump(self.U.get_value(), outfile, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.V.get_value(), outfile, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.W.get_value(), outfile, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self):
        with open(self.preload_model, 'rb') as infile:
            U = pickle.load(infile)
            assert(U.shape[0] == self.input_dim and U.shape[1] == self.hidden_dim)
            V = pickle.load(infile)
            assert(V.shape[0] == self.hidden_dim and V.shape[1] == self.output_dim)
            W = pickle.load(infile)
            assert(W.shape[0] == self.hidden_dim and W.shape[1] == self.hidden_dim)
            return U, V, W
    
    def viz_U(self, row_indices):
        U = self.U.get_value()
        n_rows = len(row_indices)
        fig = plt.figure()
        for i, row in enumerate(row_indices, 1):
            a = fig.add_subplot(n_rows, 1, i)
            a.set_title('row %d in U' % row)
            a.set_yticks([])
            U_row = U[:, row].reshape(1, U.shape[0])
            U_row = (U_row - np.min(U_row)) / (np.max(U_row) - np.min(U_row))
            plt.imshow(U_row, cmap='gray', interpolation='nearest')
        plt.tight_layout()
        plt.savefig('viz_U.png')
        plt.show()

In [None]:

from data_handler import load_data, onehot
import numpy as np
import theano
from util import load_training_log, plot_confusion_matrix

theano.config.floatX = 'float32'
theano.config.exception_verbosity = 'high'

params = []

# Read MNIST training set, validation set, and test set
(X, Y), (Xv, Yv), (Xt, Yt) = load_data('mnist.pkl.gz')
Y = onehot(Y)
Yv = onehot(Yv)
Yt = onehot(Yt)

input_dim = X.shape[2]
output_dim = Y.shape[1]
hidden_dim = 200
mini_batch = 100
num_epochs = 100
lr = np.float32(0.01)
n_steps = X.shape[1]

# define theano network
rnn = layer_rnn(n_steps=n_steps,
                input_dim=X.shape[2], output_dim=Y.shape[1],
                hidden_dim=hidden_dim)

rnn.train(X, Y, mini_batch=mini_batch, learning_rate=lr, num_epochs=num_epochs,
            dump_model_name='rnn.model', Xv=Xv, Yv=Yv, Xt=Xt, Yt=Yt)

In [None]:
print(X.shape, Y.shape)

In [3]:
X = np.load("../../train_input_128_int32.npy")
Y = np.load("../../train_target_128_int32.npy")
X = X.astype(np.float32)
Y = Y.astype(np.float32)
print(X.shape, Y.shape)

((8704, 128, 12), (8704, 24))


In [4]:
theano.config.floatX = 'float32'
theano.config.exception_verbosity = 'high'

params = []

input_dim = X.shape[2]
output_dim = Y.shape[1]
hidden_dim = 200
mini_batch = 128
num_epochs = 100
lr = np.float32(0.01)
n_steps = X.shape[1]

# define theano network
rnn = layer_rnn(n_steps=n_steps,
                input_dim=X.shape[2], output_dim=Y.shape[1],
                hidden_dim=hidden_dim)

rnn.train(X, Y, mini_batch=mini_batch, learning_rate=lr, num_epochs=num_epochs,
            dump_model_name='rnn.model', Xv=[], Yv=[], Xt=[], Yt=[])

Epoch:0	Loss:1.44560360909
Epoch:1	Loss:1.4207098484
Epoch:2	Loss:1.38999915123
Epoch:3	Loss:1.34529662132
Epoch:4	Loss:1.26035785675
Epoch:5	Loss:1.21140086651
Epoch:6	Loss:1.19641709328
Epoch:7	Loss:1.18950366974
Epoch:8	Loss:1.18510937691
Epoch:9	Loss:1.18184173107
Epoch:10	Loss:1.1793115139
Epoch:11	Loss:1.17718434334
Epoch:12	Loss:1.17545425892
Epoch:13	Loss:1.17351567745
Epoch:14	Loss:1.17184734344
Epoch:15	Loss:1.17022073269
Epoch:16	Loss:1.16840004921
Epoch:17	Loss:1.16667103767
Epoch:18	Loss:1.16479432583
Epoch:19	Loss:1.16287446022
Epoch:20	Loss:1.16089355946
Epoch:21	Loss:1.15896916389
Epoch:22	Loss:1.15649724007
Epoch:23	Loss:1.15417468548
Epoch:24	Loss:1.15162622929
Epoch:25	Loss:1.14891862869
Epoch:26	Loss:1.14583647251
Epoch:27	Loss:1.14273035526
Epoch:28	Loss:1.13938331604
Epoch:29	Loss:1.1359089613
Epoch:30	Loss:1.13248312473
Epoch:31	Loss:1.12846362591
Epoch:32	Loss:1.12505030632
Epoch:33	Loss:1.12149059772
Epoch:34	Loss:1.11919689178
Epoch:35	Loss:1.11651861668
Epoch