In [24]:
import numpy as np
from random import uniform
import sys


# NumPy has no built-in logistic sigmoid, so it is implemented here.
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The naive formula computes ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever applied
    to a non-positive number, and the algebraically equivalent form
    ``exp(x) / (1 + exp(x))`` is used for negative inputs.

    Args:
        x: real scalar or array-like.

    Returns:
        The element-wise sigmoid: a scalar for scalar input, otherwise an
        array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # integer / boolean input: promote so that exp() works in floating point
        x = x.astype(float)

    # z = exp(-|x|) always lies in [0, 1], so it can never overflow
    z = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))

    # [()] turns a 0-d array back into a scalar and leaves n-d arrays untouched
    return result[()]


# Sigmoid derivative, written in terms of the sigmoid's output.
def dsigmoid(y):
    """Return ``y * (1 - y)``.

    ``y`` is expected to be a value already produced by the sigmoid
    (the activation), not the pre-activation input.
    """
    complement = 1 - y
    return y * complement


# tanh derivative, written in terms of the tanh's output.
def dtanh(x):
    """Return ``1 - x * x``.

    ``x`` is expected to be a value already produced by tanh
    (the activation), not the pre-activation input.
    """
    squared = x * x
    return 1 - squared


# Numerically stable softmax.
def softmax(x, axis=0):
    """Normalise scores into probabilities along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent is ``exp(0) = 1`` and the exponential cannot overflow.

    Args:
        x: array of unnormalised scores. With the default ``axis=0`` the
            layout is ``[feature_size, batch_size]`` (one column per sample);
            a 1-D vector is also accepted.
        axis: axis holding the classes to normalise over. Defaults to 0,
            which is the behaviour every existing caller relies on.

    Returns:
        Array with the same shape as ``x`` whose entries sum to 1 along
        ``axis``.
    """
    # keepdims=True keeps the reduced axis so the result broadcasts against x
    # regardless of which axis was chosen
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

In [32]:
# Read the whole training corpus. The context manager guarantees the file
# handle is closed, which a bare open(...).read() does not.
with open('data/input.short.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Vocabulary: every distinct character of the corpus, sorted so that the
# character <-> index mapping is the same on every run.
chars = sorted(set(data))

data_size, vocab_size = len(data), len(chars)

In [33]:
# Display the vocabulary: the sorted list of distinct characters found in `data`.
chars

['\n',
 ' ',
 ',',
 '.',
 ':',
 ';',
 '?',
 'C',
 'F',
 'I',
 'L',
 'M',
 'S',
 'W',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z']

In [26]:
# Lookup tables between characters and their integer ids
# (the id of a character is its position in `chars`).
char_to_ix = dict(zip(chars, range(len(chars))))
ix_to_char = dict(enumerate(chars))

In [27]:
# Hyper-parameters fixing the network size and the training schedule.
emb_size = 4  # size of the embedding vector each character is mapped to
seq_length = 32  # number of steps to unroll the RNN for truncated back-propagation through time
hidden_size = 32  # number of units in the recurrent hidden state
# Learning rate for the Adagrad update (not tuned; it only needs to be large enough for the model to learn).
learning_rate = 0.02
std = 0.02  # scale applied to the standard-normal draws used for parameter initialization
batch_size = 4  # number of sequences processed side by side in one step

In [28]:
# Model parameters.
# Every weight and bias is drawn from a standard normal distribution
# (np.random.randn) and scaled by `std`.

# Character embedding: projects a one-hot vector of length vocab_size
# onto an emb_size-dimensional embedding x.
Wex = std * np.random.randn(emb_size, vocab_size)

# Input-to-hidden weights: embedding x -> hidden state h.
Wxh = std * np.random.randn(hidden_size, emb_size)

# Hidden-to-hidden (recurrent) weights: h_{t-1} -> h_t.
Whh = std * np.random.randn(hidden_size, hidden_size)

# Hidden-to-output weights: h -> unnormalised scores over the vocabulary.
Why = std * np.random.randn(vocab_size, hidden_size)

# Biases. Zeros are a common choice; here they get the same scaled-normal
# initialisation as the weights.
bh = std * np.random.randn(hidden_size, 1)  # hidden bias
by = std * np.random.randn(vocab_size, 1)  # output bias

# Adagrad state: one accumulator per parameter, same shape as the parameter,
# holding the running sum of squared gradients.
mWex = np.zeros_like(Wex)
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

In [29]:
# Encode the whole corpus as integer character ids (kept in memory).
data_stream = np.asarray(list(map(char_to_ix.__getitem__, data)))
print(data_stream.shape)

# Keep only the largest prefix whose length is a multiple of
# seq_length * batch_size, then lay it out as one row per batch lane.
bound = len(data_stream) - len(data_stream) % (seq_length * batch_size)
cut_stream = data_stream[:bound].reshape(batch_size, -1)

(606,)


In [30]:
# Display the encoded corpus: one integer id (an index into `chars`) per character of `data`.
data_stream

array([ 0,  0,  8, 22, 30, 31, 32,  1,  7, 22, 32, 22, 37, 18, 27,  4,  0,
       13, 18,  1, 14, 30, 18,  1, 14, 16, 16, 28, 33, 27, 32, 18, 17,  1,
       29, 28, 28, 30,  1, 16, 22, 32, 22, 37, 18, 27, 31,  2,  1, 32, 21,
       18,  1, 29, 14, 32, 30, 22, 16, 22, 14, 27, 31,  1, 20, 28, 28, 17,
        3,  0, 13, 21, 14, 32,  1, 14, 33, 32, 21, 28, 30, 22, 32, 36,  1,
       31, 33, 30, 19, 18, 22, 32, 31,  1, 28, 27,  1, 35, 28, 33, 25, 17,
        1, 30, 18, 25, 22, 18, 34, 18,  1, 33, 31,  4,  1, 22, 19,  1, 32,
       21, 18, 36,  0, 35, 28, 33, 25, 17,  1, 36, 22, 18, 25, 17,  1, 33,
       31,  1, 15, 33, 32,  1, 32, 21, 18,  1, 31, 33, 29, 18, 30, 19, 25,
       33, 22, 32, 36,  2,  1, 35, 21, 22, 25, 18,  1, 22, 32,  1, 35, 18,
       30, 18,  0, 35, 21, 28, 25, 18, 31, 28, 26, 18,  2,  1, 35, 18,  1,
       26, 22, 20, 21, 32,  1, 20, 33, 18, 31, 31,  1, 32, 21, 18, 36,  1,
       30, 18, 25, 22, 18, 34, 18, 17,  1, 33, 31,  1, 21, 33, 26, 14, 27,
       18, 25, 36,  5,  0

In [45]:
def forward(inputs, labels, memory, batch_size=1):
    """Run the RNN over a batch of character sequences and accumulate the loss.

    Back-propagation re-uses the values computed here, so every intermediate
    activation is stored per time step instead of being recomputed later.

    Args:
        inputs: integer array ``[batch_size, n_steps]`` of input character ids.
        labels: integer array ``[batch_size, n_steps]`` of target character ids.
        memory: hidden state to start from, ``[hidden_size, batch_size]``.
        batch_size: number of sequences (rows of ``inputs``) to process.

    Returns:
        A tuple ``(loss, activations, last_hidden)``:
          * ``loss``: cross-entropy summed over all time steps and samples.
          * ``activations``: ``(xs, cs, hs, os, ps, ys)``, dictionaries keyed
            by time step holding the embeddings, one-hot inputs, hidden
            states, output scores, probabilities and one-hot labels.
            ``hs[-1]`` is a copy of ``memory``.
          * ``last_hidden``: hidden state after the final time step.
    """
    inputs = np.asarray(inputs)
    labels = np.asarray(labels)
    n_steps = inputs.shape[1]
    # column index of every sample; paired with the character ids below to
    # address one (row, column) cell per sample
    batch_cols = np.arange(batch_size)

    # xs: embeddings fed to the RNN      cs: one-hot input characters
    # hs: hidden states                  os: unnormalised output scores
    # ps: probability distributions      ys: one-hot labels
    xs, cs, hs, os, ps, ys = {}, {}, {}, {}, {}, {}

    # the state before the first step is the previous (or initial) hidden state
    hs[-1] = np.copy(memory)

    # the loss is accumulated over time
    loss = 0

    for t in range(n_steps):
        input_ids = inputs[:batch_size, t]
        label_ids = labels[:batch_size, t]

        # one-hot representation of the input characters at time t
        cs[t] = np.zeros((vocab_size, batch_size))
        cs[t][input_ids, batch_cols] = 1

        # embedding lookup: x = Wex . c
        xs[t] = np.dot(Wex, cs[t])

        # hidden state: h_t = tanh(Wxh . x_t + Whh . h_{t-1} + bh)
        h_pre_activation = np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh
        hs[t] = np.tanh(h_pre_activation)

        # unnormalised log-probabilities of the next character
        os[t] = np.dot(Why, hs[t]) + by

        # normalised probabilities
        ps[t] = softmax(os[t])

        # one-hot representation of the labels (needed by the backward pass)
        ys[t] = np.zeros((vocab_size, batch_size))
        ys[t][label_ids, batch_cols] = 1

        # Cross entropy at time t. Only the probability assigned to the target
        # character is read: multiplying the full -log(p) matrix by the one-hot
        # labels would produce inf * 0 = nan as soon as any non-target
        # probability underflows to 0.
        loss += -np.sum(np.log(ps[t][label_ids, batch_cols]))

    # package the activations for the backward pass
    activations = (xs, cs, hs, os, ps, ys)
    last_hidden = hs[n_steps - 1]
    return loss, activations, last_hidden


In [43]:
def backward(activations, clipping=True, scale=True):
    """Back-propagate through time over the activations of one forward pass.

    The pass walks the time steps in reverse and re-uses the stored
    activations, so nothing from the forward pass has to be recomputed.

    Args:
        activations: the tuple ``(xs, cs, hs, os, ps, ys)`` returned by
            ``forward``; each element is a dictionary keyed by time step.
        clipping: if True, every gradient is clipped element-wise to
            ``[-1, 1]`` to mitigate exploding gradients.
        scale: if True, the output gradient is divided by the batch size
            (the last dimension of the hidden state).

    Returns:
        ``(dWex, dWxh, dWhh, dWhy, dbh, dby)``: gradients of the loss with
        respect to the corresponding parameters.
    """
    xs, cs, hs, os, ps, ys = activations

    # each parameter has a gradient of the same shape
    dWex, dWxh, dWhh, dWhy = np.zeros_like(Wex), np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)

    # The number of unrolled steps comes from the activations themselves
    # (xs holds exactly one entry per time step), not from a module-level
    # variable, so the function works on whatever it is handed.
    n_steps = len(xs)

    # the hidden state of the final step feeds no later step, so the gradient
    # flowing into it from the future is zero
    dh = np.zeros_like(hs[0])
    bsz = dh.shape[-1]

    # walk the chain backwards, starting from the final step
    for t in reversed(range(n_steps)):

        # gradient w.r.t. the unnormalised scores: for softmax followed by
        # cross entropy, dL/do = p - y
        do = ps[t] - ys[t]

        if scale:
            do = do / bsz

        # output layer weights and bias
        dWhy += np.dot(do, hs[t].T)
        dby += np.sum(do, axis=-1, keepdims=True)

        # h feeds both the output and the next hidden state: sum both gradients
        dh = np.dot(Why.T, do) + dh

        # through the tanh: h = tanh(h_pre_activation)
        dtanh_h = 1 - hs[t] * hs[t]
        dh_pre_activation = dtanh_h * dh

        # h_pre_activation = Wxh . x + Whh . h_{t-1} + bh
        dbh += np.sum(dh_pre_activation, axis=-1, keepdims=True)
        dWhh += np.dot(dh_pre_activation, hs[t - 1].T)
        # gradient handed to the previous time step through the recurrence
        dh = np.dot(Whh.T, dh_pre_activation)

        dWxh += np.dot(dh_pre_activation, xs[t].T)

        # through the embedding: x = Wex . c
        dx = np.dot(Wxh.T, dh_pre_activation)
        dWex += np.dot(dx, cs[t].T)

    if clipping:
        # dWex is clipped too: it is fed by the same recurrent chain as the
        # other gradients and can explode in the same way
        for dparam in [dWex, dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -1, 1, out=dparam)

    gradients = (dWex, dWxh, dWhh, dWhy, dbh, dby)
    return gradients


In [44]:
def sample(h, seed_ix, n):
    """Generate ``n`` character ids by sampling from the model.

    Args:
        h: hidden state to start from.
        seed_ix: id of the character fed in at the first step.
        n: number of characters to generate.

    Returns:
        List of the ``n`` sampled character ids.
    """
    def one_hot(position):
        # column vector with a single 1 at `position`
        vec = np.zeros((vocab_size, 1))
        vec[position] = 1
        return vec

    c = one_hot(seed_ix)
    generated_chars = []
    for _ in range(n):
        x = Wex.dot(c)
        h = np.tanh(Wxh.dot(x) + Whh.dot(h) + bh)
        p = softmax(Why.dot(h) + by)

        # draw one character from the predicted distribution; the draw is a
        # count vector with exactly one entry equal to 1
        draw = np.random.multinomial(1, p.ravel())
        index = int(np.argmax(draw))

        # the sampled character is the input of the next step
        c = one_hot(index)
        generated_chars.append(index)

    return generated_chars

## Train 

In [48]:
# Training loop: sweep a window of seq_length characters over cut_stream and
# update the parameters with Adagrad after every window.
# n counts iterations, p is the column where the current window starts.
n, p = 0, 0
data_length = cut_stream.shape[1]

# Starting value for the smoothed loss: the cross entropy of a model that
# assigns probability 1 / vocab_size to every character, summed over the
# seq_length positions of one window.
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0

while n < 5:
    # prepare inputs (we're sweeping from left to right in steps seq_length long);
    # the targets are the inputs shifted by one character, so the window plus
    # one extra column must still fit inside the data
    if p + seq_length + 1 >= data_length or n == 0:
        hprev = np.zeros((hidden_size, batch_size))  # reset RNN memory
        p = 0  # go back to start of data

    inputs = cut_stream[:, p:p + seq_length]
    targets = cut_stream[:, p + 1:p + 1 + seq_length]

    # sample from the model now and then: 1500 characters, seeded with the
    # first character of the first sequence and a fresh all-zero hidden state
    if n % 1000 == 0:
        h_zero = np.zeros((hidden_size, 1))
        sample_ix = sample(h_zero, inputs[0][0], 1500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt,))

    # forward seq_length characters through the net and fetch gradient;
    # hprev is overwritten with the final hidden state so that the next
    # window continues from where this one stopped
    loss, activations, hprev = forward(inputs, targets, hprev, batch_size=batch_size)
    gradients = backward(activations)
    dWex, dWxh, dWhh, dWhy, dbh, dby = gradients
    # exponential moving average of the per-sequence loss
    # (forward returns the loss summed over the batch, hence the division)
    smooth_loss = smooth_loss * 0.999 + loss / batch_size * 0.001
    if n % 20 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

    # perform parameter update with Adagrad; both updates are in place, so the
    # module-level parameter and accumulator arrays themselves are modified
    for param, dparam, mem in zip([Wex, Wxh, Whh, Why, bh, by],
                                  [dWex, dWxh, dWhh, dWhy, dbh, dby],
                                  [mWex, mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam  # running sum of squared gradients
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_length  # move data pointer
    n += 1  # iteration counter
