In [None]:
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
# Seed both random number generators this notebook relies on: MXNet draws
# the initial weights, NumPy draws characters in sample() via
# np.random.choice. Seeding only one leaves the printed samples
# non-reproducible.
mx.random.seed(1)
np.random.seed(1)
# Every NDArray below is allocated on the first GPU.
ctx = mx.gpu(0)

In [None]:
# Read the whole training corpus into a single string.
corpus_path = "../data/nlp/timemachine.txt"
with open(corpus_path) as corpus_file:
    time_machine = corpus_file.read()

In [None]:
# Peek at the first 500 characters of the corpus.
print(time_machine[:500])

In [None]:
# Show a window of text just after the cut point, then discard the final
# 38083 characters of the corpus.
# NOTE(review): presumably the tail is non-story boilerplate; confirm against
# the printed window. Re-running this cell trims the text again.
tail_chars = 38083
print(time_machine[-38075:-37500])
time_machine = time_machine[:-tail_chars]

In [None]:
# Vocabulary: every distinct character in the corpus. Sorting makes the
# character -> index assignment identical on every run; the iteration order
# of a bare set of strings can change between interpreter sessions.
character_list = sorted(set(time_machine))
vocab_size = len(character_list)
print(character_list)
print("Length of vocab: %s" % vocab_size)

In [None]:
# Reverse lookup: character -> its position in character_list.
character_dict = {char: position for position, char in enumerate(character_list)}
print(character_dict)

In [None]:
# Encode the corpus as a list of vocabulary indices, one per character.
time_numerical = list(map(character_dict.__getitem__, time_machine))

In [None]:
print(len(time_numerical))

print(time_numerical[:20])

print("".join([character_list[idx] for idx in time_numerical[:39]]))

In [None]:
def one_hots(numerical_list, vocab_size=vocab_size):
    """One-hot encode a sequence of vocabulary indices.

    Args:
        numerical_list: sequence of integer indices, each expected to lie in
            ``[0, vocab_size)``.
        vocab_size: width of each one-hot row. The default is the value the
            notebook-level ``vocab_size`` had when this cell was run.

    Returns:
        An NDArray on ``ctx`` with shape ``(len(numerical_list), vocab_size)``
        holding 1.0 at ``[i, numerical_list[i]]`` and 0.0 elsewhere.
    """
    # One vectorised operator call instead of one element write per
    # character, which is very slow when the array lives on the GPU.
    indices = nd.array(numerical_list, ctx=ctx)
    return nd.one_hot(indices, vocab_size)

In [None]:
# Show the one-hot rows for the first two characters of the corpus.
first_two = one_hots(time_numerical[:2])
print(first_two)

In [None]:
def textify(embedding):
    """Turn a 2-D array of per-character scores back into a string.

    For every row the index of the largest entry (argmax along axis 1) is
    looked up in ``character_list``; the resulting characters are
    concatenated in row order.
    """
    best = nd.argmax(embedding, axis=1).asnumpy()
    return "".join(character_list[int(k)] for k in best)

In [None]:
# Round trip: encode the first 40 characters and decode them again.
textify(one_hots(time_numerical[:40]))

In [None]:
seq_length = 64
# Hold one character back so that the label slice, which is shifted one
# position ahead, always has a next character available.
num_samples = (len(time_numerical) - 1) // seq_length
usable_chars = seq_length * num_samples
# Cut the encoded corpus into num_samples sequences of seq_length one-hot rows.
dataset = one_hots(time_numerical[:usable_chars]).reshape((num_samples, seq_length, vocab_size))
textify(dataset[0])

In [None]:
# Number of sequences processed together in one training step; also the
# leading dimension of the hidden state created in the training loop.
batch_size = 32

In [None]:
print("# of sequences in dataset: ", len(dataset))
num_batches = len(dataset) // batch_size
print("# of batches: ", num_batches)
# Keep only the sequences that fill whole batches, then lay them out as
# (batch_size, num_batches, seq_length, vocab_size).
whole_batches = dataset[: num_batches * batch_size]
train_data = whole_batches.reshape((batch_size, num_batches, seq_length, vocab_size))

# Two axis swaps reorder that to (num_batches, seq_length, batch_size,
# vocab_size), so train_data[i] is one batch that can be iterated step by step.
train_data = nd.swapaxes(nd.swapaxes(train_data, 0, 1), 1, 2)
print("Shape of data set: ", train_data.shape)

In [None]:
# Decode the first two sequences (batch columns 0 and 1) of the first three
# batches to eyeball the layout.
for i in range(3):
    first_col = textify(train_data[i, :, 0])
    second_col = textify(train_data[i, :, 1])
    print("***Batch %s:***\n %s \n %s \n\n" % (i, first_col, second_col))

In [None]:
# Labels are the inputs shifted one character ahead. Encode exactly the
# characters that fill whole batches so the reshape below matches the element
# count, the same way train_data slices dataset before reshaping. Without the
# explicit slice the array has seq_length * num_samples rows, which differs
# from the reshape target whenever num_samples is not a multiple of batch_size.
num_label_chars = num_batches * batch_size * seq_length
labels = one_hots(time_numerical[1: num_label_chars + 1])
train_label = labels.reshape((batch_size, num_batches, seq_length, vocab_size))
# Same axis reordering as train_data:
# (num_batches, seq_length, batch_size, vocab_size).
train_label = nd.swapaxes(train_label, 0, 1)
train_label = nd.swapaxes(train_label, 1, 2)
print(train_label.shape)

In [None]:
# The label sequence should read as the input sequence advanced by one
# character; compare one arbitrary (batch 10, column 3) pair.
sample_input = textify(train_data[10, :, 3])
sample_target = textify(train_label[10, :, 3])
print(sample_input)
print(sample_target)

In [None]:
# Layer sizes: one-hot characters in, a 256-unit hidden state, and one score
# per vocabulary entry out.
num_inputs = vocab_size
num_hidden = 256
num_outputs = vocab_size

# Every parameter starts as standard-normal noise scaled by this factor.
# The draws are kept in their original order so the seeded values do not change.
init_scale = 0.01

# Input -> hidden weights
Wxh = init_scale * nd.random_normal(shape=(num_inputs, num_hidden), ctx=ctx)

# Hidden -> hidden (recurrent) weights
Whh = init_scale * nd.random_normal(shape=(num_hidden, num_hidden), ctx=ctx)

# Hidden bias
bn = init_scale * nd.random_normal(shape=num_hidden, ctx=ctx)

# Hidden -> output weights
Why = init_scale * nd.random_normal(shape=(num_hidden, num_outputs), ctx=ctx)

# Output bias
by = init_scale * nd.random_normal(shape=num_outputs, ctx=ctx)

In [None]:
# Collect every trainable parameter and allocate gradient storage for it.
# The hidden bias is named `bn` where it is created and where simple_rnn
# reads it; the previous `bh` here was undefined and raised NameError.
params = [Wxh, Whh, bn, Why, by]
for param in params:
    param.attach_grad()

In [None]:
def softmax(y_linear, temperature=1.0):
    """Row-wise softmax with a temperature.

    Args:
        y_linear: 2-D array of scores; each row is normalised on its own.
        temperature: divisor applied to the shifted scores before
            exponentiation.

    Returns:
        An array of the same shape in which every row sums to 1.
    """
    # Subtract each row's maximum first so the exponentials cannot overflow.
    row_max = nd.max(y_linear, axis=1).reshape((-1, 1))
    scaled = (y_linear - row_max) / temperature
    weights = nd.exp(scaled)
    return weights / nd.sum(weights, axis=1).reshape((-1, 1))

In [None]:
# Reference point: temperature 1.0.
logits = nd.array([[1, -1], [-1, 1]])
softmax(logits, temperature=1.0)

In [None]:
# Same scores with a very high temperature.
logits = nd.array([[1, -1], [-1, 1]])
softmax(logits, temperature=1000.0)

In [None]:
# Larger scores with a low temperature.
logits = nd.array([[10, -10], [-10, 10]])
softmax(logits, temperature=.1)

In [None]:
def simple_rnn(inputs, state, temperature=1.0):
    """Run the recurrent network over a sequence of inputs.

    Args:
        inputs: iterable of per-time-step input arrays.
        state: hidden state to start from.
        temperature: forwarded to ``softmax`` for every output.

    Returns:
        A tuple ``(outputs, h)``: the list of per-step softmax outputs and
        the hidden state after the last step.
    """
    hidden = state
    predictions = []
    for step_input in inputs:
        # New hidden state from the current input and the previous state.
        pre_activation = nd.dot(step_input, Wxh) + nd.dot(hidden, Whh) + bn
        hidden = nd.tanh(pre_activation)
        # Project the hidden state to one score per vocabulary entry.
        scores = nd.dot(hidden, Why) + by
        predictions.append(softmax(scores, temperature=temperature))
    return (predictions, hidden)

In [None]:
def cross_entropy(yhat, y):
    """Mean cross-entropy between predictions ``yhat`` and targets ``y``.

    ``y * log(yhat)`` is summed over every axis except axis 0, the results
    are averaged over axis 0, and the sign is flipped.
    """
    per_example = nd.sum(y * nd.log(yhat), axis=0, exclude=True)
    return -nd.mean(per_example)

In [None]:
# Two identical predictions scored against two different one-hot targets.
yhat_demo = nd.array([[.2, .5, .3], [.2, .5, .3]])
y_demo = nd.array([[1., 0, 0], [0, 1., 0]])
cross_entropy(yhat_demo, y_demo)

In [None]:
def average_ce_loss(outputs, labels):
    """Average the cross-entropy loss over all time steps.

    Args:
        outputs: sequence of per-step predictions.
        labels: sequence of per-step targets, the same length as ``outputs``.

    Returns:
        The sum of ``cross_entropy`` over paired steps divided by the number
        of steps.
    """
    num_steps = len(outputs)
    assert num_steps == len(labels)
    # sum() starts from 0, the same starting value the running total used.
    step_losses = (cross_entropy(output, label) for output, label in zip(outputs, labels))
    return sum(step_losses) / num_steps

In [None]:
def SGD(params, lr):
    """Apply one gradient-descent step to every parameter in place.

    Args:
        params: iterable of parameters exposing ``.grad`` and supporting
            slice assignment.
        lr: learning rate that scales each gradient.
    """
    for weight in params:
        step = lr * weight.grad
        # Slice assignment writes into the existing array instead of
        # rebinding the name to a new one.
        weight[:] = weight - step

In [None]:
def sample(prefix, num_chars, temperature=1.0):
    """Generate text by repeatedly sampling the network's next-character output.

    Args:
        prefix: seed string; it is fed through the network first and forms
            the start of the returned text.
        num_chars: number of characters to generate after the prefix.
        temperature: forwarded to ``simple_rnn`` for every step.

    Returns:
        ``prefix`` followed by ``num_chars`` sampled characters.

    Raises:
        KeyError: if ``prefix`` contains a character missing from
            ``character_dict``.
    """
    string = prefix

    prefix_numerical = [character_dict[char] for char in prefix]
    # Was `ont_hots`, an undefined name. The local is also no longer called
    # `input`, so the builtin is not shadowed.
    rnn_input = one_hots(prefix_numerical)

    # Sampling handles a single sequence, so the hidden state has one row.
    sample_state = nd.zeros(shape=(1, num_hidden), ctx=ctx)

    for _ in range(num_chars):
        outputs, sample_state = simple_rnn(rnn_input, sample_state, temperature=temperature)
        # Draw the next character from the distribution of the last step.
        choice = np.random.choice(vocab_size, p=outputs[-1][0].asnumpy())
        string += character_list[choice]
        # Only the newly drawn character is fed in on the next iteration.
        rnn_input = one_hots([choice])
    return string

In [None]:
epochs = 2000
moving_loss = 0.

learning_rate = .5

for e in range(epochs):
    # Halve the learning rate whenever (e + 1) is a multiple of 100.
    if (e + 1) % 100 == 0:
        learning_rate /= 2.0

    # The hidden state is reset to zeros at the start of every epoch and then
    # carried from one batch to the next within it.
    state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for i in range(num_batches):
        with autograd.record():
            outputs, state = simple_rnn(train_data[i], state)
            loss = average_ce_loss(outputs, train_label[i])
            loss.backward()
        SGD(params, learning_rate)

        # Exponential moving average of the batch loss, initialised with the
        # very first batch's value.
        batch_loss = np.mean(loss.asnumpy()[0])
        is_first_batch = (i == 0) and (e == 0)
        if is_first_batch:
            moving_loss = batch_loss
        else:
            moving_loss = .99 * moving_loss + .01 * batch_loss

    # Report progress and print two generated samples after every epoch.
    print("Epoch %s. Loss: %s" % (e, moving_loss))
    print(sample("The Time Ma", 1024, temperature=.1))
    print(sample("The Medical Man rose, came to the lamp,", 1024, temperature=.1))