## Vanilla char-RNN
Implement, train, and evaluate a simple RNN whose inputs are sequences of characters taken from sentences. The trained RNN can then be used to generate sentences character by character.

No PyTorch here, only raw NumPy code. The idea is to have you go deeper into the model.

Explore the exploding and vanishing gradient issues in this example to gain a deeper understanding of RNNs before moving on to fancier recurrent networks such as LSTMs (bidirectional, possibly with attention) in the following sections.

The equation numbers refer to the Deep Learning book (online version).
Chap10 https://www.deeplearningbook.org/contents/rnn.html

In [0]:
# import os
# from google.colab import drive
# drive.mount('/content/gdrive')
# !pwd
# os.chdir('gdrive/My Drive/research/Datasets/Amazon_UCSD')
# !pwd
# !ls

In [0]:
# First import necessary packages and define global variables
"""
    Codes borrowed from Karpathy:
    https://gist.github.com/karpathy/d4dee566867f8291f086
"""
import numpy as np
import pandas as pd
import pickle


# Vocabulary and corpora. These module-level names are shared state that the
# functions below read and rebind through `global` statements.
char_to_ix = {}  # character -> integer index; built by reviews_to_ix()
ix_to_char = {}  # integer index -> character; built by reviews_to_ix()
review_in_ix = None # training corpus: list of reviews, each a list of char indices
valid_review_in_ix = None # validation corpus, same layout as review_in_ix
test_review_in_ix = None # test corpus, same layout as review_in_ix
texts = None  # NOTE(review): not assigned by any function visible in this file
avg_len = None  # mean training-review length in characters; set by reviews_to_ix()


# model and training
hidden_size = 128  # number of hidden units (length of the h vector)
learning_rate = 1e-4  # step size used by the update in train()
input_size = None   # the number of unique characters in the alphabet
output_size = None  # same as input_size


# model parameters: the naming follows the "Deep Learning" book RNN notation.
# They stay None until init_model() or load_model() fills them in.
U = None    # mapping from x to h
W = None    # mapping from previous h to current h
b = None    # bias added when computing the hidden state h
V = None    # mapping from h to o
c = None    # bias in mapping from h to o

# Then define utility functions for saving, loading, and initializing models.

def save_model(epoch_id, iteration_id, state_dict):
    """
    Pickle a training checkpoint to disk.

    The checkpoint is written to
    ``assignment2/char_rnn_model_{epoch_id}_{iteration_id}.pkl`` relative to the
    current working directory. The directory is created if it does not exist,
    so the first checkpoint of a fresh run does not fail.

    epoch_id: epoch counter used in the file name.
    iteration_id: iteration counter used in the file name.
    state_dict: the object to pickle (a dict of parameters and optimizer state).
    """
    import os  # local import so this function does not depend on the import cell

    checkpoint_path = f'assignment2/char_rnn_model_{epoch_id}_{iteration_id}.pkl'
    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    with open(checkpoint_path, 'wb') as out_f:
        pickle.dump(state_dict, out_f)


def load_model(epoch_id, iteration_id):
    """
    Restore a checkpoint into the module-level model state.

    Reads ``assignment2/char_rnn_model_{epoch_id}_{iteration_id}.pkl`` and
    rebinds the RNN parameters (U, V, W, b, c), the per-parameter accumulators
    (mU, mV, mW, mb, mc), the smoothed loss, and the epoch / iteration counters.

    NOTE: pickle.load can execute arbitrary code embedded in the file, so only
    load checkpoints that you produced yourself.

    epoch_id: epoch counter in the checkpoint file name; becomes `epoches`.
    iteration_id: iteration counter in the file name; becomes `n`.
    """
    global U, V, W, b, c
    global mU, mV, mW, mb, mc
    global epoches, n
    global smooth_loss

    print(f'Loading pre-trained model from epoch {epoch_id}, iteration {iteration_id}')
    checkpoint_path = f'assignment2/char_rnn_model_{epoch_id}_{iteration_id}.pkl'
    with open(checkpoint_path, 'rb') as in_f:
        state = pickle.load(in_f)

    # model parameters
    U, V, W = state['U'], state['V'], state['W']
    b, c = state['b'], state['c']

    # accumulated squared gradients, one per parameter
    mU, mV, mW = state['mU'], state['mV'], state['mW']
    mb, mc = state['mb'], state['mc']

    # resume the counters from where the checkpoint was taken
    epoches, n = epoch_id, iteration_id

    smooth_loss = state['smooth_loss']


def init_model():
    """
    Start a fresh model: random weights, zero biases, zeroed training state.

    Reads the module-level hidden_size, input_size and output_size, and rebinds
    the parameters (U, V, W, b, c), the squared-gradient accumulators
    (mU, mV, mW, mb, mc), the counters (epoches, n) and smooth_loss.
    """
    global U, V, W, b, c
    global mU, mV, mW, mb, mc
    global epoches, n
    global smooth_loss

    init_scale = 0.01  # keep the initial weights small

    # The draw order (W, then U, then V) determines the values for a given seed.
    W = init_scale * np.random.randn(hidden_size, hidden_size)  # h -> h
    U = init_scale * np.random.randn(hidden_size, input_size)   # x -> h
    V = init_scale * np.random.randn(output_size, hidden_size)  # h -> o

    b = np.zeros((hidden_size, 1))  # hidden bias
    c = np.zeros((output_size, 1))  # output bias

    # one zeroed accumulator per parameter, matching its shape
    mU = np.zeros_like(U)
    mV = np.zeros_like(V)
    mW = np.zeros_like(W)
    mb = np.zeros_like(b)
    mc = np.zeros_like(c)

    # epoch counter and position inside the training corpus
    epoches = 0
    n = 0

    # per-character loss of a uniform guess over the alphabet
    smooth_loss = -np.log(1.0/output_size) # * avg_len

# Functions for handling text data

def read_reviews(path):
    """
    Read a csv file of Amazon reviews and return the reviews as a list.

    Only the first column of the csv is used. Values are returned as pandas
    parsed them, so a cell that is not text (for example a missing value) is
    passed through unchanged for the caller to deal with.

    path: pointing to the csv file.
    return: a list with one entry per csv row.
    """
    print(path)
    df = pd.read_csv(path)
    # Take the whole first column in one step instead of a per-row lookup
    # wrapped in a catch-all handler that could silently truncate the corpus.
    texts = df.iloc[:, 0].tolist() if df.shape[1] > 0 else []
    # Report what was actually loaded.
    print('{} reviews loaded.'.format(len(texts)))
    return texts


def reviews_to_ix(texts):
    """
    Build indices of the characters (the alphabet)
    and turn the sentences into sequences of char indices.

    The alphabet is sorted before indices are assigned, so the same training
    data always yields the same char <-> index mapping. Iterating a set
    directly gives an order that can change between interpreter runs, which
    would make a saved checkpoint meaningless in a new session.

    texts: iterable of reviews. Entries that cannot be iterated
        (e.g. a float NaN from an empty csv cell) are skipped.

    Side effects: rebinds the module-level char_to_ix, ix_to_char,
    review_in_ix, avg_len, input_size and output_size.
    """

    global char_to_ix, ix_to_char, review_in_ix, avg_len
    global input_size, output_size

    chars = set()
    for review in texts:
        try:
            chars.update(review)
        except TypeError:
            # not a string / not iterable: ignore this entry
            continue

    # Deterministic ordering; 'eos' (end of sequence) takes the last index.
    alphabet = sorted(chars)
    char_to_ix = {ch: i for i, ch in enumerate(alphabet)}
    char_to_ix['eos'] = len(char_to_ix)
    ix_to_char = {i: ch for ch, i in char_to_ix.items()}

    # turn reviews into sequence of character indices.
    review_in_ix = []
    for review in texts:
        try:
            review_in_ix.append([char_to_ix[ch] for ch in review])
        except TypeError:
            # same non-iterable entries that were skipped above
            continue

    input_size = len(char_to_ix)
    output_size = input_size    # the output space is the same as the input space

    print(f'{len(review_in_ix)} reviews; '
          f'{len(char_to_ix)} unique chars (input_size); '
          f'{sum([len(s) for s in review_in_ix])} chars in total')
    avg_len = np.mean([len(s) for s in review_in_ix])


def more_review_to_ix(more_texts):
    """
    Assume that the training reviews have been indexed and the char_to_ix is built,
    now index additional reviews (for validation and testing).

    A review is dropped when it contains a character that is not in
    char_to_ix, or when it is not iterable (e.g. a float NaN from an empty
    csv cell). Any other error is allowed to propagate.

    more_texts: iterable of reviews.
    return: list of reviews, each a list of char indices.
    """
    # turn reviews into sequence of character indices.
    more_review_in_ix = []
    for review in more_texts:
        try:
            more_review_in_ix.append([char_to_ix[ch] for ch in review])
        except (KeyError, TypeError):
            # KeyError: character unseen in training; TypeError: not a string
            continue
    return more_review_in_ix

# Define the RNN loss function.

def lossFunc(inputs, targets, hprev = None):
    """
    Define the loss function and implement the BPTT algorithm.
    Note that this is not the built-in PyTorch loss.

    inputs: an array of integers, representing char indices in a review.
    targets: an array of integers, which is the shift of inputs to the right by one time step.
    hprev: initial hidden states. If None, start from all zeros.

    return: (loss, dU, dV, dW, db, dc, h_last)
        loss is the cross entropy summed over all time steps,
        the d* arrays are the gradients of that loss w.r.t. U, V, W, b, c,
        each clipped element-wise to [-1, 1],
        h_last is the hidden state after the final step (the initial state
        when inputs is empty).
    """
    # RNN parameters
    global U, V, W, b, c

    assert len(inputs) == len(targets), f'inputs and targets have different lengths,\
        {len(inputs)} != {len(targets)}'

    if hprev is None:
        # documented default: begin from an all-zero hidden state
        hprev = np.zeros_like(b)

    # use dictionaries to ease indexing of the time steps
    # key = time-steps, value = the vectors at those steps.
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev) # the initial hidden units before the 0-th step
    loss = 0

    # forward pass over every time step
    for t, (char_ix, target_ix) in enumerate(zip(inputs, targets)):
        # create an one-hot vector for the char at position t
        xs[t] = np.zeros((input_size, 1))
        xs[t][char_ix] = 1
        # compute the hidden units (Eq. 10.9 of the DL book)
        hs[t] = np.tanh(b + np.dot(W, hs[t - 1]) + np.dot(U, xs[t]))
        # the unnormalized log output probabilities (Eq. 10.10 of the DL book)
        logits = np.dot(V, hs[t]) + c
        # the output probabilities of the next character (Eq. 10.11 of the DL book)
        # subtracting the max leaves the softmax unchanged but avoids overflow in exp
        exp_logits = np.exp(logits - np.max(logits))
        ps[t] = exp_logits / np.sum(exp_logits)

        # cross entropy loss (negative log-likelihood) of the correct next char
        loss += -np.log(ps[t][target_ix, 0])

    # backward pass: implementing BPTT (Back-Propagation through Time), DL Section 10.2.2
    dU, dV, dW = np.zeros_like(U), np.zeros_like(V), np.zeros_like(W)
    db, dc = np.zeros_like(b), np.zeros_like(c)
    dhnext = np.zeros_like(b)  # gradient flowing back from step t+1; none after the last step

    # starting from the last step.
    for t in reversed(range(len(inputs))):
        # at the current t, find dL/do = P(y) - y. Eq. 10.18 in the DL book.
        # i.e. subtract 1 from the probability of the correct class
        do = np.copy(ps[t])
        do[targets[t]] -= 1

        # Eq. 10.24 in the DL book for a single t.
        dV += np.dot(do, hs[t].T)
        # Eq. 10.22 in the DL book for a single t.
        dc += do

        # gradient on h(t): from the output at t plus from the hidden state at t+1
        dh = np.dot(V.T, do) + dhnext
        dhraw = (1 - hs[t] * hs[t]) * dh    # backprop through the tanh calculating hs[t]

        # Eq. (10.25) and (10.27)
        dU += np.dot(dhraw, xs[t].T)
        dW += np.dot(dhraw, hs[t - 1].T)

        # Eq. (10.23)
        db += dhraw

        # prepare dh in the horizontal direction for the next iteration
        dhnext = np.dot(W.T, dhraw)

    # gradient clipping (in place) to limit exploding gradients
    for dparam in [dU, dV, dW, db, dc]:
        np.clip(dparam, -1, 1, out=dparam)

    return loss, dU, dV, dW, db, dc, hs[len(inputs) - 1]


def sample(h, seed_idx, n):
    """
    sample a sentence of length n from the current model.
    Each sampled char is fed back into the RNN as the input of the next step.
    (This is free-running generation, not teacher forcing: teacher forcing
    would feed the ground-truth char, which does not exist when sampling.)
    :param h: memory state, used as the hidden state before the first step
    :param seed_idx: index of the seed letter for the first time step
    :param n: length of the sentence
    :return: list of the n sampled char indices (the seed is not included)
    """
    global U, V, W, b, c

    x = np.zeros((input_size, 1))
    x[seed_idx] = 1
    ixes = []

    for _ in range(n):
        # update h
        h = np.tanh(np.dot(U, x) + np.dot(W, h) + b)
        # unnormalized log probability
        o = np.dot(V, h) + c
        # output probability; subtracting the max keeps exp from overflowing,
        # which would turn p into NaN and make np.random.choice raise
        exp_o = np.exp(o - np.max(o))
        p = exp_o / np.sum(exp_o)
        # sample one char index
        ix = np.random.choice(range(output_size), p=p.ravel())
        # prepare the one-hot vector for the next location:
        # the model consumes its own sample
        x = np.zeros((input_size, 1))
        x[ix] = 1
        ixes.append(ix)

    return ixes


def run_model(corpus_in_ix):
    """
    Evaluate the current model on a corpus without updating it.

    Every review is scored from an all-zero hidden state with lossFunc; the
    targets are the inputs shifted by one step with 'eos' appended.

    corpus_in_ix: list of reviews, each a list of char indices.
    return: the cross-entropy loss averaged over all predicted characters, or
        NaN when the corpus contains no characters at all (instead of dividing
        by zero).
    """
    total_loss = 0
    total_targets = 0

    for inputs in corpus_in_ix:
        if not inputs:
            # an empty review has nothing to predict
            continue
        # same length as inputs: drop the first char, append the eos symbol
        targets = inputs[1:] + [char_to_ix['eos']]
        hprev = np.zeros((hidden_size, 1))
        loss = lossFunc(inputs, targets, hprev)[0]  # the gradients are not needed here
        total_loss += loss
        total_targets += len(targets)

    if total_targets == 0:
        return float('nan')
    return total_loss / total_targets


def train():
    """
    SGD training with Adagrad updates.
    Each iteration takes one review, runs forward and backward propagation on
    it, and updates the parameters with Adagrad (the step of every weight is
    divided by the root of its accumulated squared gradients).

    The model parameters, the Adagrad accumulators, the epoches, n, and smooth_loss
    should have been initialized somewhere outside train().

    Every 2000 iterations a sample is printed, the loss on the first 30
    validation and test reviews is reported, and a checkpoint is saved.
    The loop never returns; interrupt it to stop training.

    Raises ValueError if the training corpus has no non-empty review.
    """
    global U, V, W, b, c
    global mU, mV, mW, mb, mc
    global epoches, n
    global smooth_loss
    global review_in_ix, valid_review_in_ix, test_review_in_ix

    # Empty reviews are skipped below; without at least one usable review the
    # loop would spin forever doing nothing.
    if not any(review_in_ix):
        raise ValueError('training corpus has no non-empty reviews')

    while True:
        # prepare inputs and the targets.
        if n == len(review_in_ix) or n == 0:
            epoches += 1
            n = 0

        inputs = review_in_ix[n]
        if not inputs:
            # nothing to predict, and the per-character loss would divide by zero
            n += 1
            continue

        # They are of the same lengths, and targets is inputs shift to the right by one step with a special eos symbol
        targets = inputs + [char_to_ix['eos']]
        targets = targets[1:]

        # forward and backward pass, starting from an all-zero hidden state
        hprev = np.zeros((hidden_size, 1))
        loss, dU, dV, dW, db, dc, _ = lossFunc(inputs, targets, hprev)
        loss /= len(inputs)  # per-character loss
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        for param, dparam, mem in zip([U, V, W, b, c],
                                      [dU, dV, dW, db, dc],
                                      [mU, mV, mW, mb, mc]):
            # both updates are in place, so the module-level arrays change
            mem += dparam * dparam  # accumulate element-wise squared gradients
            param -= learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

        # sample a sentence from the current model
        if n % 2000 == 0:
            ixes = sample(np.zeros((hidden_size, 1)), inputs[0], 200)
            print(f'-----{[ix_to_char[ix] for ix in ixes]}-----')
            valid_loss = run_model(valid_review_in_ix[:30])
            test_loss = run_model(test_review_in_ix[:30])
            print(f'iter {n}, training loss: {smooth_loss}, validation loss: {valid_loss}, test loss: {test_loss}')
            save_model(epoches, n, {'U':U, 'V':V, 'W':W, 'b':b, 'c':c, 'mU':mU, 'mV': mV, 'mW': mW, 'mb': mb, 'mc': mc, 'smooth_loss':smooth_loss})

        n += 1

# Main codes for model training

# Script entry point: index the three corpora, build or restore the model,
# then start training.
if __name__ == '__main__':

    # Set to True to resume from a saved checkpoint instead of a fresh model.
    used_pretrained = False

    # The training set must be indexed first: reviews_to_ix() builds
    # char_to_ix / ix_to_char and sets input_size / output_size, which the
    # remaining steps rely on.
    tr_reviews = read_reviews('train_small.csv')
    reviews_to_ix(tr_reviews)
    
    # Test and validation reviews are indexed with the training alphabet.
    test_reviews = read_reviews('test_small.csv')
    test_review_in_ix = more_review_to_ix(test_reviews)

    valid_reviews = read_reviews('valid_small.csv')
    valid_review_in_ix = more_review_to_ix(valid_reviews)
    
    # optionally load a pre-trained model
    if used_pretrained:
        # these two numbers select the checkpoint file name used by load_model()
        pre_trained_epoches = 1
        pre_trained_iterations = 800
        load_model(pre_trained_epoches, pre_trained_iterations)
    else:
        # randomly initialized the model
        init_model()

    # valid_loss = run_model(valid_review_in_ix[:30])
    # test_loss = run_model(test_review_in_ix[:30])
    # print(f'validation loss: {valid_loss}, test loss: {test_loss}')
    # train() loops forever (while True); interrupt the process to stop it.
    train()


train_small.csv
15989 reviews loaded.
15989 reviews; 94 unique chars (input_size); 9418362 chars in total
test_small.csv
4997 reviews loaded.
valid_small.csv
3998 reviews loaded.
-----['-', 'v', 'X', '~', 'H', 'M', 'R', '~', 'U', '^', 'h', ')', '3', '!', 'r', 'E', 'r', '6', '6', '+', 'eos', '{', 'X', '/', 'n', 'M', 'B', 'X', 'l', '&', 'G', 'G', 'C', '`', 's', '2', '%', 't', 'x', ';', 'o', 'M', '.', '}', '9', '5', '1', 'q', 'K', '?', '-', '|', 'V', '2', '%', 'h', 'R', 'j', 'G', '>', '!', 'L', '7', 'V', 'A', '5', '_', '/', 'K', '.', 'K', '~', '7', 'I', 'M', 'l', 'g', '}', 'e', 'Y', 'D', '@', 'N', 'U', 'm', 'b', 'q', '0', 'b', 'N', '-', '=', 'Z', ';', 'g', 'W', 'B', 'W', '{', 'x', 'Q', 'V', 'M', "'", 'z', 'w', 'x', '?', '*', '^', 'h', 't', '`', 'K', 'j', '%', '4', '?', 'F', 'v', 'G', '7', 'U', 'v', 'M', 'T', 'S', 'W', '=', ' ', '8', 'm', 'z', '!', '0', 'f', 'C', '8', 'T', '6', '0', 'eos', 'M', 'T', '+', '}', 'G', 'u', '{', 'x', 'q', '}', 'm', 'F', 't', 'h', '1', 'b', 'W', '5', 'g', 'y', '