# 6. Recurrent Neural Networks and Language Models

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture8.pdf
* https://arxiv.org/pdf/1504.00941.pdf
* https://arxiv.org/pdf/1609.07843.pdf
* https://github.com/pytorch/examples/tree/master/word_language_model
* https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/language_model

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if w in to_index.keys() else to_index["<unk>"], seq))
    return LongTensor(idxs)

### Data load and Preprocessing

In [4]:
def prepare_ptb_dataset(filename,word2index=None):
    corpus = open(filename,'r',encoding='utf-8').readlines()
    corpus = flatten([co.strip().split() + ['</s>'] for co in corpus])
    
    if word2index==None:
        vocab = list(set(corpus))
        word2index={'<unk>':0}
        for vo in vocab:
            if vo not in word2index.keys():
                word2index[vo]=len(word2index)
    
    return prepare_sequence(corpus,word2index), word2index

In [5]:
# borrowed code from https://github.com/pytorch/examples/tree/master/word_language_model

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).contiguous()
    if USE_CUDA:
        data = data.cuda()
    return data

def get_batch(source, i, evaluation=False):
    seq_len = min(SEQ_LENGTH, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

In [6]:
train_data, word2index= prepare_ptb_dataset('../dataset/ptb/ptb.train.txt',)
dev_data , _ = prepare_ptb_dataset('../dataset/ptb/ptb.valid.txt',word2index)
test_data, _ = prepare_ptb_dataset('../dataset/ptb/ptb.test.txt',word2index)

In [7]:
len(word2index)

10000

In [8]:
index2word = {v:k for k,v in word2index.items()}

## Modeling 

In [9]:
class LanguageModel(nn.Module): 
    def __init__(self,vocab_size,embedding_size,hidden_size,n_layers=1,dropout_p=0.7):

        super(LanguageModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size,embedding_size)
        self.rnn = nn.RNN(embedding_size,hidden_size,n_layers,nonlinearity='relu',batch_first=True)
        self.linear = nn.Linear(hidden_size,vocab_size)
        self.dropout = nn.Dropout(dropout_p)
        self.init_rnn() # IRNN
        self.init_embed()
        
    def init_embed(self):
        self.embed.weight = nn.init.xavier_uniform(self.embed.weight)
    
    def init_rnn(self):
        self.rnn.weight_hh_l0 = nn.init.eye(self.rnn.weight_hh_l0)
        self.rnn.weight_ih_l0 = nn.init.eye(self.rnn.weight_ih_l0)
        self.rnn.bias_hh_l0.data.fill_(0)
        self.rnn.bias_ih_l0.data.fill_(0)
        
    def init_hidden(self,batch_size):
        hidden = Variable(torch.zeros(self.n_layers,batch_size,self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden
    
    def detach_hidden(self,hidden):
        return hidden.detach()
    
    def forward(self, inputs,hidden,is_training=False): 
#         hidden = self.init_hidden(inputs)
        embeds = self.embed(inputs) # BxWxD
        if is_training:
            embeds = self.dropout(embeds)
        out,hidden = self.rnn(embeds,hidden)
        return self.linear(out.contiguous().view(out.size(0)*out.size(1),-1)), hidden

## Train 

It takes for a while. And It sometimes explodes its gradient because of 'relu'. I reference <a href="https://arxiv.org/pdf/1504.00941.pdf">this paper</a> about IRNN. I don't know why it happens.

In [10]:
EMBED_SIZE=100
HIDDEN_SIZE=512
NUM_LAYER=4
LR = 0.001
SEQ_LENGTH = 30 # for bptt
BATCH_SIZE = 20
EPOCH = 100

In [11]:
train_data = batchify(train_data,BATCH_SIZE)
dev_data = batchify(dev_data,BATCH_SIZE//2)
test_data = batchify(test_data,BATCH_SIZE//2)

In [14]:
model = LanguageModel(len(word2index),EMBED_SIZE,HIDDEN_SIZE,NUM_LAYER)
if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=LR)

In [None]:
for epoch in range(EPOCH):
    losses=[]
    hidden = model.init_hidden(BATCH_SIZE)
    for i in range(0, train_data.size(1) - SEQ_LENGTH, SEQ_LENGTH):
        inputs = Variable(train_data[:, i:i+SEQ_LENGTH])
        targets = Variable(train_data[:, (i+1):(i+1)+SEQ_LENGTH].contiguous())
        
        hidden = model.detach_hidden(hidden)
        model.zero_grad()
        preds,hidden = model(inputs,hidden,True)

        loss = loss_function(preds,targets.view(-1))
        losses.append(loss.data.tolist()[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(),0.5) # gradient clipping
        optimizer.step()
    
    if epoch>0:
        print("[%d/%d] mean_loss : %0.2f, Perplexity : %0.2f" % (epoch,EPOCH, \
                                                                                              np.mean(losses),np.exp(np.mean(losses))))


### Test 

In [18]:
total_loss = 0
hidden = model.init_hidden(BATCH_SIZE//2)
losses=[]
for i in range(0, test_data.size(1) - SEQ_LENGTH, SEQ_LENGTH):
    inputs = Variable(test_data[:, i:i+SEQ_LENGTH],volatile=True)
    targets = Variable(test_data[:, (i+1):(i+1)+SEQ_LENGTH].contiguous())
        
    hidden = model.detach_hidden(hidden)
    model.zero_grad()
    preds,hidden = model(inputs,hidden)

    losses.append(loss_function(preds,targets.view(-1)).data.tolist()[0])

print("Test Perpelexity : %5.2f" % (np.exp(np.mean(losses))))

Test Perpelexity : 9673.71
