# 6. Recurrent Neural Networks and Language Models

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture8.pdf
* https://arxiv.org/pdf/1504.00941.pdf
* https://arxiv.org/pdf/1609.07843.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [4]:
def pad_to_batch(batch):
    x,y = list(zip(*batch))
    max_x = max([s.size(1) for s in x])
    max_y = max([s.size(1) for s in y])
    x_p,y_p=[],[]
    for i in range(len(batch)):
        if x[i].size(1)<max_x:
            x_p.append(torch.cat([x[i],Variable(LongTensor([word2index['<PAD>']]*(max_x-x[i].size(1)))).view(1,-1)],1))
        else:
            x_p.append(x[i])
        if y[i].size(1)<max_y:
            y_p.append(torch.cat([y[i],Variable(LongTensor([word2index['<PAD>']]*(max_y-y[i].size(1)))).view(1,-1)],1))
        else:
            y_p.append(y[i])
    return list(zip(x_p,y_p))

In [5]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if w in to_index.keys() else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

In [18]:
corpus = list(nltk.corpus.brown.sents())
corpus = [[word.lower() for word in sent] for sent in corpus]

### Mean sentence length

In [19]:
seq_lengths = Counter([len(s) for s in corpus])
np.mean(flatten([[key]*count for key, count in seq_lengths.items()]))

20.250994070456922

In [20]:
corpus = [sent for sent in corpus if len(sent)<=20 and len(sent)>1] # for practice

In [21]:
vocab = list(set(flatten(corpus)))

In [22]:
word2index={'<PAD>':0,'<UNK>':1}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)
        
index2word={v:k for k,v in word2index.items()}

In [23]:
X_p,y_p = [],[]

In [24]:
for sent in corpus:
    X_p.append(prepare_sequence(sent[:-1],word2index).view(1,-1))
    y_p.append(prepare_sequence(sent[1:],word2index).view(1,-1))

In [212]:
data_p = list(zip(X_p,y_p))
random.shuffle(data_p)
train_data = data_p[:int(len(data_p)*0.9)]
dev_data = data_p[int(len(data_p)*0.9):]

### Modeling 

In [213]:
class LanguageModel(nn.Module): 
    def __init__(self,vocab_size,embedding_size,hidden_size,num_layers=1):

        super(LanguageModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size,embedding_size)
        self.rnn = nn.RNN(embedding_size,hidden_size,num_layers,nonlinearity='relu',batch_first=True)
        self.linear = nn.Linear(hidden_size,vocab_size)
        
        self.init_rnn() # IRNN
        self.init_embed()
        
    def init_embed(self):
        self.embed.weight = nn.init.xavier_uniform(self.embed.weight)
    
    def init_rnn(self):
        self.rnn.weight_hh_l0 = nn.init.eye(self.rnn.weight_hh_l0)
        self.rnn.weight_ih_l0 = nn.init.eye(self.rnn.weight_ih_l0)
        self.rnn.bias_hh_l0.data.fill_(0)
        self.rnn.bias_ih_l0.data.fill_(0)
        
    def init_hidden(self,inputs):
        return Variable(FloatTensor(self.num_layers,inputs.size(0),self.hidden_size))
        
    def forward(self, inputs): 
        hidden = self.init_hidden(inputs)
        embeds = self.embed(inputs) # BxWxD
        out,hidden = self.rnn(embeds,hidden)
        return F.log_softmax(self.linear(out.contiguous().view(out.size(0)*out.size(1),-1)))

### Train 

It takes for a while. And It sometimes explodes its gradient because of 'relu'. I reference <a href="https://arxiv.org/pdf/1504.00941.pdf">this paper</a> about IRNN. I don't know why it happens.

In [214]:
EMBED_SIZE=300
HIDDEN_SIZE=512
LR = 0.001
BATCH_SIZE = 128
STEP = 10

In [215]:
model = LanguageModel(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss(ignore_index=0) # ignore pad
optimizer = optim.Adam(model.parameters(),lr=LR)

In [216]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        batch = pad_to_batch(batch)
        x,y = list(zip(*batch))
        inputs = torch.cat(x)
        targets = torch.cat(y)

        model.zero_grad()
        preds = model(inputs)

        loss = loss_function(preds,targets.view(-1))
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0] )
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5) # gradient clipping
        optimizer.step()
        
        
        if i % 100==0:
            print("[%d/%d] mean_loss : %0.2f, perplexity : %0.2f" %(step,STEP,np.mean(losses),np.exp(np.mean(losses))))
            losses=[]

[0/10] mean_loss : 10.26, perplexity : 28490.04
[0/10] mean_loss : 7.70, perplexity : 2208.69
[0/10] mean_loss : 6.86, perplexity : 954.94
[1/10] mean_loss : 6.35, perplexity : 570.95
[1/10] mean_loss : 6.37, perplexity : 581.53
[1/10] mean_loss : 6.20, perplexity : 491.79
[2/10] mean_loss : 5.73, perplexity : 307.51
[2/10] mean_loss : 5.74, perplexity : 311.53
[2/10] mean_loss : 5.69, perplexity : 294.82
[3/10] mean_loss : 5.30, perplexity : 199.91
[3/10] mean_loss : 5.30, perplexity : 200.26
[3/10] mean_loss : 5.27, perplexity : 195.09
[4/10] mean_loss : 4.99, perplexity : 147.40
[4/10] mean_loss : 4.91, perplexity : 135.66
[4/10] mean_loss : 4.90, perplexity : 134.93
[5/10] mean_loss : 4.59, perplexity : 98.18
[5/10] mean_loss : 4.55, perplexity : 94.36
[5/10] mean_loss : 4.54, perplexity : 93.75
[6/10] mean_loss : 4.16, perplexity : 63.95
[6/10] mean_loss : 4.17, perplexity : 64.69
[6/10] mean_loss : 4.19, perplexity : 66.17
[7/10] mean_loss : 3.89, perplexity : 49.12
[7/10] mean_l

### Test 

In [234]:
accuracy=0

In [235]:
for dev in dev_data:
    input,target = dev[0],dev[1]
    
    pred = model(input).max(1)[1].data.cpu().tolist() if USE_CUDA else model(input).max(1)[1].data.tolist() 
    target = target.data.cpu().tolist() if USE_CUDA else target.data.tolist()
    
    accuracy+=np.equal(pred,target).sum()
    
print(accuracy/len(flatten(dev_data))*100)

89.4039735099


In [208]:
test_sent = ' '.join(random.choice(corpus))
# test_sent = 'Jane said hi to'

In [209]:
test_sent

'if the turn was too tight , a barrel roll would bring them out .'

In [210]:
input = prepare_sequence(test_sent[:-1].split(),word2index).view(1,-1)
pred = model(input).max(1)[1].data.cpu().tolist() if USE_CUDA else model(input).max(1)[1].data.tolist() 

In [211]:
' '.join(list(map(lambda i: index2word[i], pred)))

'you crummy cleared empty , , he man roll would be out . .'