# 6. Recurrent Neural Networks and Language Models

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture8.pdf
* https://arxiv.org/pdf/1504.00941.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [2]:
USE_CUDA = torch.cuda.is_available()

# Pick the tensor constructors once, so the rest of the notebook can build
# tensors without checking USE_CUDA again.
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [3]:
def getBatch(batch_size,train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: list of examples. It is shuffled in place each time the
            generator is started.

    Yields:
        Slices of ``train_data`` holding ``batch_size`` examples each; the
        last slice holds the remainder and may be shorter. Nothing is yielded
        when ``train_data`` is empty.

    Raises:
        ValueError: if ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    # Stepping by batch_size covers every example exactly once and never
    # produces an empty batch.
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

In [4]:
def pad_to_batch(batch):
    """Right-pad every input and target in ``batch`` to the batch-wide maximum width.

    ``batch`` is a sequence of ``(x, y)`` pairs whose elements are 2-D with a
    leading dimension of 1 (widths are read with ``size(1)`` and padding is
    concatenated along dimension 1). Shorter rows are extended with the
    ``'<PAD>'`` id from the module-level ``word2index``; rows that already have
    the maximum width are returned unchanged.

    Returns a list of ``(padded_x, padded_y)`` pairs in the original order.
    """
    x, y = list(zip(*batch))
    max_x = max([seq.size(1) for seq in x])
    max_y = max([seq.size(1) for seq in y])

    def _pad(seq, width):
        # Inputs and targets are padded by the same rule, each to its own maximum.
        missing = width - seq.size(1)
        if missing <= 0:
            return seq
        filler = Variable(LongTensor([word2index['<PAD>']] * missing)).view(1, -1)
        return torch.cat([seq, filler], 1)

    x_p = [_pad(seq, max_x) for seq in x]
    y_p = [_pad(seq, max_y) for seq in y]
    return list(zip(x_p, y_p))

In [5]:
def prepare_sequence(seq, to_index):
    """Map every token of ``seq`` to its id in ``to_index`` and wrap the ids in a LongTensor Variable.

    Tokens that are not keys of ``to_index`` are replaced by the id stored
    under ``"<UNK>"``.
    """
    idxs = [to_index[w] if w in to_index else to_index["<UNK>"] for w in seq]
    return Variable(LongTensor(idxs))

In [32]:
# Brown corpus sentences, lower-cased word by word (each sentence stays a list of words).
corpus = [[word.lower() for word in sent] for sent in nltk.corpus.brown.sents()]

### Mean sentence length

In [33]:
# Histogram of sentence lengths; expanding it again gives one length per sentence,
# whose mean is shown as the cell output.
seq_lengths = Counter(len(s) for s in corpus)
np.mean(list(seq_lengths.elements()))

20.250994070456922

In [34]:
# for practice: keep only short sentences (2 to 10 tokens)
corpus = [sent for sent in corpus if 1 < len(sent) <= 10]

In [35]:
# Sort the unique words: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would give every word a different
# position in `vocab` each time the notebook is re-run.
vocab = sorted(set(flatten(corpus)))

In [36]:
# Ids 0 and 1 are reserved for padding and unknown words; every vocabulary word
# then receives the next free id, in `vocab` order.
word2index = {'<PAD>': 0, '<UNK>': 1}
for vo in vocab:
    word2index.setdefault(vo, len(word2index))

# Reverse lookup: id -> word.
index2word = {idx: word for word, idx in word2index.items()}

In [43]:
# Containers for the model inputs (X_p) and the next-word targets (y_p).
X_p = []
y_p = []

In [44]:
# Next-word prediction pairs: the input is a sentence without its last word, the
# target is the same sentence shifted left by one. Both lists are rebuilt from
# scratch so that re-running this cell does not append duplicate examples.
X_p = [prepare_sequence(sent[:-1],word2index).view(1,-1) for sent in corpus]
y_p = [prepare_sequence(sent[1:],word2index).view(1,-1) for sent in corpus]

In [45]:
# Pair inputs with targets, shuffle, then hold out the last 10% as the dev set.
data_p = list(zip(X_p,y_p))
random.shuffle(data_p)
split_at = int(len(data_p)*0.9)
train_data, dev_data = data_p[:split_at], data_p[split_at:]

### Modeling 

In [65]:
class LanguageModel(nn.Module):
    """Word-level language model: embedding -> ReLU RNN -> linear -> log-softmax.

    Args:
        vocab_size: number of rows in the embedding table and number of output classes.
        embedding_size: dimensionality of the word embeddings.
        hidden_size: number of hidden units in each RNN layer.
        num_layers: number of stacked RNN layers.
    """
    def __init__(self,vocab_size,embedding_size,hidden_size,num_layers=1):

        super(LanguageModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size,embedding_size)
        self.rnn = nn.RNN(embedding_size,hidden_size,num_layers,nonlinearity='relu',batch_first=True)
        self.linear = nn.Linear(hidden_size,vocab_size)

        self.init_rnn() # IRNN

    def init_rnn(self):
        """Set every RNN weight matrix to the (rectangular) identity and every RNN bias to zero.

        The values are copied into the existing parameters instead of rebinding
        the attributes, so the RNN keeps the parameter objects it registered at
        construction time. All layers are initialised, not just layer 0.
        """
        for layer in range(self.num_layers):
            for kind in ('hh', 'ih'):
                weight = getattr(self.rnn, 'weight_%s_l%d' % (kind, layer))
                weight.data.copy_(torch.eye(weight.size(0), weight.size(1)))
                getattr(self.rnn, 'bias_%s_l%d' % (kind, layer)).data.fill_(0)

    def init_hidden(self,inputs):
        """Return an all-zero initial hidden state of size (num_layers, batch, hidden_size).

        The batch size is taken from ``inputs.size(0)``. The state is created
        from one of the model's own parameters, so it has the same tensor type
        and device as the model, and it is explicitly zero-filled because a
        freshly allocated tensor holds arbitrary memory.
        """
        reference = next(self.parameters()).data
        hidden = reference.new(self.num_layers,inputs.size(0),self.hidden_size).zero_()
        return Variable(hidden)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for every position of ``inputs``.

        ``inputs`` holds word ids of size (B, W). The result has size
        (B*W, vocab_size): batch and time are flattened into the first
        dimension and each row is log-softmax normalised.
        """
        hidden = self.init_hidden(inputs)
        embeds = self.embed(inputs) # BxWxD
        out,hidden = self.rnn(embeds,hidden)
        flat = out.contiguous().view(out.size(0)*out.size(1),-1)
        # dim=1 normalises over the vocabulary; relying on the implicit dim is deprecated.
        return F.log_softmax(self.linear(flat), dim=1)

### Train 

Training takes a while.

In [66]:
# Model dimensions
EMBED_SIZE, HIDDEN_SIZE = 100, 300

# Optimisation settings
LR = 1e-3
BATCH_SIZE = 32
STEP = 5  # number of passes over the training data

In [67]:
model = LanguageModel(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
if USE_CUDA:
    model = model.cuda()
# LanguageModel.forward already returns log-probabilities, so the matching
# criterion is NLLLoss; CrossEntropyLoss would apply log-softmax a second time
# (same loss value, redundant work). Padded target positions are ignored.
loss_function = nn.NLLLoss(ignore_index=word2index['<PAD>'])
optimizer = optim.Adam(model.parameters(),lr=LR)

In [68]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        # Pad the batch to a common width, then stack it into (B, W) tensors.
        batch = pad_to_batch(batch)
        x,y = list(zip(*batch))
        inputs = torch.cat(x)
        targets = torch.cat(y)

        model.zero_grad()
        preds = model(inputs)

        loss = loss_function(preds,targets.view(-1))
        # .cpu() is a no-op for CPU tensors, and reshape(-1) lets the same line
        # read the scalar whether the loss is stored as a 1-element or 0-dim array.
        losses.append(float(loss.data.cpu().numpy().reshape(-1)[0]))
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5) # gradient clipping
        optimizer.step()


        if i % 1000==0:
            # Perplexity is exp(mean loss); exponentiating the sum overflows as
            # soon as more than a handful of batches have been accumulated.
            mean_loss = np.mean(losses)
            print("[%d/%d] mean_loss : %0.2f, perplexity : %0.2f" %(step,STEP,mean_loss,np.exp(mean_loss)))
            losses=[]

[0/5] mean_loss : 9.48, perplexity : 13062.90


KeyboardInterrupt: 

### Test 