In [1]:
import torch
from torchtext import data
import spacy
from spacy.symbols import ORTH
from torchtext.datasets import WikiText2


my_tok = spacy.load('en')
 
def spacy_tok(x):
    return [tok.text for tok in my_tok.tokenizer(x)]
 
TEXT = data.Field(lower=True, tokenize=spacy_tok)


In [2]:
my_tok.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

In [3]:
train, valid, test = WikiText2.splits(TEXT) 

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:00<00:00, 6.52MB/s]


extracting


In [4]:
TEXT.build_vocab(train)

In [5]:
batch_size = 50
bptt = 200

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    bptt_len=bptt, 
    device=device,
    repeat=False)

In [8]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, input_dim, embed_dim, hid_dim, n_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.encoder = nn.Embedding(input_dim, embed_dim)
    
        self.rnn = nn.GRU(embed_dim,
                          hid_dim,
                          n_layers,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.decoder = nn.Linear(hid_dim, input_dim)

        self.drop = nn.Dropout(dropout)
        self.init_weights()


    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs, hidden):
        
        #inputs = [seq len, batch size]
        #hidden = [num layers * num directions, batch size, hid dim]
        
        emb = self.drop(self.encoder(inputs))
        
        #emb = [seq len, batch size, emb dim]
        
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        
        #output = [seq len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        
        decoded = self.decoder(output)
        
        #decoded = [seq len, batch size, vocab size]
        
        decoded = decoded.view(-1, self.input_dim)
        
        #decoded = [seq len * batch size, vocab size]

        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())

        return weight.new_zeros(self.n_layers, bsz, self.hid_dim)

In [9]:
vocab_size = len(TEXT.vocab)
emb_dim = 512
hid_dim = 256
n_layers = 5
dropout = 0.2
eval_batch_size = 10
lr = 4
log_interval = 20

model = RNNModel(vocab_size, emb_dim, hid_dim, n_layers, dropout)


In [10]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 24,371,398 trainable parameters


In [11]:
import torch.optim as optim

criterion = nn.NLLLoss()

In [12]:
model=model.to(device)
criterion=criterion.to(device)

In [13]:
def train(model, iterator, criterion):
    clip = 0.25
    total_loss = 0
    
    model.train()
    
    hidden = model.init_hidden(batch_size)
        
    for k, batch in enumerate(iterator):
        data = batch.text
        targets = batch.target.view(-1)

        data = data.to(device)
        targets = targets.to(device)

        model.zero_grad()
        hidden = hidden.detach()
      
        output, hidden = model(data, hidden)    
        
        loss = criterion(output, targets)
                
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        
        total_loss += loss.item()
        
        if k % log_interval == 0 and k > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch, k, len(iterator), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0


In [14]:
def evaluate(model, iterator, criterion):
    
    total_loss = 0
    
    model.eval()
    
    hidden = model.init_hidden(batch_size)
    
    with torch.no_grad():
    
        for batch in iterator:
            data = batch.text
            targets = batch.target.view(-1)

            data = data.to(device)
            targets = targets.to(device)

            output, hidden = model(data, hidden)
            hidden = hidden.detach()
            
            loss = criterion(output, targets).item()

            total_loss += len(data) * loss

        
    return total_loss / (len(iterator)*bptt - 1)

In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [16]:
N_EPOCHS = 100

best_valid_loss = float('inf')
counter = 0
patience = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train(model, train_iter, criterion)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.2f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'tut2-model.pt')
        counter = 0 
    else:
        lr /= 4.0
        counter += 1
        if counter >= patience:
            break

    

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


| epoch   0 |    20/  224 batches | lr 4 | loss  8.96 | ppl  7758.80
| epoch   0 |    40/  224 batches | lr 4 | loss  7.70 | ppl  2209.14
| epoch   0 |    60/  224 batches | lr 4 | loss  7.65 | ppl  2100.38
| epoch   0 |    80/  224 batches | lr 4 | loss  7.49 | ppl  1793.73
| epoch   0 |   100/  224 batches | lr 4 | loss  7.37 | ppl  1591.25
| epoch   0 |   120/  224 batches | lr 4 | loss  7.36 | ppl  1578.57
| epoch   0 |   140/  224 batches | lr 4 | loss  7.27 | ppl  1443.26
| epoch   0 |   160/  224 batches | lr 4 | loss  7.28 | ppl  1448.60
| epoch   0 |   180/  224 batches | lr 4 | loss  7.24 | ppl  1393.33
| epoch   0 |   200/  224 batches | lr 4 | loss  7.13 | ppl  1247.50
| epoch   0 |   220/  224 batches | lr 4 | loss  7.12 | ppl  1231.92
Epoch: 01 | Epoch Time: 2m 36s
	 Val. Loss: 6.427 |  Val. PPL: 618.27
| epoch   1 |    20/  224 batches | lr 4 | loss  7.44 | ppl  1696.79
| epoch   1 |    40/  224 batches | lr 4 | loss  7.00 | ppl  1101.59
| epoch   1 |    60/  224 batches

In [17]:
# Run on test data.
test_loss = evaluate(model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  4.15 | test ppl    63.31


In [50]:
def generate(prime_str, predict_len = 20, temperature = 0.7):
    hidden = model.init_hidden(1)
    inputs_list = [TEXT.vocab.stoi[k] for k in prime_str.split()]
    inputs = torch.tensor(inputs_list, dtype=torch.long).unsqueeze(1).to(device)
    # Use priming string to "build up" hidden state
    words = prime_str.split()
    for p in range(len(inputs_list) - 1):
        _, hidden = model(inputs[p].unsqueeze(1), hidden)

    inp = inputs[-1].unsqueeze(1)
    for i in range(predict_len):
        output, hidden = model(inp, hidden)
        word_weights = output.squeeze().div(temperature).exp().cpu()
        # word_weights = word_weights[-1]
        word_idx = torch.multinomial(word_weights, 1)[0]
        inp.fill_(word_idx)
        word = TEXT.vocab.itos[word_idx]
        words.append(word)
    return  ' '.join(words)

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    word_indexes = [TEXT.vocab.stoi[k] for k in line.split()]
    return torch.tensor(word_indexes).unsqueeze(1).to(device)

import numpy as np

def softmax(x):
    """ applies softmax to an input x"""
    e_x = np.exp(x)
    return e_x / e_x.sum()

def greedy_search(prime_str, predict_len = 20):
    inputs = inputTensor(prime_str)
    hidden = model.init_hidden(1).to(device)
    input_list = prime_str.split()
    # Use priming string to "build up" hidden state
    with torch.no_grad():
        for p in range(len(input_list)-1):
            _, hidden = model(inputs[p].unsqueeze(1), hidden)
        inputs = inputs[-1].unsqueeze(1).to(device)

        prob = []
        for i in range(predict_len):
            output, hidden = model(inputs, hidden)
            output_np = output.data.cpu().numpy()[0]
            all_probs = softmax(output_np)
            
            topv, topp = output.topk(1)

            topi = topp.item()
            prob.append(all_probs[topi])
            if topi == 1:
                inputs.fill_(topi)
                word = TEXT.vocab.itos[topi]
                input_list.append(word)
                break   
            else:
                inputs.fill_(topi)
                word = TEXT.vocab.itos[topi]
                input_list.append(word)
    return ' '.join(input_list), prob, np.prod(prob)


def random_choice(prime_str, top_k = 5, predict_len = 20):
    inputs = inputTensor(prime_str)
    hidden = model.init_hidden(1).to(device)
    input_list = prime_str.split(',')
    # Use priming string to "build up" hidden state
    with torch.no_grad():
        for p in range(len(input_list)-1):
            _, hidden = model(inputs[p].unsqueeze(1), hidden)
        inputs = inputs[-1].unsqueeze(1).to(device)

        prob = []
        for i in range(predict_len):
            output, hidden = model(inputs, hidden)
            output_np = output.data.cpu().numpy()[0]
            all_probs = softmax(output_np)
            
            _, topps = output.topk(top_k)
            choices = topps.tolist()
            topi = np.random.choice(choices[0])
            prob.append(all_probs[topi])
            if topi == 1:
                inputs.fill_(topi)
                word = TEXT.vocab.itos[topi]
                input_list.append(word)
                break   
            else:
                inputs.fill_(topi)
                word = TEXT.vocab.itos[topi]
                input_list.append(word)
    return ' '.join(input_list), prob, np.prod(prob)

In [67]:
i = 'my'
generate(i, temperature=0.8)

'my girl , was become in one of the design of the family . the director of the wall was in'

In [65]:
greedy_search(i)

('my < unk > , and the < unk > of the < unk > , which is a < unk',
 [0.061742943,
  0.99999887,
  0.99999106,
  0.12676093,
  0.13301022,
  0.17028119,
  0.092724964,
  0.9999933,
  0.9999985,
  0.16161998,
  0.43439707,
  0.119035535,
  0.9999957,
  0.9999975,
  0.12628525,
  0.09518967,
  0.1424818,
  0.0666404,
  0.07087717,
  0.99997914],
 1.1112343e-12)

In [68]:
random_choice(i)

('my own name for " ode . in a <   , and a man " and a " white ,',
 [0.009784639,
  0.011308314,
  0.07020123,
  0.05974017,
  0.015353499,
  0.020234983,
  0.016850049,
  0.06322432,
  0.09004022,
  1.9562922e-06,
  0.00025247503,
  0.04140955,
  0.14365011,
  0.00981843,
  0.053170785,
  0.039070062,
  0.13081944,
  0.09925737,
  0.006800825,
  0.008428886],
 6.168374e-37)