In [1]:
import torch
from torchtext import data
import spacy
from spacy.symbols import ORTH
from torchtext.datasets import WikiText2

my_tok = spacy.load('en')
 
def spacy_tok(x):
    return [tok.text for tok in my_tok.tokenizer(x)]
 
TEXT = data.Field(lower=True, tokenize=spacy_tok)


In [2]:
my_tok.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

In [3]:
train, valid, test = WikiText2.splits(TEXT)

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:01<00:00, 2.62MB/s]


extracting


In [4]:
TEXT.build_vocab(train)

In [5]:
batch_size = 50
bptt = 200

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    bptt_len=bptt, 
    device=device,
    repeat=False)

In [9]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len = bptt):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask = True):
      
        #src = [seq len, batch size]

        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.encoder(src) * math.sqrt(self.ninp)

        #src = [seq len, batch size, emb_dim]
        
        src = self.pos_encoder(src)
        
        #src = [seq len, batch size, emb_dim]
        
        hidden_state = self.transformer_encoder(src, self.src_mask)
        
        #output = [seq len, batch size, emb_dim]
        
        output = self.decoder(hidden_state)
        
        #output = [seq len, batch size, vocab_size]
        
        return F.log_softmax(output, dim=-1), hidden_state

In [10]:
vocab_size = len(TEXT.vocab)
# emb_dim = 256
# hid_dim = 256
# n_layers = 10
# n_heads = 8
# dropout = 0.1

emb_dim = 128
hid_dim = 128
n_layers = 4
n_heads = 4
dropout = 0.1

lr = 4
log_interval = 20

model = TransformerModel(vocab_size, emb_dim, n_heads, hid_dim, n_layers, dropout)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,817,926 trainable parameters


In [12]:
import torch.optim as optim

criterion = nn.NLLLoss()

In [13]:
model=model.to(device)
criterion=criterion.to(device)

In [14]:
def train(model, iterator, criterion):
    clip = 0.25
    total_loss = 0
    
    model.train()
            
    for k, batch in enumerate(iterator):
        data = batch.text
        targets = batch.target.view(-1)

        data = data.to(device)
        targets = targets.to(device)

        model.zero_grad()
      
        output, hidden = model(data) 

        output = output.view(-1, vocab_size)
        
        loss = criterion(output, targets)
                
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        
        total_loss += loss.item()
        
        if k % log_interval == 0 and k > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch, k, len(iterator), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0


In [15]:
def evaluate(model, iterator, criterion):
    
    total_loss = 0
    
    model.eval()
        
    with torch.no_grad():
    
        for batch in iterator:
            data = batch.text
            targets = batch.target.view(-1)

            data = data.to(device)
            targets = targets.to(device)

            output, hidden = model(data)

            output = output.view(-1, vocab_size)
            
            loss = criterion(output, targets).item()

            total_loss += len(data) * loss

        
    return total_loss / (len(iterator)*bptt - 1)

In [16]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [17]:
N_EPOCHS = 100

best_valid_loss = float('inf')
counter = 0
patience = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train(model, train_iter, criterion)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.2f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'tut2-model.pt')
        counter = 0 
    else:
        lr /= 4.0
        counter += 1
        if counter >= patience:
            break

    

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


| epoch   0 |    20/  224 batches | lr 4 | loss  8.63 | ppl  5598.79
| epoch   0 |    40/  224 batches | lr 4 | loss  7.38 | ppl  1599.76
| epoch   0 |    60/  224 batches | lr 4 | loss  7.18 | ppl  1310.59
| epoch   0 |    80/  224 batches | lr 4 | loss  6.99 | ppl  1086.52
| epoch   0 |   100/  224 batches | lr 4 | loss  6.88 | ppl   977.07
| epoch   0 |   120/  224 batches | lr 4 | loss  6.81 | ppl   904.11
| epoch   0 |   140/  224 batches | lr 4 | loss  6.65 | ppl   769.73
| epoch   0 |   160/  224 batches | lr 4 | loss  6.63 | ppl   756.54
| epoch   0 |   180/  224 batches | lr 4 | loss  6.58 | ppl   723.27
| epoch   0 |   200/  224 batches | lr 4 | loss  6.51 | ppl   673.17
| epoch   0 |   220/  224 batches | lr 4 | loss  6.51 | ppl   671.33
Epoch: 01 | Epoch Time: 0m 39s
	 Val. Loss: 5.856 |  Val. PPL: 349.18
| epoch   1 |    20/  224 batches | lr 4 | loss  6.76 | ppl   862.95
| epoch   1 |    40/  224 batches | lr 4 | loss  6.36 | ppl   578.93
| epoch   1 |    60/  224 batches

In [18]:
# Run on test data.
test_loss = evaluate(model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  4.10 | test ppl    60.04


In [19]:
import numpy as np


def inputTensor(line):
    word_indexes = [TEXT.vocab.stoi[k] for k in line.split()]
    return torch.tensor(word_indexes).unsqueeze(1).to(device)

def generate(prime_str, predict_len = 20, temperature = 0.7):
    input = inputTensor(prime_str)
    words = prime_str.split()

    with torch.no_grad():
      for i in range(predict_len):
          output, hid = model(input)
          word_weights = output[-1].squeeze().div(temperature).exp().cpu()
          word_idx = torch.multinomial(word_weights, 1)[0]
          word_tensor = torch.Tensor([[word_idx]]).long().to(device)
          input = torch.cat([input, word_tensor], 0)
          word = TEXT.vocab.itos[word_idx]
          words.append(word)
      return ' '.join(words), hid

def greedy_search(prime_str, predict_len = 20):
    input = inputTensor(prime_str)
    words = prime_str.split()

    with torch.no_grad():
      for i in range(predict_len):
          output = model(input)
          topv, top = output[-1].topk(1)
          word_idx = top.item()
          word_tensor = torch.Tensor([[word_idx]]).long().to(device)
          input = torch.cat([input, word_tensor], 0)
          word = TEXT.vocab.itos[word_idx]
          words.append(word)
      return ' '.join(words)


def random_choice(prime_str, top_k = 5, predict_len = 20):
    input = inputTensor(prime_str)
    words = prime_str.split()

    with torch.no_grad():
      for i in range(predict_len):
          output = model(input)
          topv, top = output[-1].topk(top_k)
          choices = top.tolist()
          word_idx = np.random.choice(choices[0])
          word_tensor = torch.Tensor([[word_idx]]).long().to(device)
          input = torch.cat([input, word_tensor], 0)
          word = TEXT.vocab.itos[word_idx]
          words.append(word)
      return ' '.join(words)




In [20]:
input_str = 'hello my name is'

In [22]:
o,h=generate(input_str, temperature=0.1)

In [23]:
o

'hello my name is a < unk > , and the < unk > of the < unk > of the < unk >'

In [25]:
h.shape

torch.Size([23, 1, 128])

In [None]:
greedy_search(input_str)

In [None]:
random_choice(input_str)

In [146]:
a = 'hello'
b = 'hi'

In [147]:
a_t = inputTensor(a)
b_t = inputTensor(b)

In [148]:
_,h1 = model(a_t)
_,h2 = model(b_t)

In [149]:
h1.shape, h2.shape

(torch.Size([1, 1, 128]), torch.Size([1, 1, 128]))

In [150]:
h1 = h1.mean(dim=0) 
h2 = h2.mean(dim=0)

In [151]:
h1 = h1.squeeze()
h2 = h2.squeeze()

In [152]:
h1 = h1.detach().cpu().numpy()
h2 = h2.detach().cpu().numpy()

In [153]:
import numpy as np

In [154]:
np.dot(h1,h2)/(np.linalg.norm(h1)*np.linalg.norm(h2))

0.35752913