In [1]:
import torch
from torchtext import data
import spacy
from spacy.symbols import ORTH
from torchtext.datasets import WikiText2

my_tok = spacy.load('en')
 
def spacy_tok(x):
    return [tok.text for tok in my_tok.tokenizer(x)]
 
TEXT = data.Field(lower=True, tokenize=spacy_tok)

In [2]:
my_tok.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])


In [3]:
train, valid, test = WikiText2.splits(TEXT) 

wikitext-2-v1.zip:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:00<00:00, 38.8MB/s]


extracting


In [4]:
TEXT.build_vocab(train)

In [5]:
batch_size = 50
bptt = 200

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    bptt_len=bptt, 
    device=device,
    repeat=False)

In [8]:
import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F


def create_sinusoidal_embeddings(embeds):
    position_enc = torch.tensor([
        [pos / np.power(10000, 2 * (j // 2) / embeds.embedding_dim) for j in range(embeds.embedding_dim)]
                                                                    for pos in range(embeds.num_embeddings)])
    embeds.weight[:, 0::2] = torch.sin(position_enc[:, 0::2])
    embeds.weight[:, 1::2] = torch.cos(position_enc[:, 1::2])
    embeds.weight.detach_()
    embeds.weight.requires_grad = False


class Transformer(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout,
                 sinusoidal_embeddings, causal=False):
        """ Transformer (GPT-2 architecture) """
        super().__init__()
        self.causal = causal
        self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
        self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
        if sinusoidal_embeddings:
            create_sinusoidal_embeddings(self.position_embeddings)
        self.dropout = nn.Dropout(dropout)

        self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
        self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
        for _ in range(num_layers):
            self.attentions.append(nn.MultiheadAttention(embed_dim, num_heads, dropout = dropout))
            self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                    nn.ReLU(),
                                                    nn.Linear(hidden_dim, embed_dim)))
            self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
            self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))

    def forward(self, x, padding_mask = None):
        """ Input has shape [seq length, batch] """
        positions = torch.arange(len(x), device=x.device).unsqueeze(-1)
        h = self.tokens_embeddings(x)
        h = h + self.position_embeddings(positions).expand_as(h)
        h = self.dropout(h)

        attn_mask = None
        if self.causal:
            attn_mask = torch.full((len(x), len(x)), -float('Inf'), device=h.device, dtype=h.dtype)
            attn_mask = torch.triu(attn_mask, diagonal = 1)
        for layer_norm_1, attention, layer_norm_2, feed_forward in zip(self.layer_norms_1, self.attentions,
                                                                       self.layer_norms_2, self.feed_forwards):
            h = layer_norm_1(h)
            x, _ = attention(h, h, h, attn_mask = attn_mask, need_weights = False, key_padding_mask = padding_mask)
            x = self.dropout(x)
            h = x + h

            h = layer_norm_2(h)
            x = feed_forward(h)
            x = self.dropout(x)
            h = x + h
        return h


class TransformerWithLMHead(nn.Module):
    def __init__(self, embed_dim, 
                 hidden_dim, 
                 num_embeddings,
                 num_max_positions, 
                 num_heads, 
                 num_layers,
                 dropout, 
                 sinusoidal_embeddings, 
                 mlm,
                 initializer_range):
      
        """ Transformer with a language modeling head on top (tied weights) """

        super().__init__()
        self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings,
                                       num_max_positions, num_heads, num_layers,
                                       dropout, sinusoidal_embeddings, causal=not mlm)
        self.lm_head = nn.Linear(embed_dim, num_embeddings, bias=False)
        self.apply(self.init_weights)
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.transformer.tokens_embeddings.weight

    def init_weights(self, module):
        """ initialize weights - note that nn.MultiheadAttention is already initalized by PyTorch (xavier_uniform) """
        if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
            module.weight.data.normal_(mean=0.0, std=initializer_range)
        if isinstance(module, (nn.Linear, nn.LayerNorm)) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, x, padding_mask=None):
        """ Input has shape [seq length, batch] """
        hidden_states = self.transformer(x, padding_mask)
        logits = self.lm_head(hidden_states)

        logits = F.log_softmax(logits, dim=-1)

        return logits

In [9]:
embed_dim = 128
hidden_dim = 128
num_embeddings = len(TEXT.vocab.itos)
num_max_positions = 500
num_heads = 4
num_layers = 4
dropout = 0.1
sinusoidal_embeddings = True
mlm = True
causal = not mlm
initializer_range = 0.02
# lr = 2.5e-4
lr = 4
# weight_decay = 0.0
# gradient_accumulation_steps = 1
# max_norm = 0.25
log_interval = 20

model = TransformerWithLMHead(embed_dim, 
                 hidden_dim, 
                 num_embeddings,
                 num_max_positions, 
                 num_heads, 
                 num_layers,
                 dropout, 
                 sinusoidal_embeddings, 
                 mlm,
                 initializer_range)

In [10]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,093,696 trainable parameters


In [11]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = lr)

criterion = nn.NLLLoss()
model=model.to(device)
criterion = criterion.to(device)


In [13]:
def train(model, iterator, criterion):
    clip = 0.25
    total_loss = 0
    
    model.train()
            
    for k, batch in enumerate(iterator):
        data = batch.text
        targets = batch.target.view(-1)

        data = data.to(device)
        targets = targets.to(device)

        model.zero_grad()
      
        output = model(data) 

        output = output.view(-1, num_embeddings)
        
        loss = criterion(output, targets)
                
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # optimizer.step()
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        
        total_loss += loss.item()
        
        if k % log_interval == 0 and k > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch, k, len(iterator), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0


In [14]:
def evaluate(model, iterator, criterion):
    
    total_loss = 0
    
    model.eval()
        
    with torch.no_grad():
    
        for batch in iterator:
            data = batch.text
            targets = batch.target.view(-1)

            data = data.to(device)
            targets = targets.to(device)

            output = model(data)

            output = output.view(-1, num_embeddings)
            
            loss = criterion(output, targets).item()

            total_loss += len(data) * loss

        
    return total_loss / (len(iterator)*bptt - 1)

In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [16]:
N_EPOCHS = 100

best_valid_loss = float('inf')
counter = 0
patience = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train(model, train_iter, criterion)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.2f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'tut2-model.pt')
        counter = 0 
    else:
        lr /= 4.0
        counter += 1
        if counter >= patience:
            break

    

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


AttributeError: ignored

In [None]:
# Run on test data.
test_loss = evaluate(model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

NameError: ignored