In [None]:
import torch
from torchtext import data
import spacy
from spacy.symbols import ORTH
from torchtext.datasets import WikiText2

# Load the spaCy English pipeline; only its tokenizer is used below.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize the string ``x`` with the spaCy tokenizer and return the token texts as a list."""
    pieces = []
    for piece in my_tok.tokenizer(x):
        pieces.append(piece.text)
    return pieces


# Text field configured with the spaCy tokenizer above and lower=True.
TEXT = data.Field(tokenize=spacy_tok, lower=True)


In [None]:
# Register a tokenizer special case so that "don't" is split into the two tokens "do" and "n't".
my_tok.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

In [None]:
# Load the WikiText2 train/validation/test splits using the TEXT field.
# NOTE(review): the name `train` is rebound further down by `def train(...)`; the dataset
# object is only reachable under this name until that cell runs.
train, valid, test = WikiText2.splits(TEXT)

In [None]:
# Build the vocabulary from the training split only.
TEXT.build_vocab(train)

In [None]:
# Batch size handed to the BPTT iterators.
batch_size = 50
# Passed to the iterators as bptt_len; also the default max_length of the model's positional embedding.
bptt = 200

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Build one BPTT iterator per split; the batches expose `.text` and `.target`,
# which the train/evaluate functions below consume.
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    bptt_len=bptt, 
    device=device,
    repeat=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionwiseFeedforwardLayer(nn.Module):
    """Two linear layers with a ReLU and dropout in between, applied to the last dimension.

    Args:
        hid_dim: size of the input and output features.
        pf_dim: size of the intermediate representation.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        # hid_dim -> pf_dim, then pf_dim -> hid_dim
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc_2(dropout(relu(fc_1(x))))``; the output has the same shape as ``x``."""
        # x = [batch size, seq len, hid dim]
        expanded = self.fc_1(x)

        # activated = [batch size, seq len, pf dim]
        activated = self.dropout(F.relu(expanded))

        # result = [batch size, seq len, hid dim]
        return self.fc_2(activated)

class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: feature size of queries, keys, values and of the output;
            must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: probability for the ``dropout`` sub-module (created but not
            applied in ``forward``, see below).
        device: device on which the scaling constant is placed.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        # input projections for queries, keys and values
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        # output projection applied after the heads are merged
        self.fc_o = nn.Linear(hid_dim, hid_dim)

        # Kept as an attribute, but currently not used in forward:
        # dropout on the attention weights was removed as an experiment.
        self.dropout = nn.Dropout(dropout)

        # sqrt(head_dim), the divisor of the dot products
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def _split_heads(self, projected, batch_size):
        """Reshape [batch size, len, hid dim] into [batch size, n heads, len, head dim]."""
        per_head = projected.view(batch_size, -1, self.n_heads, self.head_dim)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: [batch size, query len, hid dim]
            key: [batch size, key len, hid dim]
            value: [batch size, value len, hid dim]
            mask: optional tensor broadcastable to the energy tensor
                [batch size, n heads, query len, key len]; entries equal to 0
                are filled with -1e10 before the softmax.

        Returns:
            A pair ``(output, attention)`` with output of shape
            [batch size, query len, hid dim] and attention of shape
            [batch size, n heads, query len, key len].
        """
        batch_size = query.shape[0]

        # project, then split into heads: [batch size, n heads, len, head dim]
        Q = self._split_heads(self.fc_q(query), batch_size)
        K = self._split_heads(self.fc_k(key), batch_size)
        V = self._split_heads(self.fc_v(value), batch_size)

        # energy = [batch size, n heads, query len, key len]
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        # attention = [batch size, n heads, query len, key len]
        attention = torch.softmax(energy, dim = -1)

        # Experiment: dropout on the attention weights is intentionally disabled here;
        # the previous form was torch.matmul(self.dropout(attention), V).
        context = torch.matmul(attention, V)

        # merge the heads again:
        # [batch size, n heads, query len, head dim] -> [batch size, query len, hid dim]
        merged = context.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.hid_dim)

        # final projection keeps the shape [batch size, query len, hid dim]
        return self.fc_o(merged), attention



class Transformer(nn.Module):
    """A single transformer block.

    Self-attention followed by a position-wise feed-forward network; each
    sub-layer output goes through dropout, is added to its input, and the sum
    is layer-normalised.

    Args:
        hid_dim: feature size of the block's input and output.
        n_heads: number of attention heads.
        pf_dim: inner size of the feed-forward network.
        dropout: dropout probability used throughout the block.
        device: forwarded to the attention layer.
    """

    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()

        self.self_att_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Apply the block to ``src``.

        Args:
            src: [batch size, src len, hid dim]
            src_mask: attention mask handed unchanged to the self-attention
                layer; it must broadcast against
                [batch size, n heads, src len, src len].

        Returns:
            Tensor of shape [batch size, src len, hid dim].
        """
        # self-attention sub-layer (attention weights are discarded)
        attended, _ = self.self_attention(src, src, src, src_mask)

        # dropout -> residual -> layer norm
        src = self.self_att_layer_norm(src + self.dropout(attended))

        # feed-forward sub-layer, same dropout -> residual -> layer norm pattern
        fed_forward = self.positionwise_feedforward(src)

        return self.ff_layer_norm(src + self.dropout(fed_forward))


class TransformerWithLMHead(nn.Module):
    """Transformer language model: embeddings, a stack of blocks and a vocabulary head.

    Token embeddings (scaled by sqrt(emb_dim)) are summed with learned
    positional embeddings, passed through ``n_layers`` ``Transformer`` blocks
    and projected to log-probabilities over the vocabulary.

    Attention is causal: position ``i`` may only attend to positions ``<= i``.
    Without this restriction the model could read the token it is asked to
    predict, because the training targets are the inputs shifted by one.

    Args:
        input_dim: vocabulary size (embedding rows and output classes).
        emb_dim: embedding size, also the feature size of every block.
        n_layers: number of ``Transformer`` blocks.
        n_heads: number of attention heads per block.
        hid_dim: inner feed-forward size of every block.
        dropout: dropout probability.
        device: device for the scaling constants.
        max_length: number of positions in the positional embedding table;
            longer inputs are rejected.
        pad_idx: token index that is masked out as an attention key.
            The default 0 reproduces the previous hard-coded behaviour.
            NOTE(review): torchtext vocabularies usually map 0 to <unk> and
            1 to <pad> - confirm and pass the real padding index explicitly.
    """

    def __init__(self, 
                 input_dim, 
                 emb_dim, 
                 n_layers, 
                 n_heads, 
                 hid_dim,
                 dropout, 
                 device,
                 max_length = bptt,
                 pad_idx = 0):
        super().__init__()

        self.device = device
        self.max_length = max_length
        self.pad_idx = pad_idx
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.layers = nn.ModuleList([Transformer(emb_dim, 
                                                  n_heads, 
                                                  hid_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([emb_dim])).to(device)
        self.fc = nn.Linear(emb_dim, input_dim)

    def make_src_mask(self, src):
        """Return a boolean key mask that is False where ``src`` equals ``pad_idx``.

        src is [batch size, src len]; the result is [batch size, 1, 1, src len].
        """
        return (src != self.pad_idx).unsqueeze(1).unsqueeze(2)

    def make_causal_mask(self, src_len, device):
        """Return a [src len, src len] boolean lower-triangular mask (diagonal included)."""
        return torch.tril(torch.ones(src_len, src_len, device=device)).bool()

    def forward(self, src):
        """Compute next-token log-probabilities.

        Args:
            src: token indices of shape [batch size, src len].

        Returns:
            Log-probabilities of shape [src len, batch size, vocab size].

        Raises:
            ValueError: if ``src len`` exceeds ``max_length``.
        """
        #src = [batch size, src len]
        src_len = src.shape[1]

        if src_len > self.max_length:
            raise ValueError(
                f'sequence length {src_len} exceeds max_length {self.max_length}')

        # Combine the key mask with the causal mask. The diagonal is always kept:
        # a fully masked row would be filled with one constant and softmax to a
        # uniform distribution over all positions, future ones included.
        causal = self.make_causal_mask(src_len, src.device)
        diagonal = torch.eye(src_len, device=src.device).bool()
        src_mask = (self.make_src_mask(src) & causal) | diagonal

        #src_mask = [batch size, 1, src len, src len]

        # positions are created directly on the input's device and broadcast over the batch
        pos = torch.arange(src_len, device=src.device).unsqueeze(0)

        #pos = [1, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
                
        #src = [batch size, src len, emb dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, emb dim]

        # sequence-first layout so that flattening matches targets of shape [src len, batch size]
        out = self.fc(src.transpose(1, 0))

        # out = [src len, batch size, vocab_size]
            
        return F.log_softmax(out, dim=-1)

In [None]:
# Model hyper-parameters.
vocab_size = len(TEXT.vocab)  # passed as input_dim: embedding rows and output classes
emb_dim = 128                 # embedding size, also the feature size of every block
hid_dim = 128                 # passed as the inner feed-forward size (pf_dim) of every block
n_layers = 4
n_heads = 4
dropout = 0.1

# Training settings read as globals by train().
lr = 4             # step size of the manual SGD update; divided by 4 when validation loss does not improve
log_interval = 20  # number of batches between progress lines

model = TransformerWithLMHead(vocab_size, emb_dim, n_layers, n_heads, hid_dim, dropout, device)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters of ``model`` that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the number of trainable parameters of the model built above.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,843,526 trainable parameters


In [None]:
import torch.optim as optim

# The model's forward ends in log_softmax, so the matching loss is NLLLoss.
criterion = nn.NLLLoss()

In [None]:
# Move the model and the loss module to the selected device.
model=model.to(device)
criterion=criterion.to(device)

In [None]:
def train(model, iterator, criterion):
    """Train ``model`` for one pass over ``iterator`` with manual SGD.

    Each batch must expose ``text`` and ``target`` tensors; ``text`` is
    transposed before it is fed to the model and ``target`` is flattened.
    Gradients are clipped to a total norm of 0.25 and every parameter is then
    updated as ``p <- p - lr * grad``.

    Reads the notebook-level names ``device``, ``lr`` and ``log_interval``,
    plus ``epoch`` and ``math`` when a progress line is printed.

    Args:
        model: module mapping the transposed text to log-probabilities whose
            last dimension is the number of classes.
        iterator: sized iterable of batches.
        criterion: loss called as ``criterion(output, targets)``.
    """
    clip = 0.25
    total_loss = 0
    batches_since_log = 0
    
    model.train()
            
    for k, batch in enumerate(iterator):
        inputs = batch.text.transpose(1, 0).to(device)
        targets = batch.target.view(-1).to(device)

        model.zero_grad()
      
        output = model(inputs)

        # flatten all leading dimensions; the class count is taken from the output itself
        output = output.reshape(-1, output.size(-1))
        
        loss = criterion(output, targets)
                
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Plain SGD step. The keyword form add_(other, alpha=...) replaces the
        # deprecated positional add_(alpha, other) overload.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:  # parameters that took no part in the loss have no gradient
                    p.add_(p.grad, alpha=-lr)
        
        total_loss += loss.item()
        batches_since_log += 1
        
        if k % log_interval == 0 and k > 0:
            # average over the batches actually accumulated (the first window also contains batch 0)
            cur_loss = total_loss / batches_since_log
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch, k, len(iterator), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            batches_since_log = 0


In [None]:
def evaluate(model, iterator, criterion):
    """Return the loss of ``model`` over ``iterator``, averaged over sequence positions.

    Each batch must expose ``text`` and ``target`` tensors; ``text`` is
    transposed before it is fed to the model and ``target`` is flattened.
    Every batch loss is weighted by that batch's sequence length and the sum
    is divided by the total length actually seen, so a shorter final batch is
    accounted for exactly.

    Reads the notebook-level name ``device``.

    Args:
        model: module mapping the transposed text to log-probabilities whose
            last dimension is the number of classes.
        iterator: iterable of batches.
        criterion: loss called as ``criterion(output, targets)``; the weighting
            above assumes it returns a per-batch mean.

    Returns:
        The weighted mean loss as a float, or NaN when the iterator yields no
        positions.
    """
    total_loss = 0.0
    total_len = 0
    
    model.eval()
        
    with torch.no_grad():
    
        for batch in iterator:
            inputs = batch.text.transpose(1, 0).to(device)
            targets = batch.target.view(-1).to(device)

            # after the transpose, dimension 1 is the sequence length of this batch
            len_data = inputs.shape[1]

            output = model(inputs)

            # flatten all leading dimensions; the class count is taken from the output itself
            output = output.reshape(-1, output.size(-1))
            
            loss = criterion(output, targets).item()

            total_loss += len_data * loss
            total_len += len_data

    if total_len == 0:
        # nothing was evaluated; NaN avoids reporting a misleading loss
        return float('nan')

    return total_loss / total_len

In [None]:
import time
import math

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        A tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # what remains once the whole minutes are removed
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Training driver: up to N_EPOCHS epochs with learning-rate decay and early stopping.
N_EPOCHS = 100

best_valid_loss = float('inf')
counter = 0   # consecutive epochs without a new best validation loss
patience = 5  # stop once `counter` reaches this value

# `epoch` and `lr` are read as globals inside train(), so these names must not be changed here.
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train(model, train_iter, criterion)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.2f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Checkpointing is disabled, so the model used after this loop is the
        # last-epoch model rather than the best one:
        #torch.save(model.state_dict(), 'tut2-model.pt')
        counter = 0 
    else:
        # no improvement: shrink the step size and count towards early stopping
        lr /= 4.0
        counter += 1
        if counter >= patience:
            break

    

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


| epoch   0 |    20/  224 batches | lr 4 | loss  8.47 | ppl  4792.34
| epoch   0 |    40/  224 batches | lr 4 | loss  7.28 | ppl  1445.63
| epoch   0 |    60/  224 batches | lr 4 | loss  7.09 | ppl  1201.14
| epoch   0 |    80/  224 batches | lr 4 | loss  6.91 | ppl   998.39
| epoch   0 |   100/  224 batches | lr 4 | loss  6.80 | ppl   898.79
| epoch   0 |   120/  224 batches | lr 4 | loss  6.74 | ppl   845.47
| epoch   0 |   140/  224 batches | lr 4 | loss  6.64 | ppl   762.49
| epoch   0 |   160/  224 batches | lr 4 | loss  6.60 | ppl   733.70
| epoch   0 |   180/  224 batches | lr 4 | loss  6.54 | ppl   693.71
| epoch   0 |   200/  224 batches | lr 4 | loss  6.46 | ppl   640.95
| epoch   0 |   220/  224 batches | lr 4 | loss  6.47 | ppl   644.66
Epoch: 01 | Epoch Time: 1m 16s
	 Val. Loss: 5.669 |  Val. PPL: 289.86
| epoch   1 |    20/  224 batches | lr 4 | loss  6.73 | ppl   838.76
| epoch   1 |    40/  224 batches | lr 4 | loss  6.35 | ppl   571.33
| epoch   1 |    60/  224 batches

KeyboardInterrupt: ignored

In [None]:
# Run on test data and report the loss together with its exponential (perplexity).
# This evaluates the model in its current state; no saved checkpoint is reloaded.
test_loss = evaluate(model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

In [None]:
# Take the first training batch for manual inspection in the cells below.
batch = next(iter(train_iter))

In [None]:
# Inspect the batch in its original layout, with the targets flattened.
# NOTE(review): assigning to `data` rebinds the notebook-level name that the first cell
# imported as the torchtext `data` module; re-running the cell that calls
# `data.BPTTIterator.splits` after this one will fail.
data = batch.text
targets = batch.target
targets = targets.view(-1)

In [None]:
# Display the batch text tensor assigned in the previous cell.
data

In [None]:
# Display the flattened targets.
targets

In [None]:
# Same batch with both tensors transposed (dimensions 0 and 1 swapped),
# to compare against the untransposed layout inspected above.
data1 = batch.text
targets1 = batch.target
data1 = data1.transpose(1,0)
targets1 = targets1.transpose(1,0)
# Flattening is done in a later cell via targets1.flatten() instead:
# targets1 = targets1.view(-1)

In [None]:
# Display the transposed text tensor.
data1

In [None]:
# Display the transposed targets flattened to one dimension.
targets1.flatten()