In [71]:
import spacy
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torch.nn as nn
import torch.optim as optim
import random
import math
import time
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import torch
from utils import translate_sentence, bleu, load_checkpoint, save_checkpoint

In [72]:
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [73]:
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

In [74]:
english = Field(tokenize=tokenize_eng, lower=True, init_token='<sos>', eos_token='<eos>')
german = Field(tokenize=tokenize_ger, lower=True, init_token='<sos>', eos_token='<eos>')

In [75]:
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))

In [76]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [80]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_prob):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_size = embedding_size
        
        self.dropout = nn.Dropout(drop_prob)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_prob)
    
    
    def forward(self, x):
        # x shape: (seq_length, N) ---> has N batches each of seq_length
        
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size) ---> Each of the word has an embedding of size in embedding_size
        
        outputs, (hidden, cell) = self.lstm(embedding)
        # output shape: [seq_length, batch_size, hid_dim * n_directions ]
        # hidden shape: [n_layers * n_direction, batch_size, hid_dim ]
        # cell state shape: [n_layers * n_direction, batch_size, hid_dim ]
        
        return hidden, cell

In [81]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, drop_prob):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        
        self.dropout = nn.Dropout(drop_prob)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_prob)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        #x = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        x = x.unsqueeze(0)
        
        # x = [1, batch Size]
        
        embedding = self.dropout(self.embedding(x))
        
        # embedding = [1, batch size, embedding size]
        
        output, (hidden, cell) = self.lstm(embedding, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        predictions = self.fc(output)
        
        # predictions = [1, batch_size, output_dim]
        
        predictions = predictions.squeeze(0)
        
        # predictions = [batch_size, output_dim]
        
        return predictions, hidden, cell
        
        

In [82]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hidden_size == decoder.hidden_size, \
            "Hidden size of encoder and decoder does not match"
        assert encoder.num_layers == decoder.num_layers, \
            "Num layers of encoder and decoder does not match"
        
    def forward(self, source, target, teacher_forcing_ratio = 0.5):
         #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = source.shape[1]
        trg_len = target.shape[0]
        target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(trg_len, batch_size, target_vocab_size).to(device)
        
        hidden, cell = self.encoder(source)
        
        # get the first token for the decoder <sos> across all batches
        x = target[0, :] 
        
        for t in range(trg_len):
            
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(x, hidden, cell)
            # output  = [batch size, output_dim]
            
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            # get the best guess using argmax for each of the batch
            top_one = output.argmax(1)
            
            # set the next input conditioning on teacher forcing usage or not
            x = target[t] if random.random() < teacher_forcing_ratio else top_one
        
        return outputs
       
        

In [83]:
# Training Hyperparameters
num_epochs = 20
learing_rate = 0.01
batch_size = 64

# Model Hyperparameters
load_model = False
device = torch.device('cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# TensorBoard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

In [84]:

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
            (train_data, validation_data, test_data),
                batch_size=batch_size, 
                sort_within_batch=True,
                sort_key = lambda x: len(x.src), 
                device=device)

In [85]:
# defining the model
encoder = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder, decoder, device).to(device)

In [86]:
for name, param in model.named_parameters():
    print(name, param)

encoder.embedding.weight Parameter containing:
tensor([[-1.3564,  1.1339,  0.7082,  ..., -0.6823, -1.6562, -0.7343],
        [ 0.5285, -0.6477,  1.0205,  ..., -0.0385, -0.5533,  0.3937],
        [-0.8782, -1.7481,  1.0477,  ..., -0.7103,  0.1289, -0.4427],
        ...,
        [-0.6792,  0.6214,  2.3403,  ..., -1.3644,  1.1562, -0.3353],
        [-0.0353, -0.0175, -0.3030,  ...,  1.1515, -0.9493, -0.3736],
        [-0.1261,  0.2769, -0.4705,  ..., -0.6186, -0.1904, -0.7831]],
       requires_grad=True)
encoder.lstm.weight_ih_l0 Parameter containing:
tensor([[-0.0076,  0.0171, -0.0194,  ..., -0.0186,  0.0142,  0.0195],
        [-0.0063, -0.0243, -0.0153,  ...,  0.0199, -0.0009, -0.0055],
        [ 0.0095,  0.0291, -0.0268,  ..., -0.0054,  0.0063,  0.0257],
        ...,
        [ 0.0118, -0.0158,  0.0153,  ..., -0.0259,  0.0174, -0.0072],
        [-0.0302,  0.0168,  0.0289,  ..., -0.0068,  0.0236, -0.0027],
        [ 0.0101,  0.0209, -0.0265,  ..., -0.0241, -0.0225,  0.0019]],
       req

In [87]:
# ignore the padding tokens by loss function
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [88]:
if load_model:
    load_checkpoint(torch.load('seq2seq_checkpoint.pth.ptar', model, optimizer))


In [89]:
optimizer = optim.Adam(model.parameters())

In [94]:
def train(model, iterator, optimizer, criterion, clip, writer):
    
    # take the model to training mode(activates the dropout and batchnorm)
    model.train()
    
    # to track the loss in each epoch
    epoch_loss = 0
    
    for batch_idx, batch in enumerate(iterator):
        
        # src = [src_len, batch_size]
        # trg = [trg_len, batch_size]
        
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        # trg = [(trg len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output_dim]
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        #writer.add_scaler('Training Loss', loss, global_step=step)
        #step += 1
        
        epoch_loss += loss.item()
    
    # we return the loss per epoch for the training
    return epoch_loss/len(iterator)
    

In [95]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        
        for batch_index, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            output = model(src, trg, 0)
            
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
            
        return epoch_loss / len(iterator)

In [96]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [98]:
# Training the model
num_epochs = 10
clip = 1

best_valid_loss = float('inf')

for epoch in range(num_epochs):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, clip, writer)
    validation_loss = evaluate(model, validation_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if validation_loss < best_valid_loss :
        best_valid_loss = validation_loss
        torch.save(model.state_dict(), 'seq2seq_model.pt')
    
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

NameError: name 'valid_loss' is not defined

In [None]:
model.load_state_dict(torch.load('seq2seq_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")