In [1]:
# Install the Python dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter, unlike a bare !pip).
%pip install --upgrade datasets evaluate
# Both spaCy pipelines are needed: spacy.load("en_core_web_sm") and
# spacy.load("de_core_news_sm") are called further down in this notebook.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [2]:
import warnings as wn
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported
# below; consider narrowing the filter to specific categories.
wn.filterwarnings("ignore")

In [3]:
import random
import numpy as np
import spacy  # provides the English and German tokenizers loaded below
import datasets  # used to load the translation dataset
import tqdm  # progress bars for the training and translation loops
import evaluate  # used to load the BLEU metric

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext

In [5]:
seed = 1234  # one fixed seed shared by every random number generator below

random.seed(seed)  # Python's random module (drives the teacher-forcing coin flip in Seq2Seq.forward)
np.random.seed(seed)  # NumPy's global random state
torch.manual_seed(seed)  # PyTorch's random number generator
torch.cuda.manual_seed(seed)  # random number generator of the current GPU
torch.backends.cudnn.deterministic = True  # make cuDNN use deterministic algorithms only

In [6]:
dataset = datasets.load_dataset("bentrevett/multi30k")  # load the Multi30k dataset; the next cell shows its train/validation/test splits

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [8]:
# Pull the three splits out of the DatasetDict into their own variables.
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

In [10]:
train_data[0]  # look at the first training example (index 0): one English/German sentence pair

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [12]:
en_nlp = spacy.load("en_core_web_sm")  # small English spaCy pipeline; this notebook only uses its tokenizer
de_nlp = spacy.load("de_core_news_sm")  # small German spaCy pipeline; this notebook only uses its tokenizer

In [9]:
string = "What a lovely day it is today!"

[token.text for token in en_nlp.tokenizer(string)]  # split the sentence into tokens; punctuation becomes its own token (see the output)

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [10]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentences of one dataset example.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp: object whose ``tokenizer(text)`` yields tokens with a ``.text``
            attribute; used for the English sentence.
        de_nlp: same, used for the German sentence.
        max_length: maximum number of tokens kept per sentence; the cut is
            applied before the start/end markers are added.
        lower: if true, every token is lowercased.
        sos_token: marker placed in front of each token list.
        eos_token: marker placed at the end of each token list.

    Returns:
        A dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """

    def _prepare(nlp, text):
        # Tokenize, truncate, optionally lowercase, then wrap in <sos>/<eos>.
        words = [tok.text for tok in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [14]:
max_length = 1_000  # keep at most 1000 tokens per sentence (the cut happens before <sos>/<eos> are added)
lower = True  # lowercase every token
sos_token = "<sos>"  # "start of sequence" marker placed in front of every token list
eos_token = "<eos>"  # "end of sequence" marker placed at the end of every token list

# Extra arguments of tokenize_example, gathered in a dictionary so they can be
# handed to the dataset's map method through its fn_kwargs argument.
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

In [11]:
# map calls tokenize_example on every example of a split and adds the returned
# "en_tokens" / "de_tokens" fields to it.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
# We can now look at an example, confirming the two new features have been added.
#  both of which are lowercased list of strings with the start/end of sequence tokens appended

In [12]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [13]:
# Tokens that appear fewer than min_freq times in the training set do not get
# an index of their own in the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# Passed as `specials` to both vocabularies.
special_tokens = [unk_token, pad_token, sos_token, eos_token]


def _build_vocab(token_lists):
    """Build a torchtext vocabulary from an iterable of token lists."""
    return torchtext.vocab.build_vocab_from_iterator(
        token_lists,
        min_freq=min_freq,
        specials=special_tokens,
    )


# The vocabularies are built from the training split only.
en_vocab = _build_vocab(train_data["en_tokens"])
de_vocab = _build_vocab(train_data["de_tokens"])

In [14]:
# We can view the first ten tokens in our vocabulary (indices 0 to 9) using the get_itos method,
# where itos = "int to string", which returns a list of tokens.

en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man']

In [15]:
en_vocab.get_itos()[9]

'man'

In [16]:
de_vocab.get_itos()[:10]  # the same view for the German vocabulary; the next cell goes the other way (token -> index) with get_stoi

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in', 'eine', ',']

In [17]:
en_vocab.get_stoi()["the"]

7

In [18]:
#  As a shorthand, we can just use the vocabulary as a dictionary and pass the token to get the index.

en_vocab["the"]

7

In [19]:
# The len of each vocabulary gives us the number of unique tokens in each one.

len(en_vocab), len(de_vocab)

(5893, 7853)

In [20]:
# We can also use the in keyword to get a boolean indicating if a token is in the vocabulary.

"the" in en_vocab

True

In [21]:
# This means that no tokens containing any uppercase characters appear in our vocabulary.

"The" in en_vocab

False

In [23]:
# Sanity check: the special tokens must map to the same index in both
# vocabularies, because a single unk_index / pad_index is used for both below.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [24]:
# set_default_index chooses the index a vocabulary returns for tokens it does
# not contain; here unknown tokens are mapped to the <unk> index.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [25]:
en_vocab["The"]

0

In [26]:
# And we can get the token corresponding to that index to prove it's the <unk> token.

en_vocab.get_itos()[0]

'<unk>'

In [27]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [28]:
# lookup_indices method. This takes in a list of tokens and returns a list of indices.

en_vocab.lookup_indices(tokens)

[956, 2169, 173, 0, 821]

In [29]:
# we can use the lookup_tokens method to convert a list of indices back into tokens using the vocabulary.

en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

In [30]:
# The numericalize_example function, which we'll use with the map method of our dataset.

def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one example into lists of vocabulary indices.

    Args:
        example: mapping with the token lists under "en_tokens" and "de_tokens".
        en_vocab: vocabulary used to look up the English tokens.
        de_vocab: vocabulary used to look up the German tokens.

    Returns:
        A dict with the keys "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [31]:
# We apply the numericalize_example function, passing our vocabularies in the fn_kwargs dictionary to the fn_kwargs argument of map.

fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

# map runs numericalize_example on every example of a split and adds the
# returned "en_ids" / "de_ids" fields to it.
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [32]:
# the two new features: "en_ids" and "de_ids", both a list of integers representing their indices in the respective vocabulary.

train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]}

In [33]:
#We can confirm the indices are correct by using the lookup_tokens method with the corresponding vocabulary on the list of indices.

en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

In [34]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]


def _to_torch_format(split):
    """Return `split` with the columns in format_columns converted to data_type.

    with_format would otherwise drop the features that are not listed in
    `columns`; output_all_columns=True keeps them in every example.
    """
    return split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )


train_data = _to_torch_format(train_data)
valid_data = _to_torch_format(valid_data)
test_data = _to_torch_format(test_data)

In [35]:
# We can confirm this worked by checking an example and seeing the "en_ids" and "de_ids"


train_data[0]

{'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            3]),
 'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [36]:
# We can also check this using the type built-in function on one of the features.


type(train_data[0]["en_ids"])

torch.Tensor

In [37]:
def get_collate_fn(pad_index):
    """Create the collate function used by the data loaders.

    Args:
        pad_index: value used to pad shorter sequences up to the length of the
            longest sequence in the batch.

    Returns:
        A function that takes a batch (a list of examples, each holding tensors
        under "en_ids" and "de_ids") and returns a dict with the same two keys,
        where each value is a single padded tensor. pad_sequence is called
        without batch_first, so the result is laid out [sequence length, batch size].
    """

    def collate_fn(batch):
        padded = {}
        for key in ("en_ids", "de_ids"):
            sequences = [example[key] for example in batch]
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

In [38]:
#  the functions which give us our data loaders creating using PyTorch's DataLoader class.


def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap `dataset` in a PyTorch DataLoader that pads every batch.

    Args:
        dataset: the dataset to iterate over.
        batch_size: number of examples per batch.
        pad_index: padding value handed to get_collate_fn.
        shuffle: forwarded to the DataLoader; defaults to False.

    Returns:
        The configured torch.utils.data.DataLoader.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )


To reduce the training time, we generally want to use the largest batch size possible. When using a GPU, this means using the largest batch size that will fit in GPU memory.

Shuffling the data makes training more stable and potentially improves the final performance of the model; however, it only needs to be done on the training set. The metrics calculated for the validation and test sets will be the same no matter what order the data is in.


In [39]:

batch_size = 128

# Only the training loader shuffles its data; the validation and test loaders
# keep get_data_loader's default of shuffle=False.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [None]:
# We'll be building our model in three parts. The encoder, the decoder and a seq2seq model that encapsulates the encoder and decoder
# and will provide a way to interface with each.

In [40]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the final LSTM states.

    Args:
        input_dim: number of rows in the embedding table (the source vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            the LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Kept as attributes so Seq2Seq can compare them with the decoder's.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        # Dropout is a regulariser (it is meant to reduce overfitting).
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` and return the final ``(hidden, cell)`` states.

        `src` holds token ids laid out [src length, batch size] (the LSTM is
        not created with batch_first). The per-step LSTM outputs are discarded.
        """
        embedded = self.embedding(src)
        _, (hidden, cell) = self.rnn(self.dropout(embedded))
        return hidden, cell

In [41]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: size of the target vocabulary; it is both the number of
            rows in the embedding table and the width of the prediction.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            the LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Kept as attributes so Seq2Seq can read the vocabulary size and
        # compare the state sizes with the encoder's.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        `input` holds one token id per batch element. It gets a leading
        length-1 sequence dimension before going through the LSTM, and that
        dimension is squeezed away again before the linear layer.

        Returns:
            ``(prediction, hidden, cell)``: the output of the linear layer for
            this step and the updated LSTM states.
        """
        step_input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        output, states = self.rnn(embedded, (hidden, cell))
        hidden, cell = states
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [42]:
class Seq2Seq(nn.Module):
    """Sequence-to-sequence model that chains an encoder and a decoder.

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch; must
            expose ``hidden_dim`` and ``n_layers``.
        decoder: module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``hidden_dim``,
            ``n_layers`` and ``output_dim``.
        device: device on which the tensor of predictions is allocated.

    Raises:
        AssertionError: if the encoder and decoder disagree on ``hidden_dim``
            or ``n_layers`` (the encoder states are fed straight into the decoder).
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode the whole target sequence, one token at a time.

        Args:
            src: source token ids, [src length, batch size].
            trg: target token ids, [trg length, batch size]; row 0 is used as
                the decoder's first input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token as the next
                decoder input instead of the decoder's own top prediction.

        Returns:
            Tensor of shape [trg length, batch size, decoder.output_dim].
            Row 0 is never written and stays zero; rows 1.. hold the decoder
            predictions for each step.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size, device=self.device)

        # The final encoder states initialise the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input: row 0 of the target.
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        # Return only after every step has been decoded. Returning from inside
        # the loop would leave all rows after the first prediction at zero.
        return outputs

In [43]:
# Now we have our model implemented, we can begin training it.


# The input and output dimensions are defined by the sizes of the vocabularies:
# the model reads German (source) and produces English (target).
input_dim = len(de_vocab)
output_dim = len(en_vocab)

# The embedding dimensions and dropout of the encoder and decoder may differ,
# but the number of layers and the size of the hidden/cell states must match.
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

# Tells PyTorch where the model and tensors live: the GPU when
# torch.cuda.is_available() returns True, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim=input_dim,
    embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=encoder_dropout,
)

decoder = Decoder(
    output_dim=output_dim,
    embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [44]:
# Weight Initialization


def init_weights(m):
    """Fill every parameter reachable from module `m` with samples from U(-0.08, 0.08).

    Intended to be passed to ``Module.apply``, which calls it on the module and
    on each of its sub-modules. Because the parameters of `m` are walked
    recursively here as well, parameters of nested modules are re-sampled on
    each of those calls.
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.08, b=0.08)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [45]:
# We can also count the number of parameters in our model.


def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


In [46]:
# training the model

In [47]:
optimizer = optim.Adam(model.parameters())  # Adam with its default hyperparameters
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)  # targets equal to the padding index are ignored by the loss

In [48]:
# Training Loop


def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train `model` for one pass over `data_loader` and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: iterable of batches with tensors under "de_ids" (source)
            and "en_ids" (target); must support ``len()``.
        optimizer: optimizer that updates the parameters of `model`.
        criterion: loss taking 2-D predictions and 1-D targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: forwarded to the model.
        device: device the batch tensors are moved to.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        # src = [src length, batch size], trg = [trg length, batch size]
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)

        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()
        predictions = model(src, trg, teacher_forcing_ratio)

        # Row 0 is skipped on both sides (the model leaves it unfilled), then
        # predictions are flattened to 2-D and targets to 1-D for the criterion.
        vocab_size = predictions.shape[-1]
        flat_predictions = predictions[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_predictions, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(data_loader)

In [49]:
# Evaluation loop

def evaluate_fn(model, data_loader, criterion, device):
    """Return the mean batch loss of `model` over `data_loader`, without training.

    The model is put in evaluation mode, no gradients are tracked, and the
    teacher forcing ratio is 0, so the decoder is always fed its own predictions.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: iterable of batches with tensors under "de_ids" (source)
            and "en_ids" (target); must support ``len()``.
        criterion: loss taking 2-D predictions and 1-D targets.
        device: device the batch tensors are moved to.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            # src = [src length, batch size], trg = [trg length, batch size]
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)

            predictions = model(src, trg, 0)

            # Same flattening as in training; row 0 is skipped on both sides.
            vocab_size = predictions.shape[-1]
            flat_predictions = predictions[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_predictions, flat_targets).item()
    return running_loss / len(data_loader)

In [50]:
# Model Training
# We can finally start training our model!

# We'll be printing out both the loss and the perplexity(confusion) at each epoch.
#It is easier to see a change in perplexity(confusion) than a change in loss as the numbers are much bigger.



n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model=model,
        data_loader=train_data_loader,
        optimizer=optimizer,
        criterion=criterion,
        clip=clip,
        teacher_forcing_ratio=teacher_forcing_ratio,
        device=device,
    )
    valid_loss = evaluate_fn(
        model=model,
        data_loader=valid_data_loader,
        criterion=criterion,
        device=device,
    )
    # Save a checkpoint only when the validation loss beats the best one so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    # Perplexity (PPL) is the exponential of the loss.
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|██████████████▋                                                                                                                                    | 1/10 [00:15<02:21, 15.76s/it]

	Train Loss:   5.035 | Train PPL: 153.632
	Valid Loss:   4.991 | Valid PPL: 147.028


 20%|█████████████████████████████▍                                                                                                                     | 2/10 [00:31<02:05, 15.71s/it]

	Train Loss:   4.399 | Train PPL:  81.330
	Valid Loss:   4.661 | Valid PPL: 105.741


 30%|████████████████████████████████████████████                                                                                                       | 3/10 [00:47<01:50, 15.73s/it]

	Train Loss:   4.075 | Train PPL:  58.860
	Valid Loss:   4.487 | Valid PPL:  88.884


 40%|██████████████████████████████████████████████████████████▊                                                                                        | 4/10 [01:02<01:33, 15.62s/it]

	Train Loss:   3.873 | Train PPL:  48.071
	Valid Loss:   4.274 | Valid PPL:  71.819


 50%|█████████████████████████████████████████████████████████████████████████▌                                                                         | 5/10 [01:18<01:18, 15.63s/it]

	Train Loss:   3.688 | Train PPL:  39.972
	Valid Loss:   4.209 | Valid PPL:  67.278


 60%|████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 6/10 [01:33<01:02, 15.64s/it]

	Train Loss:   3.541 | Train PPL:  34.492
	Valid Loss:   4.075 | Valid PPL:  58.835


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 7/10 [01:49<00:46, 15.65s/it]

	Train Loss:   3.404 | Train PPL:  30.074
	Valid Loss:   4.034 | Valid PPL:  56.458


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 8/10 [02:05<00:31, 15.68s/it]

	Train Loss:   3.248 | Train PPL:  25.751
	Valid Loss:   3.987 | Valid PPL:  53.888


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 9/10 [02:20<00:15, 15.65s/it]

	Train Loss:   3.109 | Train PPL:  22.396
	Valid Loss:   3.894 | Valid PPL:  49.093


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:36<00:00, 15.66s/it]

	Train Loss:   2.986 | Train PPL:  19.802
	Valid Loss:   3.787 | Valid PPL:  44.137





In [None]:
# Evaluating the Model


# We'll load the parameters (state_dict) that gave our model the best validation loss and run it on
#the test set to get our test loss and perplexity.


# map_location loads the checkpoint onto the current device even if it was
# saved from a different one (e.g. trained on a GPU, evaluated on a CPU-only
# machine, where a plain torch.load of CUDA tensors fails).
model.load_state_dict(torch.load("tut1-model.pt", map_location=device))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

In [52]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Translate one German sentence into a list of English tokens.

    Decoding is greedy: at every step the most likely token is taken.

    Args:
        sentence: a string (tokenized with ``de_nlp.tokenizer``) or an iterable
            of already-split tokens.
        model: object exposing ``encoder`` and ``decoder``; it is switched to
            evaluation mode.
        en_nlp: accepted for symmetry with the other arguments; not used here.
        de_nlp: tokenizer provider used when `sentence` is a string.
        en_vocab: target vocabulary (token <-> index).
        de_vocab: source vocabulary (token -> index).
        lower: if true, source tokens are lowercased.
        sos_token: start marker added to the source and used to seed decoding.
        eos_token: end marker added to the source; decoding stops once it is predicted.
        device: device the input tensors are moved to.
        max_output_length: maximum number of decoding steps.

    Returns:
        The predicted tokens, starting with `sos_token`. The list ends with
        `eos_token` only if the model produced it within `max_output_length` steps.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            src_tokens = [tok.text for tok in de_nlp.tokenizer(sentence)]
        else:
            src_tokens = list(sentence)
        if lower:
            src_tokens = [tok.lower() for tok in src_tokens]
        src_tokens = [sos_token, *src_tokens, eos_token]

        # Column vector of ids: [src length, 1], i.e. a batch of one sentence.
        src_ids = de_vocab.lookup_indices(src_tokens)
        src_tensor = torch.LongTensor(src_ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(src_tensor)

        output_ids = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # Feed the most recent token back into the decoder.
            previous = torch.LongTensor([output_ids[-1]]).to(device)
            logits, hidden, cell = model.decoder(previous, hidden, cell)
            next_id = logits.argmax(-1).item()
            output_ids.append(next_id)
            if next_id == en_vocab[eos_token]:
                break
        translated = en_vocab.lookup_tokens(output_ids)
    return translated

In [53]:
# We'll pass a test example (something the model hasn't been trained on) to use as a sentence to test our translate_sentence function, passing in the German sentence
# and expecting to get something that looks like the English sentence.

sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [None]:
# The model doesn't just translate examples in the training, validation and test sets.
# We can use it to translate arbitrary sentences by passing any string to the translate_sentence.

In [54]:
# Translate the German test sentence selected above with the trained model.
# The arguments are passed positionally, in translate_sentence's parameter order;
# max_output_length is left at its default.
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [55]:
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'white',
 'shirt',
 'is',
 'cutting',
 'something',
 '.',
 '<eos>']

In [56]:
# We input the German translation of "A man is sitting on a bench."

sentence = "Ein Mann sitzt auf einer Bank."

In [57]:
# Translate the hand-written sentence; as a plain string it is tokenized inside
# translate_sentence with the German tokenizer.
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [58]:
translation # And we receive our translation, which is reasonably close.

['<sos>', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

In [59]:

# We can now loop over our test_data, getting our model's translation of each test sentence.
# Each example's raw German string is passed in, so translate_sentence tokenizes it itself;
# tqdm wraps the loop to show progress.

translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 235.22it/s]


In [1]:
# To calculate BLEU, we'll use the evaluate library. It's recommended to use libraries for measuring metrics
#  to ensure there are no bugs in your metric calculations that could give you incorrect results.

In [60]:
# The BLEU metric can be loaded from the evaluate library like so:

bleu = evaluate.load("bleu")

In [3]:
#  We also convert our translations from a list of tokens into a string by joining them with spacing 
#     inbetween and getting rid of the <sos> and <eos> tokens (as they will never appear in our reference sentences).

In [61]:
def _strip_boundary_tokens(tokens):
    """Drop the leading <sos> and, only if it is present, the trailing <eos>.

    translate_sentence stops after max_output_length steps even when no <eos>
    was predicted; in that case the last token is a real word, so it must not
    be sliced off unconditionally.
    """
    body = tokens[1:]
    if body and body[-1] == eos_token:
        body = body[:-1]
    return body


predictions = [" ".join(_strip_boundary_tokens(translation)) for translation in translations]

# BLEU accepts several references per prediction, hence the one-element lists.
references = [[example["en"]] for example in test_data]

In [2]:
# We also need to define a function which tokenizes an input string. This will be used 
# to calculate the BLEU score by comparing our predicted tokens against the reference tokens.

In [62]:
predictions[0], references[0]

('a man in a white shirt is cutting something .',
 ['A man in an orange hat starring at something.'])

In [63]:
# "get_tokenizer_fn" returns our "tokenizer_fn", which uses our "spaCy" tokenizer and lowercases tokens if necessary.

def get_tokenizer_fn(nlp, lower):
    """Build a function that splits a string into a list of token strings.

    Args:
        nlp: object whose ``tokenizer(text)`` yields tokens with a ``.text`` attribute.
        lower: if true, the returned function lowercases every token.

    Returns:
        A function taking a string and returning its tokens as a list of strings.
    """

    def tokenizer_fn(s):
        words = [token.text for token in nlp.tokenizer(s)]
        return [word.lower() for word in words] if lower else words

    return tokenizer_fn

In [64]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [65]:
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

(['a', 'man', 'in', 'a', 'white', 'shirt', 'is', 'cutting', 'something', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [4]:
# Finally, we calculate the BLEU metric across our test set!

# We pass our predictions, references and our tokenizer_fn to the compute method of the BLEU metric to get our results.

In [66]:
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [67]:
results

# We get a BLEU score of 0.14! Nothing to write home about, but not bad for our first translation model

{'bleu': 0.13756994726803526,
 'precisions': [0.4681576952236543,
  0.18843314191960622,
  0.09133154602323502,
  0.04445534838076546],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0101087455965692,
 'translation_length': 13190,
 'reference_length': 13058}