
## Data Collection and Preprocessing
## Data Loading


#### Packages and Libraries Installation

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

#read dataset
with open('DGT.en-ga.en', 'r', encoding='utf-8') as e:
    english_sent = e.readlines()

with open('DGT.en-ga.ga', 'r', encoding='utf-8') as ir:
    irish_sent = ir.readlines()

sent_data = {'English Sentences':english_sent,'Irish Sentences':irish_sent}

#create dataframe
df_sent_data = pd.DataFrame(sent_data)
df_sent_data = df_sent_data.dropna()

#Sample 12000 rows
df_sent_data["source_len"] = df_sent_data["English Sentences"].apply(lambda x: len(x.split(" ")))
df_sent_data["target_len"]= df_sent_data["Irish Sentences"].apply(lambda x: len(x.split(" ")))

sampled_data = df_sent_data.query("source_len <= 15").query("target_len <= 15").sample(12000, random_state=2015,replace = False)

sampled_data.head(5)

Unnamed: 0,English Sentences,Irish Sentences,source_len,target_len
43782,Article 1\n,Airteagal 1\n,2,2
179445,Article 4d\n,1352/2011 ón gCoimisiún\n,2,3
5098,the basic characteristics of the bodywork shap...,is ionann saintréithe bunúsacha chruth na cabh...,10,7
168583,Acting in accordance with the ordinary legisla...,Tar éis dóibh an dréachtghníomh reachtach a ch...,8,12
34778,Whereas:\n,De bharr an méid seo a leanas:\n,1,7


In [None]:
max(sampled_data.target_len)

15

#### Train-Development-Test Split

In [None]:
train, test = train_test_split(sampled_data, test_size=1000, random_state=2023)
train, val = train_test_split(train, test_size=1000, random_state=2023)

train["split"] = "train"
val["split"] = "val"
test["split"] = "test"

dataset = pd.concat([train, val, test])

print(f"Datasets => Train {len(train)} | Val {len(val)} | Test {len(test)}")

Datasets => Train 10000 | Val 1000 | Test 1000


## Preprocessing




#### Lowercase, Remove punctuation and Tokenize text

In [None]:
from nltk.tokenize import word_tokenize
from typing import List
import re

class Langauge:
  def __init__(self, lang: str):
    # Name of the langauge
    self.lang = lang
    # Mapping of each word in vocabulary to id
    self.word2idx = {"PAD": 0, "BOF": 1, "EOS": 2}
    # Reverse mapping from id to word in vocabulary
    self.idx2word = {0: "PAD", 1: "BOF", 2: "EOS"}
    # Count of each word in vocabulary
    self.word2cnt = {}
    # number of words present in vocabulary
    self.n_words = len(self.idx2word)

  def addSentence(self, sentence: str):
    """
    Given a sentence, lowercase is and remove any punctuation. Tokenize the
    sentence and for each word in the tokenized list call the addWord method.
    """
    txt = sentence.lower()
    clean_text = re.sub(r'[^\w\s]', '', txt).strip()
    for word in word_tokenize(clean_text):
      self.addWord(word)

  def addWord(self, word: str):
    """
    For each input word, check if it exists in the the word2index. If it does
    not, add the word to the word2index and set the value to the current
    vocabulary length. Update the index2word entry as well which maps the token
    id to the word. Finaally update the vocabulary count (n_words).

    If the word is already in the vocabulary, udpate the count.
    """
    if word not in self.word2idx:
      self.word2idx[word] = self.n_words
      self.word2cnt[word] = 1
      self.idx2word[self.n_words] = word
      self.n_words += 1
    else:
      self.word2cnt[word] += 1

  def encodeSentence(self, sentence: str) -> List[int]:
    """
    Given a sentence:
      1. Lower case it
      2. Remove all punctuation
      3. Prepend SOS and append EOS to it.
      4. Tokenize it and return the word ids for each word in the tokenized list. If a word
      does not exist in the vocab, skip over it.

      Return a list of word ids.
    """
    txt = sentence.lower()
    clean_txt = re.sub(r'[^\w\s]', '',txt).strip()
    clean_txt = "BOF " + clean_txt + " EOS"
    return [self.word2idx[word] for word in word_tokenize(clean_txt) if word in self.word2idx]

  def decodeIds(self, ids: list) -> List[str]:
    """
    Given a list of word ids, look the ids in the index2word and return a
    string representing the decoded sentence.
    """
    return " ".join([self.idx2word[tok] for tok in ids])

#### Build seperate vocabularies for English and Irish Language.

In [None]:
from tqdm.notebook import tqdm

english = Langauge("english")
irish = Langauge("irish")

for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
  english.addSentence(row["English Sentences"])
  irish.addSentence(row["Irish Sentences"])
print(f"Size of Irish vocab: {irish.n_words}")
print(f"Size of English vocab: {english.n_words}")

  0%|          | 0/12000 [00:00<?, ?it/s]

Size of Irish vocab: 9700
Size of English vocab: 7986


#### Print Statistics

Number of Samples

In [None]:
print(f" Number of samples are: ", len(sampled_data))

 Number of samples are:  12000


Maximum sequence length of Source Language

In [None]:
print(f" Maximum sequence length of English(source) Language is:" ,max(sampled_data.source_len))

 Maximum sequence length of English(source) Language is: 15


Maximum sequence length of Target Language

In [None]:
print(f" Maximum sequence length of Irish(target) Language is:" ,max(sampled_data.target_len))

 Maximum sequence length of Irish(target) Language is: 15


Number of unique source language tokens

In [None]:
print(f" Number of unique source language(english) token are:" , english.n_words)

 Number of unique source language(english) token are: 7986


Number of unique target language tokens

In [None]:
print(f" Number of unique source language(irish) token are:" , irish.n_words)

 Number of unique source language(irish) token are: 9700


## Encoder-Decoder Model Implementation and Training




In [None]:
import torch
import torch.nn as nn

In [None]:
class Encoder_Model(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        return hidden

In [None]:
class Decoder_Model(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)

        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):

        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        emb_con = torch.cat((embedded, context), dim = 2)

        output, hidden = self.rnn(emb_con, hidden)

        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
                           dim = 1)

        prediction = self.fc_out(output)

        return prediction, hidden

In [None]:
class Sequence_Model(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder model and decoder model are not equal!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        context = self.encoder(src)

        hidden = context

        input = trg[0,:]

        for t in range(1, trg_len):

            output, hidden = self.decoder(input, hidden, context)

            #predictions in a tensor placed for each token
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs

In [None]:
INPUT_DIMENS = english.n_words
OUTPUT_DIMENS = irish.n_words
ENC_EMB_DIMENS = 256
DEC_EMB_DIMENS = 256
HID_DIMENS = 256
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

enc = Encoder_Model(INPUT_DIMENS, ENC_EMB_DIMENS, HID_DIMENS, ENCODER_DROPOUT)
dec = Decoder_Model(OUTPUT_DIMENS, DEC_EMB_DIMENS, HID_DIMENS, DECODER_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Sequence_Model(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

model.apply(init_weights)

Sequence_Model(
  (encoder): Encoder_Model(
    (embedding): Embedding(7986, 256)
    (rnn): GRU(256, 256)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder_Model(
    (embedding): Embedding(9700, 256)
    (rnn): GRU(512, 256)
    (fc_out): Linear(in_features=768, out_features=9700, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())

## Training the Encoder-Decoder Model


In [None]:
import torch
from tensorflow.keras.utils import pad_sequences
import pandas as pd

def encode_features(
    df: pd.DataFrame,
    english: Langauge,
    irish: Langauge,
    pad_token: int = 0,
    max_seq_length = 10
  ):

  source = []
  target = []

  for _, row in df.iterrows():
    source.append(english.encodeSentence(row["English Sentences"]))
    target.append(irish.encodeSentence(row["Irish Sentences"]))

  source = pad_sequences(
      source,
      maxlen=max_seq_length,
      padding="post",
      truncating = "post",
      value=pad_token
    )

  target = pad_sequences(
      target,
      maxlen=max_seq_length,
      padding="post",
      truncating = "post",
      value=pad_token
    )

  return source, target

train_source, train_target = encode_features(train, english, irish)
val_source, val_target = encode_features(val, english, irish)
test_source, test_target = encode_features(test, english, irish)

print(f"Shapes of train source {train_source.shape}, and target {train_target.shape}")

Shapes of train source (10000, 10), and target (10000, 10)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

train_dl = DataLoader(
    TensorDataset(
        torch.LongTensor(train_source),
        torch.LongTensor(train_target)
    ),
    shuffle = True,
    batch_size = 32
)

val_dl = DataLoader(
    TensorDataset(
        torch.LongTensor(val_source),
        torch.LongTensor(val_target)
    ),
    shuffle = False,
    batch_size = 32
)

test_dl = DataLoader(
    TensorDataset(
        torch.LongTensor(test_source),
        torch.LongTensor(test_target)
    ),
    shuffle = False,
    batch_size = 32
)

In [None]:
import torch.nn.functional as F

In [None]:
loss_criteria = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.Adam(model.parameters())

In [None]:
from tqdm.notebook import tqdm
import random
import numpy as np

optimizer = torch.optim.Adam(model.parameters())

loss_criteria = nn.CrossEntropyLoss()

def train(model, train_dl, dev_dl, optimizer, loss_criteria, epochs):
  for epoch in range(epochs):

    # Model Train
    model.train()
    train_loss = 0
    num_batches = 0
    for batch in train_dl:
         src, trg = batch
         src = src.to(device)
         trg = trg.to(device)

         optimizer.zero_grad()

    output = model(src, trg)

    output_dim = output.shape[-1]
    output = output[1:].view(-1, output_dim)
    trg = trg[1:].view(-1)

    loss = loss_criteria(output, trg)
    loss.backward()

    optimizer.step()

    train_loss += loss.item()
    num_batches += 1

    train_loss /= num_batches

# Model Evaluate
    model.eval()
    eval_loss = 0
    num_batches = 0
    with torch.no_grad():
     for batch in dev_dl:
      src, trg = batch
      src = src.to(device)
      trg = trg.to(device)

    output = model(src, trg, 0)
    output_dimen = output.shape[-1]
    output = output[1:].view(-1, output_dimen)
    trg = trg[1:].view(-1)
    #loss calculate
    loss = loss_criteria(output, trg)

    eval_loss += loss.item()
    num_batches += 1
    eval_loss /= num_batches

    print(f"Epoch {epoch + 1}: Train loss = {train_loss:.4f}, Eval loss = {eval_loss:.4f}")

In [None]:
epochs = 5
train(model, train_dl, val_dl, optimizer, loss_criteria, epochs)

Epoch 1: Train loss = 8.9949, Eval loss = 8.9294
Epoch 2: Train loss = 8.9331, Eval loss = 8.7360
Epoch 3: Train loss = 8.6378, Eval loss = 8.4028
Epoch 4: Train loss = 8.0132, Eval loss = 7.8383
Epoch 5: Train loss = 7.0070, Eval loss = 6.9821


# Evaluation on the Test Set using BLEU Metric


In [None]:
def translate_sentence(
    text: str,
    model: model,
    english: Langauge,
    irish: Langauge,
    device: str,
    max_len: int = 10,
  ) -> str:

  # Encode english sentence and convert to tensor
  input_ids = english.encodeSentence(text)
  input_tensor = torch.LongTensor(input_ids).unsqueeze(1).to(device)

  # Get encooder hidden states
  with torch.no_grad():
    hidden = model.encoder(input_tensor)

  # Build target holder list
  trg_indexes = [irish.word2idx["BOF"]]

  # Loop over sequence length of target sentence
  for i in range(max_len):
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

    # Decode the encoder outputs with respect to current target word
    with torch.no_grad():
      output, hidden = model.decoder(trg_tensor, hidden, hidden)

    # Retrieve most likely word over target distribution
    pred_token = torch.argmax(output).item()
    trg_indexes.append(pred_token)

    if pred_token == irish.word2idx["EOS"]:
      break
  return "".join(irish.decodeIds(trg_indexes))

In [None]:
for _, row in tqdm(test.iterrows(), total=len(test)):

 translate_sentence(row['Irish Sentences'], model , english, irish, device)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
def bleu_score1(reference_sentence, translate_sentence):
  return len([word for word in translate_sentence if word in reference_sentence])/len(reference_sentence)

bl = []

for _, row in tqdm(test.iterrows(), total=len(test)):
  bl.append(bleu_score1(row['Irish Sentences'], translate_sentence(row['Irish Sentences'], model , english, irish, device)))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
print("Bleu Score is:",bl)

mean_bl = sum(bl)/len(bl)
print("Average Bleu Score is:",mean_bl)

Bleu Score is: [1.6666666666666667, 0.11235955056179775, 0.12048192771084337, 0.136986301369863, 0.13157894736842105, 0.11494252873563218, 1.1764705882352942, 0.37735849056603776, 1.25, 0.2608695652173913, 0.625, 0.8333333333333334, 0.2127659574468085, 1.6666666666666667, 2.8181818181818183, 1.5384615384615385, 0.8333333333333334, 0.12345679012345678, 1.5384615384615385, 1.9090909090909092, 0.22826086956521738, 0.42857142857142855, 0.15748031496062992, 1.0, 1.5384615384615385, 1.6666666666666667, 1.5384615384615385, 0.7142857142857143, 0.23529411764705882, 0.3448275862068966, 0.14285714285714285, 0.3548387096774194, 2.3846153846153846, 0.11494252873563218, 0.24096385542168675, 1.4285714285714286, 0.2619047619047619, 1.375, 0.15151515151515152, 0.0, 1.4285714285714286, 0.8333333333333334, 2.5, 0.29850746268656714, 0.12987012987012986, 1.5384615384615385, 0.1111111111111111, 0.2127659574468085, 1.6666666666666667, 0.12048192771084337, 0.36666666666666664, 0.0, 0.19444444444444445, 1.5384

## Improving NMT using Attention


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EncoderGRU(nn.Module):
    def __init__(
        self,
        input_vocab_size,  # size of source vocabulary
        hidden_dim,        # hidden dimension of embeddings
        encoder_hid_dim,   # gru hidden dim
        decoder_hid_dim,   # decoder hidden dim
        dropout_prob = .5
      ):

        super().__init__()
        self.embedding = nn.Embedding(input_vocab_size, hidden_dim)
        self.rnn = nn.GRU(hidden_dim, encoder_hid_dim, bidirectional = True)
        self.fc = nn.Linear(encoder_hid_dim * 2, decoder_hid_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, src):

        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))

        #embedded = [src len, batch size, emb dim]
        outputs, hidden = self.rnn(embedded)

        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer

        #hidden [-2, :, : ] is the last of the forwards GRU
        #hidden [-1, :, : ] is the last of the backwards GRU

        #initial decoder hidden is final hidden state of the forwards and backwards
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        return outputs, hidden

In [None]:
class Attention(nn.Module):
    def __init__(
        self,
        enc_hid_dim,      # Encoder hidden dimension
        dec_hid_dim       # Decoder hidden dimension
      ):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):

        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))

        #energy = [batch size, src len, dec hid dim]
        attention = self.v(energy).squeeze(2)

        #attention output: [batch size, src len]
        return F.softmax(attention, dim=1)

In [None]:
class DecoderGRU(nn.Module):
    def __init__(
        self,
        target_vocab_size,    # Size of target vocab
        hidden_dim,           # hidden size of embedding
        enc_hid_dim,
        dec_hid_dim,
        dropout
      ):
        super().__init__()

        self.output_dim = target_vocab_size
        self.attention = Attention(enc_hid_dim, dec_hid_dim)

        self.embedding = nn.Embedding(target_vocab_size, hidden_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + hidden_dim, dec_hid_dim)

        self.fc_out = nn.Linear(
            (enc_hid_dim * 2) + dec_hid_dim + hidden_dim,
            target_vocab_size
          )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):

        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]

        input = input.unsqueeze(0)  # [1, batch size]

        embedded = self.dropout(self.embedding(input))  # [1, batch size, emb dim]

        a = self.attention(hidden, encoder_outputs)     # [batch size, src len]
        a = a.unsqueeze(1)                              # [batch size, 1, src len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2) # [batch size, src len, enc hid dim * 2]

        weighted = torch.bmm(a, encoder_outputs)           # [batch size, 1, enc hid dim * 2]
        weighted = weighted.permute(1, 0, 2)               # [1, batch size, enc hid dim * 2]

        rnn_input = torch.cat((embedded, weighted), dim = 2) # [1, batch size, (enc hid dim * 2) + emb dim]


        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1)) # [batch size, output dim]
        return prediction, hidden.squeeze(0)

In [None]:
import random
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):

        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size)

        encoder_outputs, hidden = self.encoder(src)

        input = trg[0,:]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)

            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

In [None]:
INPUT_DIM = english.n_words
OUTPUT_DIM = irish.n_words
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 128
DEC_HID_DIM = 128
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = EncoderGRU(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = DecoderGRU(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT)

model2 = EncoderDecoder(enc, dec)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model2.apply(init_weights)

EncoderDecoder(
  (encoder): EncoderGRU(
    (embedding): Embedding(7986, 256)
    (rnn): GRU(256, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): DecoderGRU(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(9700, 256)
    (rnn): GRU(512, 128)
    (fc_out): Linear(in_features=640, out_features=9700, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
from tqdm.notebook import tqdm
import numpy as np
optimizer = torch.optim.Adam(model.parameters())

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model2.to(device)

EPOCHS = 1
best_val_loss = float('inf')

for epoch in range(EPOCHS):

  model2.train()
  epoch_loss = 0
  for batch in tqdm(train_dl, total=len(train_dl)):

     src = batch[0].transpose(1, 0).to(device)
     trg = batch[1].transpose(1, 0).to(device)

     optimizer.zero_grad()

     output = model(src, trg)

     output_dim = output.shape[-1]
     output = output[1:].view(-1, output_dim).to(device)
     trg = trg[1:].reshape(-1)

     loss = F.cross_entropy(output, trg)
     loss.backward()

     torch.nn.utils.clip_grad_norm_(model2.parameters(), 1)
     optimizer.step()
     epoch_loss += loss.item()

  train_loss = round(epoch_loss / len(train_dl), 3)

  eval_loss = 0
  model2.eval()
  for batch in tqdm(val_dl, total=len(val_dl)):
    src = batch[0].transpose(1, 0).to(device)
    trg = batch[1].transpose(1, 0).to(device)

    with torch.no_grad():
      output = model(src, trg)

      output_dim = output.shape[-1]
      output = output[1:].view(-1, output_dim).to(device)
      trg = trg[1:].reshape(-1)

      loss = F.cross_entropy(output, trg)

      eval_loss += loss.item()

  val_loss = round(eval_loss / len(val_dl), 3)
  print(f"Epoch {epoch} | train loss {train_loss} | train ppl {np.exp(train_loss)} | val ppl {np.exp(val_loss)}")

  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model2.state_dict(), 'best-model.pt')

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

Epoch 0 | train loss 4.538 | train ppl 93.50360577597665 | val ppl 61.620832296576545


In [None]:
def translate_sentence(
    text: str,
    model2: EncoderDecoder,
    english: Langauge,
    irish: Langauge,
    device: str,
    max_len: int = 10,
  ) -> str:

  # Encode english sentence and convert to tensor
  input_ids = english.encodeSentence(text)
  input_tensor = torch.LongTensor(input_ids).unsqueeze(1).to(device)

  # Get encooder hidden states
  with torch.no_grad():
    encoder_outputs, hidden = model2.encoder(input_tensor)

  # Build target holder list
  trg_indexes = [irish.word2idx["BOF"]]

  # Loop over sequence length of target sentence
  for i in range(max_len):
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

    # Decode the encoder outputs with respect to current target word
    with torch.no_grad():
      output, hidden = model2.decoder(trg_tensor, hidden, encoder_outputs)

    # Retrieve most likely word over target distribution
    pred_token = torch.argmax(output).item()
    trg_indexes.append(pred_token)

    if pred_token == irish.word2idx["EOS"]:
      break

  return "".join(irish.decodeIds(trg_indexes))

In [None]:
def bleu_score2(reference_sent, translate_sent):
  return len([word for word in translate_sent if word in reference_sent])/len(reference_sent)

bl_attention = []

for _, row in tqdm(test.iterrows(), total=len(test)):
  bl_attention.append(bleu_score2(row['Irish Sentences'], translate_sentence(row['Irish Sentences'], model2 , english, irish, device)))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
print("Bleu Score with Attention is:",bl_attention)

mean_bl_attention = sum(bl_attention)/len(bl_attention)
print("Average Bleu Score with Attention is:",mean_bl_attention)

Bleu Score with Attention is: [3.5, 0.8876404494382022, 1.0120481927710843, 1.1643835616438356, 1.0657894736842106, 0.9310344827586207, 2.6470588235294117, 1.2830188679245282, 2.75, 0.7217391304347827, 2.0625, 2.6666666666666665, 1.553191489361702, 3.5, 1.0, 3.5384615384615383, 1.9166666666666667, 1.0987654320987654, 3.3076923076923075, 1.0, 0.8478260869565217, 1.3265306122448979, 0.6377952755905512, 3.05, 3.4615384615384617, 3.5, 3.3076923076923075, 0.7142857142857143, 0.9411764705882353, 1.8620689655172413, 1.1142857142857143, 1.064516129032258, 0.8461538461538461, 1.0, 0.9759036144578314, 3.0, 1.6904761904761905, 1.5, 1.2727272727272727, 0.1, 3.142857142857143, 2.6666666666666665, 3.875, 1.3134328358208955, 0.8571428571428571, 3.3846153846153846, 0.8666666666666667, 0.8617021276595744, 3.5, 0.963855421686747, 1.0333333333333334, 2.888888888888889, 0.7870370370370371, 3.3076923076923075, 0.8461538461538461, 2.869565217391304, 2.6666666666666665, 2.727272727272727, 3.0526315789473686,