In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from tqdm import tqdm

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torch.functional import F

In [3]:
class InputEmbedding(nn.Module):
    """Token-id lookup table whose output is scaled by sqrt(d_model).

    Args:
        d_model: width of each embedding vector.
        vocab_size: number of rows in the lookup table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        # Weight matrix has shape [vocab_size, d_model].
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed integer ids `x`; the result appends one axis of size d_model."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

In [4]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Args:
        d_model: embedding width (odd widths are supported).
        seq_length: number of positions pre-computed in the table.
        device: accepted for signature compatibility with the other layers; it is
            not used here because the table is a buffer and follows `.to(...)`.
        dropout: dropout probability applied after the table is added.
    """

    def __init__(self, d_model, seq_length, device,dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)

        # Column vector of positions: [seq_length, 1].
        positions = torch.arange(0, self.seq_length, dtype = torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed in log space; one entry per even column.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * div_term  # [seq_length, ceil(d_model / 2)]

        pe = torch.zeros(self.seq_length, self.d_model)
        pe[:, 0::2] = torch.sin(angles)                    # even columns
        # With an odd d_model there is one fewer odd column than even columns, so
        # trim the angle table to match instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd columns

        # Buffer: saved with the module and moved by .to(), but never trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, seq_length, d_model]

    def forward(self, x):
        """Add the first `x.shape[1]` table rows to `x` and apply dropout.

        Raises:
            ValueError: if `x` has more positions than the table holds.
        """
        length = x.shape[1]
        if length > self.seq_length:
            raise ValueError(
                f"input has {length} positions but the positional table only covers {self.seq_length}"
            )
        # The buffer carries no gradient, so no requires_grad_ toggle is needed.
        return self.dropout(x + self.pe[:, :length, :])

In [5]:
# class LayerNormalization(nn.Module):
#     def __init__(self, d_model,epsilon=1e-5):
#         super(LayerNormalization, self).__init__()
#         self.epsilon = epsilon
#         self.gamma = nn.Parameter(torch.ones(d_model))  # Scale
#         self.beta = nn.Parameter(torch.zeros(d_model))  # Shift

#     def forward(self, x):
#         mean = x.mean(dim=-1, keepdim=True)
#         std = x.std(dim=-1, keepdim=True, unbiased = False) 
#         return self.gamma * (x - mean) / (std + self.epsilon) + self.beta

In [6]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        dff: width of the hidden layer.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, dff, dropout = 0.1):
        super().__init__()
        self.forward1 = nn.Linear(in_features=d_model, out_features=dff)
        self.dropout = nn.Dropout(p=dropout)
        self.forward2 = nn.Linear(in_features=dff, out_features=d_model)

    def forward(self, x):
        """Expand to `dff`, apply ReLU and dropout, project back to `d_model`."""
        hidden = self.forward1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.forward2(hidden)

In [7]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across several heads.

    Args:
        d_model: model width; must be divisible by `heads`.
        heads: number of attention heads.
        device: stored on the instance; not otherwise used by this layer.
        dropout: dropout probability applied to the per-head attention output.

    Raises:
        ValueError: if `heads` is not positive or does not divide `d_model`.
    """

    def __init__(self, d_model, heads, device,dropout = 0.1):
        super(MultiHeadAttention, self).__init__()
        if heads <= 0 or d_model % heads != 0:
            # Without this check d_heads is silently floored and forward() later
            # fails inside .view() with an unrelated-looking shape error.
            raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
        self.d_model = d_model
        self.heads = heads
        self.d_heads = d_model//heads
        self.device = device

        self.w_q = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected):
        """Reshape [batch, length, d_model] into [batch, heads, length, d_heads]."""
        batch_size, length, _ = projected.shape
        return projected.view(batch_size, length, self.heads, self.d_heads).permute(0, 2, 1, 3)

    def forward(self, x_q, x_k, x_v, mask = None):
        """Attend from `x_q` over `x_k` / `x_v`.

        Args:
            x_q: query input, [batch, query_len, d_model].
            x_k: key input, [batch, key_len, d_model].
            x_v: value input, [batch, key_len, d_model].
            mask: optional tensor broadcastable to [batch, heads, query_len, key_len];
                entries that are zero/False mark positions that must not be attended.

        Returns:
            Tensor of shape [batch, query_len, d_model].
        """
        batch_size, seq_len, _ = x_q.shape

        query = self._split_heads(self.w_q(x_q))
        key = self._split_heads(self.w_k(x_k))
        value = self._split_heads(self.w_v(x_v))

        similarity = torch.matmul(query, key.transpose(-2, -1))/math.sqrt(self.d_heads)

        blocked = None
        if mask is not None:
            blocked = ~mask.bool()
            # Use the most negative finite value rather than -inf: a row whose keys
            # are all blocked would otherwise turn into NaN in softmax (and in its
            # gradient). Blocked entries still get exactly zero weight.
            similarity = similarity.masked_fill(blocked, torch.finfo(similarity.dtype).min)

        sim = self.softmax(similarity)
        if blocked is not None:
            # Fully blocked rows come out uniform above; force them to attend to nothing.
            sim = sim.masked_fill(blocked, 0.0)

        final = torch.matmul(sim, value)
        final = self.dropout(final)

        # Merge the heads back: [batch, heads, query_len, d_heads] -> [batch, query_len, d_model].
        final = final.permute(0, 2, 1, 3).contiguous()
        final = final.view(batch_size, seq_len, self.d_model)

        return self.w_o(final)

In [8]:
class ResidualConnection(nn.Module):
    """Skip connection that layer-normalises the branch before adding it.

    Computes `x1 + dropout(layer_norm(x2))`: only the second argument is
    normalised; the first is passed through unchanged.

    Args:
        d_model: size of the last dimension normalised by LayerNorm.
        dropout: dropout probability applied to the normalised branch.
    """

    def __init__(self, d_model ,dropout):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x1, x2):
        """Return the skip input `x1` plus the normalised, dropped-out `x2`."""
        branch = self.ln(x2)
        branch = self.dropout(branch)
        return x1 + branch

In [9]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each wrapped in
    a residual connection.

    Args:
        dff: hidden width of the feed-forward network.
        d_model: model width.
        heads: number of attention heads.
        device: forwarded to the attention layer.
        dropout: dropout probability used by every sub-layer of the block.
    """

    def __init__(self,  dff, d_model, heads, device,dropout):

        super(EncoderBlock, self).__init__()
        self.multi_attention = MultiHeadAttention(d_model, heads, device,dropout)
        self.residual_connections = nn.ModuleList([ResidualConnection(d_model, dropout) for _ in range(2)])

        # Pass the configured rate through; previously the feed-forward net always
        # fell back to its own default of 0.1 regardless of `dropout`.
        self.feed_forward = FeedForward(d_model, dff, dropout)

    def forward(self, x, mask):
        """Run the layer on `x`; `mask` is handed to self-attention unchanged."""
        attended = self.multi_attention(x, x, x, mask)
        x = self.residual_connections[0](x, attended)
        transformed = self.feed_forward(x)
        return self.residual_connections[1](x, transformed)

In [10]:
class Encoder(nn.Module):
    """Embedding, positional encoding and a stack of `n` encoder blocks.

    Args:
        vocab_size: size of the source vocabulary.
        dff: hidden width of each block's feed-forward network.
        seq_length: number of positions in the positional table.
        d_model: model width.
        heads: attention heads per block.
        device: forwarded to the sub-layers.
        dropout: dropout probability forwarded to the sub-layers.
        n: number of encoder blocks.
    """

    def __init__(self, vocab_size, dff, seq_length, d_model, heads,device, dropout, n = 6):
        super().__init__()

        self.embedding_layer = InputEmbedding(d_model, vocab_size)
        self.positional_encoding = PositionalEncoding(d_model, seq_length, device,dropout)

        layers = []
        for _ in range(n):
            layers.append(EncoderBlock(dff, d_model, heads,device,dropout))
        self.encoder_blocks = nn.ModuleList(layers)

    def forward(self,x_e):
        """Encode token ids `x_e`.

        Returns:
            A pair `(encoded, mask)`. `mask` is 1 where `x_e` is non-zero (id 0 is
            treated as padding), with two singleton axes inserted after the batch
            axis. It is also kept on the instance as `self.mask`.
        """
        hidden = self.positional_encoding(self.embedding_layer(x_e))
        self.mask = (x_e != 0).long().unsqueeze(1).unsqueeze(2)
        for layer in self.encoder_blocks:
            hidden = layer(hidden, self.mask)
        return hidden, self.mask

In [11]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the encoder
    output, and a feed-forward net, each wrapped in a residual connection.

    Args:
        dff: hidden width of the feed-forward network.
        d_model: model width.
        heads: number of attention heads.
        device: forwarded to the attention layers.
        dropout: dropout probability used by every sub-layer of the block.
    """

    def __init__(self, dff, d_model, heads, device,dropout):

        super(DecoderBlock, self).__init__()

        self.masked_attention = MultiHeadAttention(d_model, heads, device, dropout)
        self.residual_connections = nn.ModuleList([ResidualConnection(d_model, dropout) for _ in range(3)])
        self.cross_attention = MultiHeadAttention(d_model, heads, device, dropout)
        # Pass the configured rate through; previously the feed-forward net always
        # fell back to its own default of 0.1 regardless of `dropout`.
        self.feed_forward = FeedForward(d_model, dff, dropout)

    def forward(self, x, x_enc, tgt_mask, src_mask):
        """Run the layer.

        Args:
            x: decoder-side activations.
            x_enc: encoder output used as keys and values for cross-attention.
            tgt_mask: mask given to the self-attention over `x`.
            src_mask: mask given to the cross-attention over `x_enc`.
        """
        self_attended = self.masked_attention(x, x, x, tgt_mask)
        x = self.residual_connections[0](x, self_attended)

        cross_attended = self.cross_attention(x, x_enc, x_enc, src_mask)
        x = self.residual_connections[1](x, cross_attended)

        transformed = self.feed_forward(x)
        return self.residual_connections[2](x, transformed)

In [12]:
class Decoder(nn.Module):
    """Embedding, positional encoding, `n` decoder blocks and a vocabulary projection.

    Args:
        vocab_size: size of the target vocabulary (also the width of the logits).
        dff: hidden width of each block's feed-forward network.
        seq_length: number of positions covered by the positional table and by
            the pre-computed lower-triangular mask.
        d_model: model width.
        heads: attention heads per block.
        device: forwarded to the sub-layers.
        dropout: dropout probability forwarded to the sub-layers.
        n: number of decoder blocks.
    """

    def __init__(self, vocab_size, dff, seq_length, d_model, heads,device ,dropout, n = 6):

        super(Decoder, self).__init__()

        self.embedding_layer = InputEmbedding(d_model, vocab_size)
        self.positional_encoding = PositionalEncoding(d_model, seq_length, device,dropout)

        self.decoder_blocks = nn.ModuleList([DecoderBlock(dff, d_model, heads,device, dropout) for _ in range(n)])

        # Lower-triangular look-ahead mask. The buffer name (sic: "casual") is kept
        # because it is a state_dict key and is read from outside this class.
        self.register_buffer("casual_mask", torch.tril(torch.ones(seq_length, seq_length)))
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x_d, x_enc, src_mask = None, tgt_mask = None):
        """Decode target ids `x_d` against encoder output `x_enc`.

        Args:
            x_d: target token ids, [batch, tgt_len]; id 0 is treated as padding.
            x_enc: encoder output.
            src_mask: mask over the encoder positions for cross-attention. When
                omitted, cross-attention is left unmasked: the source padding
                cannot be recovered from the target ids, and using the target
                padding pattern here (as before) hides the wrong encoder positions.
            tgt_mask: mask for the target self-attention. When omitted it is built
                from the target padding combined with the look-ahead mask.

        Returns:
            Logits of shape [batch, tgt_len, vocab_size].
        """
        if tgt_mask is None:
            tgt_len = x_d.shape[1]
            tgt_padding_mask = (x_d != 0).unsqueeze(1).unsqueeze(2)
            casual_mask = self.casual_mask[:tgt_len, :tgt_len].unsqueeze(0).unsqueeze(0)
            tgt_mask = tgt_padding_mask & casual_mask.bool()

        x = self.positional_encoding(self.embedding_layer(x_d))

        for block in self.decoder_blocks:
            # The attention layer converts masks with .bool(), so any dtype works
            # and a missing src_mask can be forwarded as None.
            x = block(x, x_enc, tgt_mask, src_mask)
        return self.linear(x)

In [13]:
class MyTransformer(nn.Module):
    """Encoder-decoder transformer for sequence-to-sequence translation.

    Args:
        vocab_size_source: source vocabulary size.
        vocab_size_target: target vocabulary size.
        seq_length_source: maximum source length supported by the encoder.
        seq_length_target: maximum target length supported by the decoder.
        device: forwarded to the encoder and decoder.
        d_model: model width.
        heads: attention heads per block.
        dropout: dropout probability.
        dff: hidden width of the feed-forward networks.
    """

    def __init__(self, vocab_size_source, vocab_size_target, seq_length_source, seq_length_target, device,d_model = 512, heads = 8, dropout = 0.1, dff = 2048):

        super(MyTransformer, self).__init__()

        self.encoder = Encoder(vocab_size_source, dff, seq_length_source, d_model, heads, device, dropout)
        self.decoder = Decoder(vocab_size_target, dff, seq_length_target, d_model, heads, device, dropout)

    def forward(self, x_in, x_op=None):
        """Encode `x_in` and, if `x_op` is given, decode it against the encoding.

        Returns:
            `(encoder_output, src_mask)` when `x_op` is None, otherwise the
            decoder logits for `x_op`.
        """
        x_enc, src_mask = self.encoder(x_in)

        if x_op is None:
            return x_enc, src_mask

        # Build the target mask here and hand BOTH masks to the decoder. Calling
        # the decoder without masks made it derive the cross-attention mask from
        # the target padding, which hides the wrong encoder positions and differs
        # from what inference does with the encoder's own src_mask.
        tgt_len = x_op.shape[1]
        tgt_padding_mask = (x_op != 0).unsqueeze(1).unsqueeze(2)
        look_ahead = self.decoder.casual_mask[:tgt_len, :tgt_len].unsqueeze(0).unsqueeze(0)
        tgt_mask = tgt_padding_mask & look_ahead.bool()

        return self.decoder(x_op, x_enc, src_mask, tgt_mask)

In [14]:
from datasets import load_dataset
import string
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources into the working directory, then make that directory
# visible to NLTK's resource lookup.
nltk_dir = r'/kaggle/working/'
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource, download_dir = nltk_dir)
nltk.data.path.append(nltk_dir)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /kaggle/working/...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data] Downloading package punkt to /kaggle/working/...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
# Load the English-Spanish book corpus and keep the first 40000 sentence pairs.
dataset = load_dataset("Helsinki-NLP/opus_books", 'en-es')
dfff = dataset['train'].to_pandas()['translation']
df = pd.DataFrame(dfff.to_list()).iloc[:40000]

# Whitespace word counts per side, used only for the length filter below.
df['len1'] = df['en'].apply(lambda sentence: len(sentence.split()))
df['len2'] = df['es'].apply(lambda sentence: len(sentence.split()))

# Keep pairs where both sides have more than 5 and fewer than 100 words
# (counts are integers, so that is the inclusive range 6..99).
en_ok = df['len1'].between(6, 99)
es_ok = df['len2'].between(6, 99)
df = df[en_ok & es_ok]

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/93470 [00:00<?, ? examples/s]

In [16]:
# Preview the first rows of the length-filtered sentence pairs.
df.head()

Unnamed: 0,en,es,len1,len2
4,The family of Dashwood had long been settled i...,La familia Dashwood llevaba largo tiempo afinc...,10,9
5,"Their estate was large, and their residence wa...","Su propiedad era de buen tamaño, y en el centr...",40,41
6,The late owner of this estate was a single man...,El último dueño de esta propiedad había sido u...,34,36
7,"But her death, which happened ten years before...","Pero la muerte de ella, ocurrida diez años ant...",54,56
8,"In the society of his nephew and niece, and th...","En compañía de su sobrino y sobrina, y de los ...",18,21


In [17]:
# max(df['len2'])

In [18]:

# Flatten every English sentence into whitespace-separated words and count them.
X_token = []
for sentence in df['en']:
    X_token.extend(sentence.split())
c = Counter(X_token)

In [19]:
# Number of distinct whitespace-separated English tokens (case and punctuation kept).
len(c)

70025

In [20]:
# Number of sentence pairs left after the length filter.
len(df)

32814

In [21]:
# Same row-count check as the previous cell (duplicate cell).
len(df)

32814

In [22]:

import sentencepiece as spm

class PreprocessText(nn.Module):
    """Trains one SentencePiece BPE tokenizer per language and turns the
    `en` / `es` columns of a DataFrame into padded id tensors.

    The class subclasses nn.Module but holds no parameters.

    Args:
        df: DataFrame with string columns `en` and `es`. `build()` adds columns
            to this same object.
        vocab_size: vocabulary size requested from each tokenizer.
        model_prefix: prefix of the model files written to the working directory
            (`<prefix>_en.model`, `<prefix>_es.model`).
    """

    def __init__(self, df, vocab_size=8000, model_prefix="bpe"):
        super(PreprocessText, self).__init__()
        self.df = df
        self.vocab_size = vocab_size
        self.model_prefix = model_prefix

        # Special-token ids, passed to the SentencePiece trainer and used for padding.
        self.pad_id = 0
        self.unk_id = 1
        self.bos_id = 2
        self.eos_id = 3

        # Train BPE tokenizer
        self.train_bpe()

        # Load trained tokenizer
        self.sp_en = spm.SentencePieceProcessor(model_file=f"{model_prefix}_en.model")
        self.sp_es = spm.SentencePieceProcessor(model_file=f"{model_prefix}_es.model")

    def _train_language(self, lang):
        """Write the `lang` column to `<lang>_corpus.txt` and train a BPE model on it."""
        corpus_path = f"{lang}_corpus.txt"
        with open(corpus_path, "w", encoding="utf-8") as f:
            f.write("\n".join(self.df[lang].astype(str).tolist()))

        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=f"{self.model_prefix}_{lang}",
            vocab_size=self.vocab_size,
            model_type="bpe",
            pad_id=self.pad_id,
            unk_id=self.unk_id,
            bos_id=self.bos_id,
            eos_id=self.eos_id
        )

    def train_bpe(self):
        """Train BPE model for both languages using SentencePiece"""
        for lang in ("en", "es"):
            self._train_language(lang)

    def tokenize_text(self, text, sp):
        """Lower-case `text`, strip ASCII punctuation and split it into BPE pieces.

        NOTE(review): the tokenizers are trained on the raw column text (case and
        punctuation intact) while encoding happens on this normalised form, and
        `string.punctuation` does not cover characters such as '¿' or '¡'.
        Confirm that this mismatch is intended.
        """
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        return sp.encode(text, out_type=str)

    def build_vocab(self, sp):
        """Get the vocabulary dictionary from the trained BPE model"""
        return {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}

    def text_to_sequence(self, tokens, vocab):
        """Map pieces to ids; pieces missing from `vocab` become `unk_id`."""
        return [vocab.get(token, self.unk_id) for token in tokens]

    def pad_sequences(self, seq, pad_index = 0):
        """Frame `seq` with BOS/EOS and right-pad it to a fixed width.

        The body is truncated to `self.max_len` ids, so every result has exactly
        `self.max_len + 2` entries.

        Args:
            seq: list of token ids without special tokens.
            pad_index: id used for the padding positions (previously ignored in
                favour of `self.pad_id`).
        """
        body = seq[:self.max_len]
        framed = [self.bos_id] + body + [self.eos_id]
        return framed + [pad_index] * (self.max_len - len(body))

    def build(self):
        """Tokenize, index and pad both languages, then slice training tensors.

        Returns:
            Tuple `(df, source, target_ip, target_op, en_vocab, es_vocab, width)`:
            - `df`: the input frame with the intermediate columns added.
            - `source`: padded English ids with the leading BOS column removed.
            - `target_ip`: padded Spanish ids without the last column (decoder input).
            - `target_op`: padded Spanish ids without the first column (labels).
            - `en_vocab` / `es_vocab`: piece -> id dictionaries.
            - `width`: `self.max_len + 2`, the width of the padded rows.
        """
        # Tokenize text
        self.df['enn'] = self.df['en'].apply(lambda x: self.tokenize_text(x, self.sp_en))
        self.df['ess'] = self.df['es'].apply(lambda x: self.tokenize_text(x, self.sp_es))

        # Build vocab
        self.en_vocab = self.build_vocab(self.sp_en)
        self.es_vocab = self.build_vocab(self.sp_es)

        # Convert to sequences
        self.df['en_num'] = self.df['enn'].apply(lambda x: self.text_to_sequence(x, self.en_vocab))
        self.df['es_num'] = self.df['ess'].apply(lambda x: self.text_to_sequence(x, self.es_vocab))

        # Longest sequence over both languages, plus 2.
        self.max_len = max(self.df['en_num'].apply(len).max() + 2, self.df['es_num'].apply(len).max() + 2)

        # Pad sequences
        pad_index_en = self.en_vocab['<pad>']
        pad_index_es = self.es_vocab['<pad>']

        self.df['en_paded'] = self.df['en_num'].apply(lambda x : self.pad_sequences(x, pad_index_en))
        self.df['es_paded'] = self.df['es_num'].apply(lambda x : self.pad_sequences(x, pad_index_es))

        # Convert to tensors
        X = torch.tensor(self.df['en_paded'].to_list(), dtype=torch.long)
        y = torch.tensor(self.df['es_paded'].to_list(), dtype=torch.long)

        # Create training data
        source = X[:, 1:]
        target_ip = y[:, :-1]
        target_op = y[:, 1:]

        return self.df, source, target_ip, target_op, self.en_vocab, self.es_vocab, self.max_len + 2

# Train the tokenizers and build the padded tensors. `df` is rebound to the frame
# returned by build(), which carries the added token / id / padded-id columns.
pt = PreprocessText(df)
df, src, tgt_ip, tgt_op, en_vocab, es_vocab, max_len = pt.build()

In [23]:
# Width of the padded id rows as returned by build() (BOS/EOS and padding included).
max_len

161

In [24]:
# Show the raw English text of the first remaining row.
print(df.iloc[0]['en'])

The family of Dashwood had long been settled in Sussex.


In [25]:
# (en_vocab)

In [26]:
# Inspect the columns added by pt.build(): BPE pieces, ids and padded ids.
df.head()

Unnamed: 0,en,es,len1,len2,enn,ess,en_num,es_num,en_paded,es_paded
4,The family of Dashwood had long been settled i...,La familia Dashwood llevaba largo tiempo afinc...,10,9,"[▁the, ▁family, ▁of, ▁d, ash, wood, ▁had, ▁lon...","[▁la, ▁familia, ▁d, ash, wood, ▁llevaba, ▁larg...","[10, 1444, 30, 28, 744, 1121, 97, 432, 229, 23...","[31, 1647, 4, 1147, 1020, 1570, 1474, 420, 653...","[2, 10, 1444, 30, 28, 744, 1121, 97, 432, 229,...","[2, 31, 1647, 4, 1147, 1020, 1570, 1474, 420, ..."
5,"Their estate was large, and their residence wa...","Su propiedad era de buen tamaño, y en el centr...",40,41,"[▁their, ▁estate, ▁was, ▁large, ▁and, ▁their, ...","[▁su, ▁propiedad, ▁era, ▁de, ▁buen, ▁tamaño, ▁...","[260, 3647, 69, 1087, 27, 260, 5996, 69, 109, ...","[48, 5069, 201, 10, 319, 4260, 24, 32, 38, 357...","[2, 260, 3647, 69, 1087, 27, 260, 5996, 69, 10...","[2, 48, 5069, 201, 10, 319, 4260, 24, 32, 38, ..."
6,The late owner of this estate was a single man...,El último dueño de esta propiedad había sido u...,34,36,"[▁the, ▁late, ▁owner, ▁of, ▁this, ▁estate, ▁wa...","[▁el, ▁último, ▁dueño, ▁de, ▁esta, ▁propiedad,...","[10, 1894, 4794, 30, 140, 3647, 69, 6, 2561, 2...","[38, 1965, 3203, 10, 342, 5069, 142, 605, 46, ...","[2, 10, 1894, 4794, 30, 140, 3647, 69, 6, 2561...","[2, 38, 1965, 3203, 10, 342, 5069, 142, 605, 4..."
7,"But her death, which happened ten years before...","Pero la muerte de ella, ocurrida diez años ant...",54,56,"[▁but, ▁her, ▁death, ▁which, ▁happened, ▁ten, ...","[▁pero, ▁la, ▁muerte, ▁de, ▁ella, ▁ocur, rida,...","[136, 96, 1185, 134, 1501, 1313, 1021, 377, 90...","[188, 31, 1170, 10, 308, 977, 1900, 1590, 836,...","[2, 136, 96, 1185, 134, 1501, 1313, 1021, 377,...","[2, 188, 31, 1170, 10, 308, 977, 1900, 1590, 8..."
8,"In the society of his nephew and niece, and th...","En compañía de su sobrino y sobrina, y de los ...",18,21,"[▁in, ▁the, ▁society, ▁of, ▁his, ▁ne, phe, w, ...","[▁en, ▁compañía, ▁de, ▁su, ▁sob, r, ino, ▁y, ▁...","[43, 10, 3015, 30, 90, 161, 7033, 7948, 27, 32...","[32, 1852, 10, 48, 1930, 7937, 370, 24, 3510, ...","[2, 43, 10, 3015, 30, 90, 161, 7033, 7948, 27,...","[2, 32, 1852, 10, 48, 1930, 7937, 370, 24, 351..."


In [27]:
# Largest token id in the source tensor; it must stay below the embedding table size.
src.max().item()

7999

In [28]:
BATCH_SIZE = 32
SPLIT_SEED = 42  # fixes the train/validation partition across kernel restarts

full_dataset = TensorDataset(src, tgt_ip, tgt_op)

# Size the split from the dataset itself so it cannot drift from the tensors.
total = len(full_dataset)
train = int(0.9 * total)
val = total-train
train_dataset, val_dataset = random_split(
    full_dataset, [train, val], generator=torch.Generator().manual_seed(SPLIT_SEED)
)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
# No shuffling for validation: the loss does not depend on order, and the sample
# translations printed from the first batches stay comparable between epochs.
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = False)

In [29]:
class TrainingandInference(nn.Module):
    """Owns the translation model, its optimizer and loss, and provides the
    training loop, validation pass, greedy decoding and sample printing.

    Args:
        en_vocab: piece -> id mapping for the source language.
        es_vocab: piece -> id mapping for the target language.
        max_len: padded row width; the model is built for `max_len - 1` positions.
        lr: Adam learning rate.
    """

    def __init__(self, en_vocab, es_vocab, max_len, lr=0.00005):
        super(TrainingandInference, self).__init__()
        self.en_vocab = en_vocab
        self.es_vocab = es_vocab
        # Special pieces dropped from the text printed by show_text().
        self.ignore = ['<pad>', '<unk>', '<s>', '</s>']
        self.pad_id = 0
        self.unk_id = 1
        self.bos_id = 2
        self.eos_id = 3

        vocab_size_source = len(en_vocab)
        vocab_size_target = len(es_vocab)
        seq_length_source = max_len - 1
        seq_length_target = max_len - 1
        d_model = 512
        heads = 8
        dropout = 0.1
        dff = 2048

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = MyTransformer(
            vocab_size_source, vocab_size_target,
            seq_length_source, seq_length_target,
            self.device, d_model, heads, dropout, dff
        ).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        # Padding positions do not contribute to the loss.
        self.criteria = nn.CrossEntropyLoss(ignore_index=self.pad_id)
        self.model.apply(self.init_weights)

        self.inverse_en_vocab = {v: k for k, v in en_vocab.items()}
        self.inverse_es_vocab = {v: k for k, v in es_vocab.items()}

    def init_weights(self, m):
        """Xavier-uniform weights and zero biases for Linear; N(0, 0.02) for Embedding."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0, std=0.02)

    def _batch_loss(self, source, target_ip, target_op):
        """Move one batch to the device, run the model and return the token-level loss."""
        source = source.to(self.device)
        target_ip = target_ip.to(self.device)
        target_op = target_op.to(self.device)

        output = self.model(source, target_ip)
        # Flatten to [batch * length, vocab] vs [batch * length] for CrossEntropyLoss.
        output = output.reshape(-1, output.size(-1))
        return self.criteria(output, target_op.reshape(-1))

    def train(self, train_loader=True, val_loader=None, epochs=None):
        """Train for `epochs` epochs, validating after each one.

        This method shares its name with `nn.Module.train(mode)`, which
        `nn.Module.eval()` calls as `self.train(False)`. A boolean first argument
        is therefore forwarded to `nn.Module.train` so that `.eval()` and
        `.train(True/False)` keep working on this object.

        Args:
            train_loader: iterable of `(source, target_ip, target_op)` batches.
            val_loader: iterable of validation batches in the same format.
            epochs: number of passes over `train_loader`.

        Raises:
            TypeError: if a loader is given without `val_loader` and `epochs`.
        """
        if isinstance(train_loader, bool):
            return super().train(train_loader)
        if val_loader is None or epochs is None:
            raise TypeError("train() requires train_loader, val_loader and epochs")

        self.epochs = epochs
        for self.epoch in range(epochs):
            train_loss = 0
            # validation() leaves the model in eval mode, so re-enable training mode.
            self.model.train()
            for source, target_ip, target_op in train_loader:
                self.optimizer.zero_grad()
                loss = self._batch_loss(source, target_ip, target_op)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                train_loss += loss.item()

            self.validation(val_loader)
            # max(..., 1) guards against an empty loader.
            print(f"Epoch: {self.epoch + 1}\tTrain Loss: {train_loss / max(len(train_loader), 1)}")

    def greedy_decode(self, source, max_length = 50):
        """Generate target ids for `source` by taking the arg-max token each step.

        Every sequence starts with BOS. A sequence that has produced EOS is
        padded from then on, and decoding stops once all sequences are finished
        or `max_length` steps were taken. `max_length` is capped at the size of
        the decoder's look-ahead mask.

        Returns:
            Long tensor [batch, generated_len] including the leading BOS. Puts the
            model in eval mode as a side effect.
        """
        self.model.eval()
        batch_size = source.size(0)
        max_length = min(max_length, self.model.decoder.casual_mask.size(0))

        with torch.no_grad():
            (encoder_out, src_mask) = self.model.encoder(source.to(self.device))

            decoder_input = torch.full((batch_size, 1), self.bos_id, device = self.device, dtype = torch.long)
            finished = torch.zeros((batch_size, 1), dtype = torch.bool, device = self.device)

            for _ in range(max_length):
                cur_len = decoder_input.shape[1]
                target_padding_mask = (decoder_input != self.pad_id).unsqueeze(1).unsqueeze(2)
                casual_mask = self.model.decoder.casual_mask[:cur_len, :cur_len].unsqueeze(0).unsqueeze(0)
                # Padding AND look-ahead; the combined mask was previously computed
                # but the bare look-ahead mask was passed instead.
                tgt_mask = target_padding_mask.bool() & casual_mask.bool()

                logits = self.model.decoder(decoder_input, encoder_out, src_mask, tgt_mask)
                next_token = logits[:, -1].argmax(-1, keepdim = True)
                # Sequences that already ended only receive padding.
                next_token = next_token.masked_fill(finished, self.pad_id)
                decoder_input = torch.cat([decoder_input, next_token], dim = -1)

                finished = finished | (next_token == self.eos_id)
                if finished.all():
                    break
            return decoder_input

    def validation(self, val_loader):
        """Print the mean validation loss and sample translations from the first four batches."""
        self.model.eval()
        val_loss = 0
        with torch.no_grad():
            for i, (source, target_ip, target_op) in enumerate(val_loader):
                val_loss += self._batch_loss(source, target_ip, target_op).item()

                if i < 4:
                    predictions = self.greedy_decode(source)
                    self.show_text(source, target_ip, predictions)

            # max(..., 1) guards against an empty loader.
            print(f"Validation Loss: {val_loss / max(len(val_loader), 1)}")

    def show_text(self, source, target_ip, target_op):
        """Print the first example of a batch as text.

        Args:
            source: source id tensor, decoded with the English vocabulary.
            target_ip: reference target ids, decoded with the Spanish vocabulary.
            target_op: ids printed as the prediction, decoded with the Spanish vocabulary.
        """
        def decode_bpe(tokens, vocab):
            pieces = [vocab[token_id] for token_id in tokens.tolist()]
            text = "".join(piece for piece in pieces if piece not in self.ignore)
            text = text.replace("▁", " ")  # Convert BPE space marker to actual space
            return text.strip()

        # Only the first example is printed, so only that row is decoded.
        if len(source) == 0:
            return

        source_text = decode_bpe(source[0], self.inverse_en_vocab)
        target_ip_text = decode_bpe(target_ip[0], self.inverse_es_vocab)
        target_op_text = decode_bpe(target_op[0], self.inverse_es_vocab)

        print("\n" + "-" * 80)
        print(f"ENGLISH          : {source_text}")
        print(f"TARGET SPANISH   : {target_ip_text}")
        print(f"PREDICTED SPANISH: {target_op_text}")
        print("-" * 80)

In [30]:
# Number of entries in the Spanish piece -> id mapping returned by pt.build().
print(len(es_vocab))

8000


In [None]:
# Build the trainer (model, optimizer, loss) and run 40 epochs; each epoch ends
# with a validation pass that prints sample translations.
TaI = TrainingandInference(en_vocab, es_vocab, max_len)
TaI.train(train_loader,val_loader,40)