In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
from torchsummary import summary
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence
from tokenizers import Tokenizer
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import Transformer

class MultiHeadAttention(nn.Module):
    def __init__(self, d_k, d_model, n_heads, max_len, causal=False):
        super().__init__()
        
        self.d_k = d_k
        self.n_heads = n_heads

        self.key = nn.Linear(d_model,d_k * n_heads)
        self.query = nn.Linear(d_model,d_k * n_heads)
        self.values = nn.Linear(d_model,d_k * n_heads)
        
        self.fc = nn.Linear(d_k*n_heads,d_model)

        self.causal = causal

        if causal:
            cm = torch.tril(torch.ones(max_len,max_len))
            self.register_buffer("causal_mask",cm.view(1,1,max_len,max_len))
    def forward(self, q, k, v, pad_mask=None):
        q = self.query(q)
        k = self.key(k)
        v = self.values(v)

        N = q.shape[0]
        T_output = q.shape[1]
        T_input = k.shape[1]

        q = q.view(N,T_output,self.n_heads,self.d_k).transpose(1,2)
        k = k.view(N,T_input,self.n_heads,self.d_k).transpose(1,2)
        v = v.view(N,T_input,self.n_heads,self.d_k).transpose(1,2)

        # Compute attention
        # (N,h,T,d_k) x (N,h,d_k,T) --> (N,h,T,T)

        
        attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

        if pad_mask is not None:
            attn_scores=attn_scores.masked_fill(pad_mask[:,None,None,:]==0,float('-inf'))

        if self.causal:
            attn_scores = attn_scores.masked_fill(

            self.causal_mask[:, :, :T_output, :T_input] == 0,float('-inf'))

        attn_weights = F.softmax(attn_scores, dim=-1) 

       


        A = attn_weights @ v
        A = A.transpose(1, 2)
        
        A = A.contiguous().view(N, T_output, self.d_k * self.n_heads)
        return self.fc(A)

class EncoderBlock(nn.Module):
    def __init__(self,d_k,d_model,n_heads,max_len,dropout_prob=0.1):
        super().__init__()

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_k,d_model,n_heads,max_len,causal = False)
        self.ann = nn.Sequential(
            nn.Linear(d_model,d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4,d_model),
            nn.Dropout(dropout_prob)
        )
        self.dropout = nn.Dropout(p = dropout_prob)

    def forward(self,x,pad_mask=None):
        x = self.ln1(x + self.mha(x,x,x,pad_mask))
        x = self.ln2(x + self.ann(x))
        x = self.dropout(x)

        return x
    
class DecoderBlock(nn.Module):
    def __init__(self,d_k,d_model,n_heads,max_len,dropout_prob=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
        self.mha1 = MultiHeadAttention(d_k,d_model,n_heads,max_len,causal= True)
        self.mha2 = MultiHeadAttention(d_k,d_model,n_heads,max_len,causal=False)
        self.ann = nn.Sequential(
            nn.Linear(d_model,d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4,d_model),
            nn.Dropout(dropout_prob)
        )
        self.dropout = nn.Dropout(p = dropout_prob)

    def forward(self,enc_output,dec_input,enc_mask = None,dec_mask = None):
        x = self.ln1(dec_input + self.mha1(dec_input,dec_input,dec_input,dec_mask))
        x = self.ln2(x+self.mha2(x,enc_output,enc_output,enc_mask))
        x = self.ln3(x + self.ann(x))
        x = self.dropout(x)
        return x

class PositionalEncoding(nn.Module):
    def __init__(self, d_model,max_len = 2048,dropout_prob = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p = dropout_prob)

        position = torch.arange(max_len).unsqueeze(1)
        exp_term = torch.arange(0,d_model,2)
        div_term = torch.exp(exp_term*(-math.log(10000.0)/d_model))
        pe = torch.zeros(1,max_len,d_model)
        pe[0,:,0::2] = torch.sin(position*div_term)
        pe[0,:,1::2] = torch.cos(position*div_term)
        self.register_buffer('pe',pe)

    def forward(self,x):
        #x.shape :N x T x D
        x = x + self.pe[:,:x.size(1),:]
        return self.dropout(x)
        
class Encoder(nn.Module):
    def __init__(self,vocab_size,max_len,d_k,d_model,n_heads,n_layers,dropout_prob):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model,max_len,dropout_prob)
        transformer_block = [
            EncoderBlock(d_k,d_model,n_heads,max_len,dropout_prob) for _ in range(n_layers)
        ]
        self.transformer_blocks = nn.Sequential(*transformer_block)
        self.ln = nn.LayerNorm(d_model)

    def forward(self,x,pad_mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x,pad_mask)
        x = self.ln(x)
        return x

class Decoder(nn.Module):
    def __init__(self,vocab_size,max_len,d_k,d_model,n_heads,n_layers,dropout_prob):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model,max_len,dropout_prob)
        transformer_blocks = [
            DecoderBlock(d_k,d_model,n_heads,max_len,dropout_prob) for _ in range(n_layers)
        ]
        self.transformers_blocks = nn.Sequential(*transformer_blocks)
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model,vocab_size)

    def forward(self,enc_output,dec_input,enc_mask = None,dec_mask = None):
        x = self.embedding(dec_input)
        x = self.pos_encoding(x)
        for block in self.transformers_blocks:
            x = block(enc_output,x,enc_mask,dec_mask)
        x = self.ln(x)
        x = self.fc(x)
        return x
    
class Transformer(nn.Module):
    def __init__(self,encoder,decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
    def forward(self,enc_input,dec_input,enc_mask,dec_mask):
        enc_output = self.encoder(enc_input,enc_mask)
        dec_output = self.decoder(enc_output,dec_input,enc_mask,dec_mask)
        return dec_output


In [1]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

NameError: name 'torch' is not defined

In [6]:
from datasets import load_dataset
raw_dataset = load_dataset(
    'csv',
    data_files=r'..\Datasets\Dataset.csv',  
    delimiter=',',      
    column_names=["src", "tgt"]  
)

Generating train split: 100001 examples [00:00, 183270.77 examples/s]


In [7]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 100001
    })
})

In [8]:
split = raw_dataset['train'].train_test_split(test_size=.2,seed=42)

In [9]:
split

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 20001
    })
})

In [11]:
tokenizer = Tokenizer.from_file(r"Tokenizer\test_tokenizer.json")


src_sentence = split['train'][0]['src']
tgt_sentence = split['train'][0]['tgt']
inputs = tokenizer.encode(src_sentence)
inputs.ids
tokenizer.decode(inputs.ids)

Exception: The system cannot find the path specified. (os error 3)

In [12]:
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

tok = Tokenizer.from_file(r"..\Tokenizer\test_tokenizer.json")

tok.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[
        ("<s>", tok.token_to_id("<s>")),
        ("</s>", tok.token_to_id("</s>")),
    ],
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    unk_token="<unk>",
    cls_token="<s>",
    sep_token="</s>",
)


In [13]:
max_input_length = 32
max_target_length = 32

def preprocess_function(batch):

    model_inputs = tokenizer(
        batch['src'],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        add_special_tokens=True 
    )

    
    labels = tokenizer(
            text_target=batch['tgt'],  
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True 
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_dataset = split.map(preprocess_function,batched=True,remove_columns=split['train'].column_names,)

Map: 100%|██████████| 80000/80000 [00:04<00:00, 16146.69 examples/s]
Map: 100%|██████████| 20001/20001 [00:01<00:00, 13023.20 examples/s]


In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20001
    })
})

In [16]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer)




In [17]:
batch = data_collator([tokenized_dataset['train'][i] for i in range(0,5)])
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [18]:
batch.pop("token_type_ids", None)
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [19]:
batch['labels']

tensor([[    2,     6,   338,   160,   125, 25220,    18,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2,     6,    59,    14,   149,    98,  8458,    11,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2,     5,   145, 23707,    12, 11660,    11,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2,     6,    52,    27,  5093,    99,    14,  1206,    11,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,  

In [20]:
tokenizer.all_special_ids

[2, 3, 1, 0]

In [21]:
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>']

In [22]:
encoding = tokenizer("Hello world!", add_special_tokens=True)
encoding

{'input_ids': [2, 4210, 2369, 42, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    tokenized_dataset['train'],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

valid_loader = DataLoader(
    tokenized_dataset['test'],
    batch_size=32,
    collate_fn=data_collator
)

In [24]:
for batch in train_loader:
    for k ,v in batch.items():
        print("k:",k,"v.shape",v.shape)
    break

k: input_ids v.shape torch.Size([32, 32])
k: token_type_ids v.shape torch.Size([32, 32])
k: attention_mask v.shape torch.Size([32, 32])
k: labels v.shape torch.Size([32, 32])


In [25]:
tokenizer.vocab_size

34169

In [26]:
encoder = Encoder(vocab_size=tokenizer.vocab_size+1,
                  max_len=512,
                  d_k=16,
                  d_model=64,
                  n_heads=4,
                  n_layers=2,
                  dropout_prob=0.1)

decoder = Decoder(vocab_size=tokenizer.vocab_size+1,
                  max_len=512,
                  d_k=16,
                  d_model=64,
                  n_heads=4,
                  n_layers=2,
                  dropout_prob=0.1)
transformer = Transformer(encoder,decoder)

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [28]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(transformer.parameters())

In [29]:
from datetime import datetime
from tqdm import tqdm

def train(model,criterion,optimizer,train_loader,valid_loader,epochs):
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = []
        for batch in tqdm(train_loader, desc=f"Epoch {it+1}/{epochs}", leave=False):
            batch.pop("token_type_ids", None)
            batch = {k : v.to(device) for k,v in batch.items()}

            optimizer.zero_grad()

            enc_input = batch['input_ids']
            enc_mask = batch['attention_mask']
            targets = batch['labels']

            dec_input = targets.clone().detach()
            dec_input = torch.roll(dec_input,shifts=1,dims=1)

            dec_mask = torch.ones_like(dec_input)
            dec_mask = dec_mask.masked_fill(dec_input==tokenizer.pad_token_id,0)

            outputs = model(enc_input,dec_input,enc_mask,targets)
            loss = criterion(outputs.transpose(2,1),targets)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        train_loss=np.mean(train_loss)
        model.eval()

        test_loss = []
        for batch in tqdm(valid_loader, desc="Validating", leave=False):
            batch.pop("token_type_ids", None)
            batch = {k : v.to(device) for k,v in batch.items()}

            optimizer.zero_grad()

            enc_input = batch['input_ids']
            enc_mask = batch['attention_mask']
            targets = batch['labels']

            dec_input = targets.clone().detach()
            dec_input = torch.roll(dec_input,shifts=1,dims=1)

            dec_mask = torch.ones_like(dec_input)
            dec_mask = dec_mask.masked_fill(dec_input==tokenizer.pad_token_id,0)

            outputs = model(enc_input,dec_input,enc_mask,targets)
            loss = criterion(outputs.transpose(2,1),targets)
            loss.backward()
            optimizer.step()
            test_loss.append(loss.item())
        test_loss=np.mean(test_loss)
        train_losses[it]=train_loss
        test_losses[it]=test_loss

        dt = datetime.now()-t0
        print(f"Epoch {it+1}/{epochs}, Train Loss : {train_loss:.4f}, Test_Loss: {test_loss:.4f}, Duration: {dt}")
    return train_losses,test_losses


In [None]:
transformer.to(device)

train_losses ,test_losses = train(
    transformer,criterion,optimizer,train_loader,valid_loader,20
)

                                                               

Epoch 1/20, Train Loss : 3.0324, Test_Loss: 2.1645, Duration: 0:06:27.416220


                                                               

Epoch 2/20, Train Loss : 2.0439, Test_Loss: 1.5631, Duration: 0:05:35.353413


                                                               

Epoch 3/20, Train Loss : 1.7007, Test_Loss: 1.2421, Duration: 0:04:53.182989


                                                               

Epoch 4/20, Train Loss : 1.4997, Test_Loss: 1.0464, Duration: 0:04:53.255092


                                                               

Epoch 5/20, Train Loss : 1.3645, Test_Loss: 0.9180, Duration: 0:04:53.867160


                                                               

Epoch 6/20, Train Loss : 1.2691, Test_Loss: 0.8251, Duration: 0:04:53.130995


                                                               

Epoch 7/20, Train Loss : 1.1947, Test_Loss: 0.7573, Duration: 0:04:52.862825


                                                               

Epoch 8/20, Train Loss : 1.1357, Test_Loss: 0.7069, Duration: 0:04:53.089075


                                                               

Epoch 9/20, Train Loss : 1.0856, Test_Loss: 0.6658, Duration: 0:04:54.146076


                                                                

Epoch 10/20, Train Loss : 1.0471, Test_Loss: 0.6339, Duration: 0:06:07.864912


                                                                

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def plot_loss(train_losses, test_losses):
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(epochs, train_losses, label='Train Loss', marker='o')
    plt.plot(epochs, test_losses, label='Validation Loss', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
plot_loss(train_losses ,test_losses )

NameError: name 'train_losses' is not defined

In [None]:
trainable_params = sum(p.numel() for p in transformer.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 6828538


In [None]:
torch.save(transformer.state_dict(), "transformer_seq2seq.pth")

In [None]:
def translate(input_sentence, lang_token_id):
    # Add the language token before the sentence
    input_with_lang = f"{lang_token_id} {input_sentence}"

    # Tokenize input with language token
    enc_input = tokenizer(input_with_lang, return_tensors='pt')
    enc_input = {k: v.to(device) for k, v in enc_input.items()}
    
    enc_output = encoder(enc_input['input_ids'], enc_input['attention_mask'])

    # Decoder input starts with BOS token (2), followed by target lang token
    dec_input_ids = torch.tensor([[2, lang_token_id]], device=device)
    dec_attn_mask = torch.ones_like(dec_input_ids)

    for _ in range(32):
        dec_output = decoder(
            enc_output,
            dec_input_ids,
            enc_input['attention_mask'],
            dec_attn_mask
        )
        prediction_id = torch.argmax(dec_output[:, -1, :], dim=-1)
        dec_input_ids = torch.cat((dec_input_ids, prediction_id.view(1, 1)), dim=1)
        dec_attn_mask = torch.ones_like(dec_input_ids)

        if prediction_id.item() == tokenizer.eos_token_id:
            break

    # Skip BOS and lang token in output
    translation = tokenizer.decode(dec_input_ids[0, 2:], skip_special_tokens=True)
    print(translation)


In [None]:
translate("One day I will find you",5)

Una día te encontrará .


In [None]:
es = 5
fr = 6

In [None]:
translate("I woke up today quite early",es)

Hoy millas despierto las cinco .
