In [33]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_de_dataset = load_dataset("wmt14", "de-en",  keep_in_memory=True)
raw_fr_dataset = load_dataset("wmt14", "fr-en",  keep_in_memory=True)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

Using the latest cached version of the dataset since wmt14 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'de-en' at /Users/fneffati/.cache/huggingface/datasets/wmt14/de-en/0.0.0/b199e406369ec1b7634206d3ded5ba45de2fe696 (last modified on Thu May  9 09:17:47 2024).


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]



In [34]:
print(tokenizer.vocab_size)

32100


In [35]:
import numpy as np

train_set_np = np.array(raw_de_dataset['train'])
test_set_np  = np.array(raw_de_dataset['test'])

In [36]:
train_set_np[9]

{'translation': {'de': 'Zu den Attentatsopfern, die es in jüngster Zeit in Sri Lanka zu beklagen gab, zählt auch Herr Kumar Ponnambalam, der dem Europäischen Parlament erst vor wenigen Monaten einen Besuch abgestattet hatte.',
  'en': 'One of the people assassinated very recently in Sri Lanka was Mr Kumar Ponnambalam, who had visited the European Parliament just a few months ago.'}}

In [37]:
def tokenize_de(example, src_lang_token, max_length=50):
    src_text = src_lang_token + " " + example["translation"]["de"]
    tgt_text = example["translation"]["en"]

    src_encoding = tokenizer(src_text, truncation=True, max_length=max_length, padding='max_length',  return_tensors="pt")
    tgt_encoding = tokenizer(tgt_text, truncation=True, max_length=max_length, padding='max_length', return_tensors="pt")

    src_ids = src_encoding["input_ids"]
    tgt_ids = tgt_encoding["input_ids"]
    
    result = src_ids, tgt_ids

    return result

In [620]:
counter = 0
train_set_tokenized = []
for item in train_set_np:
    train_set_tokenized.append(tokenize_de(item, "<2de>"))
    
    counter += 1
    if counter == 200000:
        break
    
train_set_tokenized_flat = [[item for sublist in inner_list for item in sublist] for inner_list in train_set_tokenized]

In [621]:
counter = 0 

test_set_tokenized = []
for item in test_set_np:
    test_set_tokenized.append(tokenize_de(item, "<2de>"))
    
test_set_tokenized_flat = [[item for sublist in inner_list for item in sublist] for inner_list in test_set_tokenized]

# Model Architecture 

In [20]:
import torch 
import torch.nn as nn

## Encoder 

## Input Embedding

In [21]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        
        assert (self.head_dim * heads == embed_size), "Embedding size needs to be divisible by heads"
        self.values = nn.Linear(self.head_dim, self.head_dim)
        self.keys = nn.Linear(self.head_dim, self.head_dim)
        self.queries = nn.Linear(self.head_dim, self.head_dim)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)
        
    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len =   values.shape[1], keys.shape[1], query.shape[1]
        
    
        #split embeddings into self.heads pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)
        
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)
        
    
        
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query len, heads, heads_dim)
        # keys shape: (N, key_len, heads, heads_dim)
        # energy shape: (N, heads, query_len, key_len
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))
            
        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3)
        out = torch.einsum("nhql,nlhd->nqhd",[attention, values]).reshape(
            N, query_len, self.heads*self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # (N, query_len, heads, head_dim) then flatten last two dimensions
        
        out = self.fc_out(out)
        return out        

In [22]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embed_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

## Encoder

In [23]:
class Encoder(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 embed_size,
                 num_layers,
                 heads,
                 device,
                 forward_expansion,
                 dropout,
                 max_length,):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size, heads, dropout=dropout, forward_expansion=forward_expansion
                )
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        
        # print(f"Input tensor x shape: {x.shape}")
        # print(f"Input tensor x min value: {x.min().item()}")
        # print(f"Input tensor x max value: {x.max().item()}")
        # print(f"Word embedding vocabulary size: {self.word_embedding.weight.size(0)}")
        
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
                
        for layer in self.layers:
            out = layer(out, out ,out, mask)
        return out 

### Decder

In [24]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, value, key , src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out 
    
class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
             for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
        
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
        out = self.fc_out(x)
        return out

In [25]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, device="cpu", max_length=100):
        super(Transformer, self).__init__()
        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )
        
        self.decoder = Decoder(
            trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length
        )
        
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)
    
    def make_trg_mask(self, trg):
        # print(f"trg shape: {trg.shape}")
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )
        
        return trg_mask.to(self.device)
    
    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

### Test

In [26]:
device = torch.device("cpu")
print(device)

x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(
    device
)
trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 10
trg_vocab_size = 10
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(
    device
)
out = model(x, trg[:, :-1])
print(out.shape)

cpu
torch.Size([2, 7, 10])


In [27]:
device = torch.device("cpu")

a = torch.tensor( [ [1, 35, 2, 3, 110, 20, 0], [1, 19, 48, 12,  17, 13, 1] ] ).to(device)
b = torch.tensor( [[1, 35, 21, 15, 9,  3,  0], [1, 27, 15, 40,  10, 6,   1]] ).to(device)

src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 200
trg_vocab_size = 50
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(device)

out = model(a, b[:, :-1])
print(out.shape)

torch.Size([2, 6, 50])


In [587]:
# print(train_set_np[0])
print(len(train_set_tokenized[0][0]))
# text = tokenizer.decode(train_set_tokenized[0][1], skip_special_tokens=True)
# print(train_set_tokenized[0][0])
# print(text)

1


In [588]:
device = torch.device("cpu")
print(device)

x = train_set_tokenized[0][0].to(device)
trg = train_set_tokenized[0][1].to(device)

print(x)

src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 25000
trg_vocab_size = 25000
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(
    device
)
out = model(x, trg[:, :-1])
print(out.shape)


cpu
tensor([[    3,     2,   357,   221,  3155, 15158, 24860,    74, 11216,   425,
             7,  4267,    32,   221,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
torch.Size([1, 49, 25000])


In [13]:
import torch
from torch.utils.data import DataLoader

# Create data loaders
train_loader = DataLoader(train_set_tokenized_flat, batch_size=32, shuffle=True)
val_loader = DataLoader(test_set_tokenized_flat, batch_size=32)

NameError: name 'train_set_tokenized_flat' is not defined

In [623]:
# Define the model, loss function, and optimizer
device = torch.device("cpu")
model = Transformer(src_vocab_size=35000, 
                    trg_vocab_size=35000, 
                    src_pad_idx=0, 
                    trg_pad_idx=0).to(device)

criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

In [624]:
from tqdm import tqdm
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = 0.0
    model.train()
    for batch in tqdm(train_loader):
        input_ids, labels = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, labels[:, :-1])
        
        loss = criterion(outputs.view(-1, 35000), labels[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()        

    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(input_ids, labels[:, :-1])
            loss = criterion(outputs.view(-1, 35000), labels[:, 1:].contiguous().view(-1))
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}")

100%|██████████| 6250/6250 [46:08<00:00,  2.26it/s]  


Epoch 1/10, Train Loss: 5.46729545539856


100%|██████████| 6250/6250 [49:50<00:00,  2.09it/s]   


Epoch 2/10, Train Loss: 4.6196503998565674


100%|██████████| 6250/6250 [48:18<00:00,  2.16it/s]   


Epoch 3/10, Train Loss: 4.298205116729736


100%|██████████| 6250/6250 [45:12<00:00,  2.30it/s] 


Epoch 4/10, Train Loss: 4.051104640884399


100%|██████████| 6250/6250 [44:24<00:00,  2.35it/s]


Epoch 5/10, Train Loss: 3.8409506521224976


100%|██████████| 6250/6250 [43:50<00:00,  2.38it/s]


Epoch 6/10, Train Loss: 3.6653189277648925


100%|██████████| 6250/6250 [46:21<00:00,  2.25it/s] 


Epoch 7/10, Train Loss: 3.5161287268829344


100%|██████████| 6250/6250 [46:10<00:00,  2.26it/s]   


Epoch 8/10, Train Loss: 3.3885325021743773


100%|██████████| 6250/6250 [45:14<00:00,  2.30it/s]


Epoch 9/10, Train Loss: 3.277026579055786


100%|██████████| 6250/6250 [45:18<00:00,  2.30it/s] 


Epoch 10/10, Train Loss: 3.1787816304397585


In [635]:
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Input sentence
input_sentence = "<2de> ich bin in mein helfen"

# Tokenize and convert to tensor
input_ids = tokenizer.encode(input_sentence, return_tensors="pt")

# Create source mask
src_mask = model.make_src_mask(input_ids)

# Generate translation
model.eval()
with torch.no_grad():
    output = model(input_ids, input_ids[:, :-1])
    decoded_output = output.argmax(dim=-1)

# Decode output tensor
translation = tokenizer.decode(decoded_output.squeeze(), skip_special_tokens=False)

print("Input sentence:", input_sentence)
print("Translation:", translation)


Input sentence: <2de> ich bin in mein helfen
Translation: . I in</s></s>aer</s> the.</s>


One of the people assassinated very recently in Sri Lanka was Mr Kumar Ponnambalam, who had visited the European Parliament just a few months ago.')




In [626]:
torch.save(model.state_dict(), "10E2K.pth")