In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

from datasets import load_dataset
import os
import math
import matplotlib.pyplot as plt

In [None]:
data_dir = '/kaggle/input/en-de-dataset/'
print(os.listdir(data_dir))

['valid.en', 'valid.de', 'de_en.csv', 'train.de', 'train.en']


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



In [None]:
enc1 = tokenizer.encode("Hello, how can i help you?")
enc2 = tokenizer.encode("The moon is beautiful, isn't it?")

print(enc1, enc2)
# Some encoding chars ==> (, -> 2) , (? -> 31), (<s> -> 0)

[16816, 2, 406, 85, 787, 548, 41, 31, 0] [36, 11689, 19, 1442, 2, 4813, 22, 46, 56, 31, 0]


In [None]:
import pandas as pd

df = pd.read_csv(data_dir + '/de_en.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ENGLISH,GERMAN
0,0,hi,hallo
1,1,hi,gru gott
2,2,run,lauf
3,3,wow,potzdonner
4,4,wow,donnerwetter


In [None]:
en_sentences = df['ENGLISH'].astype(str).tolist()
de_sentences = df['GERMAN'].astype(str).tolist()

en_sentences = en_sentences
de_sentences = de_sentences


print(f"Total pairs: {len(en_sentences)}")

Total pairs: 152820


In [None]:
import random

group = list(zip(en_sentences[:10000], de_sentences[:10000]))
random.shuffle(group)

# 90% train, 10% validation
train_group = group[:int(len(group) * 0.9)]
val_group = group[int(len(group) * 0.9):]

print(f"Train size: {len(train_group)}")
print(f"Validation size: {len(val_group)}")

Train size: 9000
Validation size: 1000


In [None]:
class EnDePairDataset(Dataset):
    def __init__(self, en_sentences, de_sentences, tokenizer):
        self.en_sentences = en_sentences
        self.de_sentences = de_sentences

        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        "return (En tensor, De tensor)"
        self.en_sentence = self.en_sentences[idx]
        self.de_sentence = self.de_sentences[idx]

        en_encoded = self.tokenizer.encode(self.en_sentence)
        de_encoded = self.tokenizer.encode(self.de_sentence)
        return torch.tensor(en_encoded, dtype=torch.long), torch.tensor(de_encoded, dtype=torch.long)

In [None]:
train_ds = EnDePairDataset(en_sentences[:9000], de_sentences[:9000], tokenizer)
val_ds = EnDePairDataset(en_sentences[:1000], de_sentences[:1000], tokenizer)

print("Length of train_dataset, val_dataset:")
print(len(train_ds), len(val_ds))

print("First example:")

print(train_ds[0])
print(tokenizer.decode(train_ds[0][0]))
print(tokenizer.decode(train_ds[0][1]))

Length of train_dataset, val_dataset:
9000 1000
First example:
(tensor([16478,     0]), tensor([7475,  166,    0]))
hi</s>
hallo</s>


In [None]:
import torch

max_seq_len = 100
pad_idx = tokenizer.pad_token_id

def collate_fn(batch):
    "Return fixed size tensor"
    srcs, tgts = zip(*batch)
    batch_size = len(batch)

    src_batch = []
    tgt_batch = []

    for src in srcs:
        if len(src) < max_seq_len:
            "Padding"
            padded = torch.cat([src, torch.tensor([pad_idx] * (max_seq_len - len(src)), dtype=torch.long)])
        else:
            "Truncate"
            padded = src[:max_seq_len]
        src_batch.append(padded)

    for tgt in tgts:
        if len(tgt) < max_seq_len:
            padded = torch.cat([tgt, torch.tensor([pad_idx] * (max_seq_len - len(tgt)), dtype=torch.long)])
        else:
            padded = tgt[:max_seq_len]
        tgt_batch.append(padded)

    src_batch = torch.stack(src_batch)
    tgt_batch = torch.stack(tgt_batch)

    return src_batch, tgt_batch

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)

# (batch_size, seq)-> (64, 2150)
len(train_loader)

141

In [None]:
# Hyper parameters

lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_embd = 512
n_head = 4
n_layer = 4
dropout = 0.1
vocab_size = tokenizer.vocab_size
max_seq_len=100
pad_idx = tokenizer.pad_token_id

import copy
print(device)

cuda


In [None]:
class Embeddings(nn.Module):
  def __init__(self, vocab_size, n_embd):
    super(Embeddings, self).__init__()
    self.embedding = nn.Embedding(vocab_size, n_embd)
    self.scale = n_embd ** 0.5

  def forward(self, x):
    return self.embedding(x) * self.scale

In [None]:
for idx, (src, tgt) in enumerate(train_loader):
  x = src
  break

In [None]:
print(x.shape) # (B, S)

y = Embeddings(vocab_size, n_embd)(x)
print(y.shape) # (B, S, n_embd)


torch.Size([64, 100])
torch.Size([64, 100, 512])


In [None]:
class PositionalEncoding(nn.Module):
  def __init__(self, n_embd, max_len=100):
    super(PositionalEncoding, self).__init__()
    pe = torch.zeros(max_len, n_embd)
    pos = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, n_embd, 2) * -(math.log(10000.0) / n_embd)) # e ^ [-ln(10000) * (2i/ n_embd)]
    pe[:, 0::2] = torch.sin(pos * div_term)
    pe[:, 1::2] = torch.cos(pos * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self, x):
    return x + self.pe[:, :x.size(1)]

In [None]:
# shape of positionalEncoding and embeddings will be same..

pos_y = PositionalEncoding(n_embd)(y)
print(pos_y.shape) # (B, S, n_embd)

torch.Size([64, 100, 512])


In [None]:
def clones(module, N):
  "Create N identical layers."
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:
def attention(q, k, v, mask=None, dropout=None):
  "Compute Scaled Dot Product Attention (Attention score)"
  dim_k = k.size(-1)

  # Q (B, S, dim_k) , K (B, S, dim_k)
  # For Q @ K, we need to transpose K -> K (B, dim_k, S) ---> more precisely Q(S, D) * K (D, S) => (S, S)
  score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dim_k) # score = (Q* K) / sqrt(d_k)  --> single_layer (d_k) = n_embd || multi_layer(d_k) = n_embd / n_head
  if mask is not None:
    score = score.masked_fill(mask == 0, -1e9)
  att_w = score.softmax(dim=-1)
  if dropout is not None:
    att_w = dropout(att_w)


  # att_w (B, S, S) , V (B, S, d_k or n_embd [in case of single attention head])
  # att_w @ V ---> (B, S, d_k or n_embd)
  return torch.matmul(att_w, v), att_w

In [None]:
# Single layer attention

q = nn.Linear(n_embd, n_embd)(pos_y)
k = nn.Linear(n_embd, n_embd)(pos_y)
v = nn.Linear(n_embd, n_embd)(pos_y)

print(q.shape) # (B, S, n_embd)

attn_y, attn_w = attention(q, k, v)
print(attn_y.shape) # (B, S, n_embd)
print(attn_w.shape) # (B, S, S)

torch.Size([64, 100, 512])
torch.Size([64, 100, 512])
torch.Size([64, 100, 100])


In [None]:
class MultiHeadedAttention(nn.Module):
  def __init__(self, n_head, n_embd, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert n_embd % n_head == 0, "can't divide n_embd by n_head"
    self.n_head = n_head
    self.n_embd = n_embd
    self.dim_k = n_embd // n_head # d_k
    self.Ws = clones(nn.Linear(n_embd, n_embd), 4)
    self.attn = None
    self.dropout = nn.Dropout(dropout)

  def forward(self, q, k, v, mask=None):
    if mask is not None:
      mask = mask.unsqueeze(1)
    n_batches = q.size(0)

      
    # projecting q, k, v (passing through FC-linear layer)
    Q = self.Ws[0](q).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2) # Q = q @ W_q 
    K = self.Ws[1](k).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2) # K = k @ W_k
    V = self.Ws[2](v).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2) # V = v @ W_v

    # W_q, W_k, W_v are learnable weight matrices (from linear layer)

    x, self.attn = attention(Q, K, V, mask=mask, dropout=self.dropout)

    "Concatenating all heads"
    x = x.transpose(1, 2).contiguous().view(n_batches, -1, self.n_head * self.dim_k)

    return self.Ws[-1](x)

In [None]:
class LayerNorm(nn.Module):
  "Normalize features"
  def __init__(self, size, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.a_2 = nn.Parameter(torch.ones(size))
    self.b_2 = nn.Parameter(torch.zeros(size))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(1, keepdim=True)
    std = x.std(1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [None]:
class SublayerConnection(nn.Module):
  "Residual connection followed by layer norm"
  def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.layer_norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.layer_norm(x)))

In [None]:
class FeedForwardLayer(nn.Module):
  def __init__(self, n_embd, dropout):
    super(FeedForwardLayer, self).__init__()
    self.ff = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.ff(x)

In [None]:
class EncoderLayer(nn.Module):
  "Consist of multi-head attention and feed forward"
  def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.add_and_norm = clones(SublayerConnection(size, dropout), 2)
    self.size = size

  def forward(self, x, mask):
    x = self.add_and_norm[0](x, lambda x: self.self_attn(x, x, x, mask))
    x = self.add_and_norm[1](x, self.feed_forward)
    return x

In [None]:
class DecoderLayer(nn.Module):
  "Consist of multi-head attn, src_attn, feed forward"
  def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.add_and_norm = clones(SublayerConnection(size, dropout), 3)

  def forward(self, x, encoder_op, src_mask, tgt_mask):
    x = self.add_and_norm[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.add_and_norm[1](x, lambda x: self.src_attn(x, encoder_op, encoder_op, src_mask))
    x = self.add_and_norm[2](x, self.feed_forward)
    return x

In [None]:
class Encoder(nn.Module):
  def __init__(self, layer, N):
    super(Encoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [None]:
class Decoder(nn.Module):
  def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, encoder_op, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, encoder_op, src_mask, tgt_mask)
    return self.norm(x)

In [None]:
class Transfomer(nn.Module):
  def __init__(self):
    super(Transfomer, self).__init__()
    c = copy.deepcopy
    attn = MultiHeadedAttention(n_head, n_embd, dropout)
    ff = FeedForwardLayer(n_embd, dropout)
    position = PositionalEncoding(n_embd, max_seq_len)

    self.encoder = Encoder(EncoderLayer(n_embd, c(attn), c(ff), dropout), n_layer)
    self.decoder = Decoder(DecoderLayer(n_embd, c(attn), c(attn), c(ff), dropout), n_layer)
    self.src_embed = nn.Sequential(Embeddings(vocab_size, n_embd), c(position))
    self.tgt_embed = nn.Sequential(Embeddings(vocab_size, n_embd), c(position))
    self.generator = nn.Linear(n_embd, vocab_size)

  def forward(self, src, tgt, src_mask, tgt_mask):
    "Process src and tgt sequences."
    encoded_src = self.encode(src, src_mask)
    decoded_tgt = self.decode(encoded_src, src_mask, tgt, tgt_mask)

    return self.generator(decoded_tgt)

  def encode(self, src, src_mask):
    return self.encoder(self.src_embed(src), src_mask)

  def decode(self, encoder_op, src_mask, tgt, tgt_mask):
    tgt_mask = make_tgt_mask(tgt, pad_idx).to(tgt.device)
    return self.decoder(self.tgt_embed(tgt), encoder_op, src_mask, tgt_mask)

In [33]:
def make_src_mask(src, pad_idx):
    "Create a mask to hide padding tokens in the source sequence."
    return (src != pad_idx).unsqueeze(1)

def make_tgt_mask(tgt, pad_idx):
    "Create a mask to hide padding tokens in the target sequence and future tokens."
    # (B, seq_len, seq_len)
    tgt_pad_mask = (tgt != pad_idx).unsqueeze(-2)
    tgt_seq_len = tgt.size(-1)
    subsequent_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len, dtype=torch.bool, device=tgt.device))
    # (B, seq_len, seq_len) & (1, seq_len, seq_len) -> (B, seq_len, seq_len)
    return tgt_pad_mask & subsequent_mask

In [34]:
model = Transfomer().to(device)

p = sum(p.nelement() for p in model.parameters())
print(f"Number of parameters: {p / 1e6} M")

Number of parameters: 118.728949 M


In [35]:
import time

# loss function
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

# lr decay
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

epochs = 50
model_save_path = '/kaggle/working/' + 'transfomer_En-de.pth'

train_losses = []
val_losses = []

try:
  for epoch in range(epochs):
    start_time = time.time()
    model.train()

    total_train_loss = 0

    for i, (src, tgt) in enumerate(train_loader):
      src = src.to(device)
      tgt = tgt.to(device)

      src_mask = make_src_mask(src, pad_idx).to(device)
      tgt_input = tgt[:, :-1]
      tgt_output = tgt[:, 1:]
      tgt_mask = make_tgt_mask(tgt_input, pad_idx).to(device)

      optimizer.zero_grad()
      output = model(src, tgt_input, src_mask, tgt_mask)
      loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))
      loss.backward()
      optimizer.step()

      total_train_loss += loss.item()

      if i % 100 == 0:
        print(f"Batch {i} / {len(train_loader)}, training loss: {loss.item()}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, training loss: {avg_train_loss}")

    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved after epoch {epoch+1} to {model_save_path}")

    model.eval()
    total_val_loss = 0

    with torch.no_grad():
      for i, (src, tgt) in enumerate(val_loader):
        src = src.to(device)
        tgt = tgt.to(device)

        src_mask = make_src_mask(src, pad_idx).to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = make_tgt_mask(tgt_input, pad_idx).to(device)

        output = model(src, tgt_input, src_mask, tgt_mask)
        loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))
        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, validation loss: {avg_val_loss}")

    end_time = time.time()
    print(f"Epoch {epoch+1}, time: {end_time - start_time}")
    scheduler.step()


except KeyboardInterrupt:
    print("Training interrupted. Saving model...")
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

print("Training finished.")

Batch 0 / 141, training loss: 12.52568531036377
Batch 100 / 141, training loss: 3.4074044227600098
Epoch 1, training loss: 4.0851633514918335
Model saved after epoch 1 to /kaggle/working/transfomer_En-de.pth
Epoch 1, validation loss: 1.8461016342043877
Epoch 1, time: 66.75939011573792
Batch 0 / 141, training loss: 2.3023734092712402
Batch 100 / 141, training loss: 1.411494493484497
Epoch 2, training loss: 1.9508716028632846
Model saved after epoch 2 to /kaggle/working/transfomer_En-de.pth
Epoch 2, validation loss: 0.8390398770570755
Epoch 2, time: 66.4373984336853
Batch 0 / 141, training loss: 1.2990890741348267
Batch 100 / 141, training loss: 1.1280661821365356
Epoch 3, training loss: 1.1933070657946538
Model saved after epoch 3 to /kaggle/working/transfomer_En-de.pth
Epoch 3, validation loss: 0.40727472491562366
Epoch 3, time: 66.69695401191711
Batch 0 / 141, training loss: 0.7663459181785583
Training interrupted. Saving model...
Model saved to /kaggle/working/transfomer_En-de.pth
Tr

In [None]:
# Load the saved model
model_load_path = model_save_path

loaded_model = Transfomer().to(device)
loaded_model.load_state_dict(torch.load(model_load_path, map_location=device))
loaded_model.eval()

print("Model loaded successfully.")

In [129]:
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"

print(tokenizer.bos_token_id, tokenizer.eos_token_id)

1 0


In [133]:
def translate_sentence(sentence, tokenizer, model, device, max_len=100):
    model.eval()
    with torch.no_grad():
        # Encode source sentence
        src_tokens = tokenizer.encode(sentence, return_tensors='pt').to(device)
        src_mask = (src_tokens != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

        # Encode the source
        memory = model.encode(src_tokens, src_mask)

        # Init target with BOS
        if tokenizer.bos_token_id is not None:
            tgt_indices = [tokenizer.bos_token_id]
        elif tokenizer.cls_token_id is not None:
            tgt_indices = [tokenizer.cls_token_id]
        else:
            # fallback to first token of tokenizer
            tgt_indices = [tokenizer.convert_tokens_to_ids(tokenizer.cls_token)]

        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)
            tgt_mask = (tgt_tensor != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

            output = model.decode(memory, src_mask, tgt_tensor, tgt_mask)
            logits = model.generator(output[:, -1])
            next_token = torch.argmax(logits, dim=-1).item()

            # Stop if EOS is generated
            if next_token == tokenizer.eos_token_id:
                break

            tgt_indices.append(next_token)

        # Decode output tokens
        return tokenizer.decode(tgt_indices, skip_special_tokens=True).strip()


In [None]:
english_sentence = "I love you Khushi"
german_translation = translate_sentence(english_sentence, tokenizer, loaded_model, device)
print("German Translation:", german_translation)