# En - De Transformer using Pytorch


In [None]:
!pip install datasets
!pip install sentencepiece

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

from datasets import load_dataset
import sentencepiece as spm
import os
import math

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<h4>Preparing Dataset</h4>

In [6]:
import pandas as pd
import random

In [7]:
data_path = '/content/drive/My Drive/data/de_en.csv'

df = pd.read_csv(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,ENGLISH,GERMAN
0,0,hi,hallo
1,1,hi,gru gott
2,2,run,lauf
3,3,wow,potzdonner
4,4,wow,donnerwetter


In [55]:
en_sentences = df['ENGLISH'].astype(str).tolist()
de_sentences = df['GERMAN'].astype(str).tolist()

en_sentences = en_sentences[:20000]
de_sentences = de_sentences[:20000]


print(f"Total pairs: {len(en_sentences)}")

Total pairs: 20000


In [9]:
de_sentences[:10]

['hallo',
 'gru gott',
 'lauf',
 'potzdonner',
 'donnerwetter',
 'feuer',
 'hilfe',
 'zu hulf',
 'stopp',
 'warte']

In [56]:
group = list(zip(en_sentences, de_sentences))
random.shuffle(group)

# 90% train, 10% validation
train_group = group[:int(len(group) * 0.9)]
val_group = group[int(len(group) * 0.9):]

print(f"Train size: {len(train_group)}")
print(f"Validation size: {len(val_group)}")

Train size: 18000
Validation size: 2000


In [57]:
output_dir = '/content/drive/My Drive/data/'

with open(output_dir + 'train.en', 'w', encoding='utf-8') as f_en, \
     open(output_dir + 'train.de', 'w', encoding='utf-8') as f_de:
    for en, de in train_group:
        f_en.write(en.strip() + "\n")
        f_de.write(de.strip() + "\n")

with open(output_dir + 'valid.en', 'w', encoding='utf-8') as f_en, \
     open(output_dir + 'valid.de', 'w', encoding='utf-8') as f_de:
    for en, de in val_group:
        f_en.write(en.strip() + "\n")
        f_de.write(de.strip() + "\n")

print("Saved train/valid splits in Drive!")


Saved train/valid splits in Drive!


<h4>Tokenizer</h4>

In [58]:
import sentencepiece as spm

In [59]:
data_dir = "/content/drive/My Drive/data/"
spm_dir = "/content/drive/My Drive/data/spm/"

os.makedirs(spm_dir, exist_ok=True)

train_en = os.path.join(data_dir, "train.en")
train_de = os.path.join(data_dir, "train.de")

In [60]:
spm.SentencePieceTrainer.Train(
    input=train_en,
    model_prefix=f"{spm_dir}en",
    vocab_size=8000,
    character_coverage=1.0,
    model_type="bpe"
)

In [61]:
spm.SentencePieceTrainer.Train(
    input=train_de,
    model_prefix=f"{spm_dir}de",
    vocab_size=8000,
    character_coverage=1.0,
    model_type="bpe"
)

In [62]:
sp_en = spm.SentencePieceProcessor()
sp_en.load(f"{spm_dir}en.model")

sp_de = spm.SentencePieceProcessor()
sp_de.load(f"{spm_dir}de.model")

True

In [63]:
sample_en = "i love this song"
sample_de = "ich liebe dieses lied"

encode_en = sp_en.encode(sample_en, out_type=int)
encode_de = sp_de.encode(sample_de, out_type=int)

print("en encoding:", encode_en)
print("de encoding:", encode_de)

print("en decoding:", sp_en.decode(encode_en))
print("de decoding:", sp_de.decode(encode_de))

en encoding: [3, 141, 70, 1432]
de encoding: [15, 303, 464, 1764]
en decoding: i love this song
de decoding: ich liebe dieses lied


<h4>Creating Dataset</h4>

In [64]:
class DeEnPairDataset(Dataset):
    def __init__(self, en_file, de_file, sp_en, sp_de):
        "Read file from drive"
        with open(en_file, 'r') as f:
            self.en_sentences = [line.strip() for line in f.readlines()]
        with open(de_file, 'r') as f:
            self.de_sentences = [line.strip() for line in f.readlines()]
        self.sp_en = sp_en
        self.sp_de = sp_de

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        "return (De tensor, En tensor)"
        en_encoded = self.sp_en.encode(self.en_sentences[idx], out_type=int)
        de_encoded = self.sp_de.encode(self.de_sentences[idx], out_type=int)
        return torch.tensor(en_encoded, dtype=torch.long), torch.tensor(de_encoded, dtype=torch.long)

In [65]:
train_ds = DeEnPairDataset(os.path.join(data_dir, "train.en"), os.path.join(data_dir, "train.de"), sp_en, sp_de)
val_ds = DeEnPairDataset(os.path.join(data_dir, "valid.en"), os.path.join(data_dir, "valid.de"), sp_en, sp_de)

print(len(train_ds), len(val_ds))

print(train_ds[0])

sp_en.decode(train_ds[0][0].tolist()), sp_de.decode(train_ds[0][1].tolist())

18000 2000
(tensor([493, 138,  44]), tensor([721, 220,  84]))


('cook for me', 'koch fur mich')

<h4>Fixed size tensors (input, target) in batch</h4>

In [66]:
import torch

max_seq_len = 100
pad_idx = 0

def collate_fn(batch):
    "Return fixed size tensor"
    srcs, tgts = zip(*batch)
    batch_size = len(batch)

    src_batch = []
    tgt_batch = []

    for src in srcs:
        if len(src) < max_seq_len:
            "Padding"
            padded = torch.cat([src, torch.tensor([pad_idx] * (max_seq_len - len(src)), dtype=torch.long)])
        else:
            "Truncate"
            padded = src[:max_seq_len]
        src_batch.append(padded)

    for tgt in tgts:
        if len(tgt) < max_seq_len:
            padded = torch.cat([tgt, torch.tensor([pad_idx] * (max_seq_len - len(tgt)), dtype=torch.long)])
        else:
            padded = tgt[:max_seq_len]
        tgt_batch.append(padded)

    src_batch = torch.stack(src_batch)
    tgt_batch = torch.stack(tgt_batch)

    return src_batch, tgt_batch

In [67]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)

# (batch_size, seq)-> (32, 625)
len(train_loader)

563

<h4>Model</h4>

In [68]:
# Hyper parameters

lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_embd = 512
n_head = 4
n_layer = 4
dropout = 0.1
vocab_size = 8000
max_seq_len=100

import copy

In [69]:
def clones(module, N):
  "Create N identical layers."
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [70]:
class LayerNorm(nn.Module):
  "Normalize features"
  def __init__(self, features, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [71]:
class SublayerConnection(nn.Module):
  "Residual connection followed by layer norm"
  def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.layer_norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.layer_norm(x)))

In [72]:
def attention(q, k, v, mask=None, dropout=None):
  "Compute Scaled Dot Product Attention (Attention score)"
  dim_k = k.size(-1)
  score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dim_k)
  if mask is not None:
    score = score.masked_fill(mask == 0, -1e9)
  att_w = score.softmax(dim=-1)
  if dropout is not None:
    att_w = dropout(att_w)

  return torch.matmul(att_w, v), att_w

In [73]:
class MultiHeadedAttention(nn.Module):
  def __init__(self, n_head, n_embd, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert n_embd % n_head == 0, "can't divide n_embd by n_head"
    self.n_head = n_head
    self.n_embd = n_embd
    self.dim_k = n_embd // n_head
    self.Ws = clones(nn.Linear(n_embd, n_embd), 4)
    self.attn = None
    self.dropout = nn.Dropout(dropout)

  def forward(self, q, k, v, mask=None):
    if mask is not None:
      mask = mask.unsqueeze(1)
    n_batches = q.size(0)

    Q = self.Ws[0](q).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2)
    K = self.Ws[1](k).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2)
    V = self.Ws[2](v).view(n_batches, -1, self.n_head, self.dim_k).transpose(1, 2)

    x, self.attn = attention(Q, K, V, mask=mask, dropout=self.dropout)

    "Concatenating all heads"
    x = x.transpose(1, 2).contiguous().view(n_batches, -1, self.n_head * self.dim_k)

    return self.Ws[-1](x)

In [74]:
class FeedForwardLayer(nn.Module):
  def __init__(self, n_embd, dropout):
    super(FeedForwardLayer, self).__init__()
    self.ff = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.ff(x)

In [75]:
class Embeddings(nn.Module):
  def __init__(self, vocab_size, n_embd):
    super(Embeddings, self).__init__()
    self.embedding = nn.Embedding(vocab_size, n_embd)
    self.scale = n_embd ** 0.5

  def forward(self, x):
    return self.embedding(x) * self.scale

In [76]:
class PositionalEncoding(nn.Module):
  def __init__(self, n_embd, max_len=100):
    super(PositionalEncoding, self).__init__()

    pe = torch.zeros(max_len, n_embd)
    pos = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, n_embd, 2) * -(math.log(10000.0) / n_embd)) # e ^ [-ln(10000) * (2i/ n_embd)]
    pe[:, 0::2] = torch.sin(pos * div_term)
    pe[:, 1::2] = torch.cos(pos * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self, x):
    return x + self.pe[:, :x.size(1)]

In [77]:
class EncoderLayer(nn.Module):
  "Consist of multi-head attention and feed forward"
  def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.add_and_norm = clones(SublayerConnection(size, dropout), 2)
    self.size = size

  def forward(self, x, mask):
    x = self.add_and_norm[0](x, lambda x: self.self_attn(x, x, x, mask))
    x = self.add_and_norm[1](x, self.feed_forward)
    return x

In [78]:
class DecoderLayer(nn.Module):
  "Consist of multi-head attn, src_attn, feed forward"
  def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.add_and_norm = clones(SublayerConnection(size, dropout), 3)

  def forward(self, x, encoder_op, src_mask, tgt_mask):
    x = self.add_and_norm[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.add_and_norm[1](x, lambda x: self.src_attn(x, encoder_op, encoder_op, src_mask))
    x = self.add_and_norm[2](x, self.feed_forward)
    return x

In [79]:
class Encoder(nn.Module):
  def __init__(self, layer, N):
    super(Encoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [80]:
class Decoder(nn.Module):
  def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, encoder_op, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, encoder_op, src_mask, tgt_mask)
    return self.norm(x)

In [None]:
class Transformer(nn.Module):
  def __init__(self, src_vocab_size, tgt_vocab_size, n_embd, n_head, n_layer, dropout, max_seq_len):
    super(Transformer, self).__init__()
    c = copy.deepcopy
    attn = MultiHeadedAttention(n_head, n_embd, dropout)
    ff = FeedForwardLayer(n_embd, dropout)
    position = PositionalEncoding(n_embd, max_seq_len)

    self.encoder = Encoder(EncoderLayer(n_embd, c(attn), c(ff), dropout), n_layer)
    self.decoder = Decoder(DecoderLayer(n_embd, c(attn), c(attn), c(ff), dropout), n_layer)
    self.src_embed = nn.Sequential(Embeddings(src_vocab_size, n_embd), c(position))
    self.tgt_embed = nn.Sequential(Embeddings(tgt_vocab_size, n_embd), c(position))
    self.generator = nn.Linear(n_embd, tgt_vocab_size)

  def forward(self, src, tgt, src_mask, tgt_mask):
    "Process src and tgt sequences."
    encoded_src = self.encode(src, src_mask)
    decoded_tgt = self.decode(encoded_src, src_mask, tgt, tgt_mask)

    return self.generator(decoded_tgt)

  def encode(self, src, src_mask):
    return self.encoder(self.src_embed(src), src_mask)

  def decode(self, encoder_op, src_mask, tgt, tgt_mask):
    tgt_mask = make_tgt_mask(tgt, pad_idx).to(tgt.device)
    return self.decoder(self.tgt_embed(tgt), encoder_op, src_mask, tgt_mask)

In [82]:
def make_src_mask(src, pad_idx):
    "Create a mask to hide padding tokens in the source sequence."
    return (src != pad_idx).unsqueeze(1)

def make_tgt_mask(tgt, pad_idx):
    "Create a mask to hide padding tokens in the target sequence and future tokens."
    # (B, seq_len, seq_len)
    tgt_pad_mask = (tgt != pad_idx).unsqueeze(-2)
    tgt_seq_len = tgt.size(-1)
    subsequent_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len, dtype=torch.bool, device=tgt.device))
    # (B, seq_len, seq_len) & (1, seq_len, seq_len) -> (B, seq_len, seq_len)
    return tgt_pad_mask & subsequent_mask

In [None]:
model = Transformer(vocab_size, vocab_size, n_embd, n_head, n_layer, dropout, max_seq_len).to(device)

p = sum(p.nelement() for p in model.parameters())
print(f"Number of parameters: {p / 1e6} M")

Number of parameters: 41.723712 M


<h4>Model Training</h4>

In [None]:
import time

# loss function
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

# lr decay
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

epochs = 10
model_save_path = '/content/drive/My Drive/transformer_En-De.pth'

train_losses = []
val_losses = []

try:
  for epoch in range(epochs):
    start_time = time.time()
    model.train()

    total_train_loss = 0

    for i, (src, tgt) in enumerate(train_loader):
      src = src.to(device)
      tgt = tgt.to(device)

      src_mask = make_src_mask(src, pad_idx).to(device)
      tgt_input = tgt[:, :-1]
      tgt_output = tgt[:, 1:]
      tgt_mask = make_tgt_mask(tgt_input, pad_idx).to(device)

      optimizer.zero_grad()
      output = model(src, tgt_input, src_mask, tgt_mask)
      loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))
      loss.backward()
      optimizer.step()

      total_train_loss += loss.item()

      if i % 100 == 0:
        print(f"Batch {i} / {len(train_loader)}, training loss: {loss.item()}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, training loss: {avg_train_loss}")

    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved after epoch {epoch+1} to {model_save_path}")

    model.eval()
    total_val_loss = 0

    with torch.no_grad():
      for i, (src, tgt) in enumerate(val_loader):
        src = src.to(device)
        tgt = tgt.to(device)

        src_mask = make_src_mask(src, pad_idx).to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = make_tgt_mask(tgt_input, pad_idx).to(device)

        output = model(src, tgt_input, src_mask, tgt_mask)
        loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))
        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, validation loss: {avg_val_loss}")

    end_time = time.time()
    print(f"Epoch {epoch+1}, time: {end_time - start_time}")
    scheduler.step()


except KeyboardInterrupt:
    print("Training interrupted. Saving model...")
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

print("Training finished.")

#### Load and Test Model

In [None]:
# Load the saved model
model_load_path = '/content/drive/My Drive/transformer_En-De.pth'

loaded_model = Transformer(vocab_size, vocab_size, n_embd, n_head, n_layer, dropout, max_seq_len).to(device)
loaded_model.load_state_dict(torch.load(model_load_path, map_location=device))
loaded_model.eval()

print("Model loaded successfully.")

Model loaded successfully.


In [133]:
def translate_sentence(model, sentence, sp_en, sp_de, max_seq_len, device, pad_idx, start_symbol=1, end_symbol=2):
    model.eval()
    with torch.no_grad():
        encoded_sentence = sp_en.encode(sentence, out_type=int)
        src_tensor = torch.tensor(encoded_sentence, dtype=torch.long).unsqueeze(0).to(device)
        src_mask = make_src_mask(src_tensor, pad_idx).to(device)

        encoder_outputs = model.encode(src_tensor, src_mask)

        tgt_tensor = torch.tensor([[start_symbol]], dtype=torch.long).to(device)

        for _ in range(max_seq_len - 1):
            tgt_mask = make_tgt_mask(tgt_tensor, pad_idx).to(device)

            output = model.decode(encoder_outputs, src_mask, tgt_tensor, tgt_mask)
            logits = model.generator(output[:, -1, :])

            next_token = logits.argmax(dim=-1)

            tgt_tensor = torch.cat([tgt_tensor, next_token.unsqueeze(0)], dim=1)

            if next_token.item() == end_symbol:
                break

        output_tokens = tgt_tensor.squeeze(0).tolist()
        if output_tokens[0] == start_symbol:
            output_tokens = output_tokens[1:]
        if end_symbol in output_tokens:
            output_tokens = output_tokens[:output_tokens.index(end_symbol)]

        translated_sentence = sp_de.decode(output_tokens)

    return translated_sentence


In [None]:
# Test the translation
test_sentence = "my wife is very beautiful and i love her"
translated_output = translate_sentence(loaded_model, test_sentence, sp_en, sp_de, max_seq_len, device, pad_idx)
print(f"English: {test_sentence}")
print(f"German: {translated_output}")