<a href="https://colab.research.google.com/github/DanielChaeS/Transformer-Based-Neural-Machine-Translation/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive/korean_transformer/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Notebook shell command: install the listed third-party packages with pip.
!pip install tokenizers datasets pandas tqdm


In [None]:
#model.py
import torch
import torch.nn as nn
import math

#positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is precomputed once and stored as a
    buffer named ``pe`` with shape ``(1, max_len, d_model)``.

    Args:
        d_model: Size of the feature dimension. Even and odd sizes both work.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the frequency vector to the number of odd columns. For even
        # d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(0)
        # A buffer moves with .to(device) and is saved in the state dict, but
        # is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as (batch, sequence, feature); the sequence
                length must not exceed ``max_len``.
        """
        return x + self.pe[:, :x.size(1)]

#multi-headed attention
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across several heads.

    Args:
        d_model: Feature size of the inputs and outputs; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def _split_heads(self, tensor, batch_size):
        # (batch, seq, d_model) -> (batch, heads, seq, head_dim)
        per_head = tensor.view(batch_size, -1, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x_q, x_kv, mask=None):
        """Attend from ``x_q`` (queries) over ``x_kv`` (keys and values).

        Args:
            x_q: Query input, indexed as (batch, query_len, d_model).
            x_kv: Key/value input, indexed as (batch, kv_len, d_model).
            mask: Optional 2-D, 3-D or 4-D mask; positions where the mask
                equals 0 are excluded from attention. It is expanded to
                (batch, heads, query_len, kv_len).

        Returns:
            Tensor indexed as (batch, query_len, d_model).
        """
        batch_size, query_len, _ = x_q.size()
        kv_len = x_kv.size(1)

        queries = self._split_heads(self.q_proj(x_q), batch_size)
        keys = self._split_heads(self.k_proj(x_kv), batch_size)
        values = self._split_heads(self.v_proj(x_kv), batch_size)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # Lift lower-rank masks to 4-D before expanding over the heads.
            rank = mask.dim()
            if rank == 2:
                mask = mask[None, None]
            elif rank == 3:
                mask = mask[:, None]
            mask = mask.expand(batch_size, self.num_heads, query_len, kv_len)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        probs = torch.softmax(scores, dim=-1)
        per_head_out = torch.matmul(probs, values)
        # (batch, heads, query_len, head_dim) -> (batch, query_len, d_model)
        merged = per_head_out.transpose(1, 2).contiguous().view(batch_size, query_len, -1)
        return self.out_proj(merged)


#feed forward layer
class FeedForward(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last dimension.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        widen = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        narrow = nn.Linear(d_ff, d_model)
        # Kept as a Sequential named ``net`` so parameter names stay
        # net.0.* and net.2.*.
        self.net = nn.Sequential(widen, activation, narrow)

    def forward(self, x):
        """Return the result of running ``x`` through the stored layers."""
        transformed = self.net(x)
        return transformed

#encoder/decoder block
class TransformerBlock(nn.Module):
    """One post-norm Transformer layer usable as encoder or decoder block.

    Every block owns a self-attention module, a cross-attention module, a
    feed-forward module and three LayerNorms. Cross-attention is only run
    when ``forward`` receives a ``context``.

    Args:
        d_model: Feature size.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward module.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        # Used only when a context is supplied (decoder usage).
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.is_decoder = False

    def forward(self, x, context=None, mask=None, cross_mask=None):
        """Run self-attention, optional cross-attention, then feed-forward.

        Args:
            x: Input tensor.
            context: Optional tensor to cross-attend over; ``None`` skips
                the cross-attention sub-layer.
            mask: Optional mask for self-attention.
            cross_mask: Optional mask for cross-attention.

        Returns:
            The transformed tensor after the final residual + LayerNorm.
        """
        # Sub-layer 1: self-attention with residual connection.
        x = self.ln1(x + self.dropout(self.attn(x, x, mask)))

        if context is not None:
            # Sub-layer 2 (decoder usage): attend over the context; the
            # feed-forward residual is then normalised by ln3.
            attended = self.cross_attn(x, context, cross_mask)
            x = self.ln2(x + self.dropout(attended))
            closing_norm = self.ln3
        else:
            # Without a context, ln2 normalises the feed-forward residual.
            closing_norm = self.ln2

        return closing_norm(x + self.dropout(self.ff(x)))

#encoder and decoder
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of blocks.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature size.
        num_layers: Number of TransformerBlock layers.
        num_heads: Attention heads per layer.
        d_ff: Feed-forward hidden size per layer.
        dropout: Dropout probability passed to each layer.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        stack = []
        for _ in range(num_layers):
            stack.append(TransformerBlock(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, mask=None):
        """Encode token ids ``x``; ``mask`` is forwarded to every layer."""
        hidden = self.embed(x)
        hidden = self.pe(hidden)
        for block in self.layers:
            hidden = block(hidden, mask=mask)
        return hidden

class Decoder(nn.Module):
    """Embedding, positional encoding, block stack and output projection.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        d_model: Feature size.
        num_layers: Number of TransformerBlock layers.
        num_heads: Attention heads per layer.
        d_ff: Feed-forward hidden size per layer.
        dropout: Dropout probability passed to each layer.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        stack = []
        for _ in range(num_layers):
            stack.append(TransformerBlock(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        """Return vocabulary logits for target ids ``x``.

        Args:
            x: Target token ids.
            enc_out: Encoder output used as cross-attention context.
            src_mask: Optional mask applied in cross-attention.
            tgt_mask: Optional mask applied in self-attention.
        """
        hidden = self.embed(x)
        hidden = self.pe(hidden)
        for block in self.layers:
            hidden = block(
                hidden,
                context=enc_out,
                mask=tgt_mask,
                cross_mask=src_mask,
            )
        logits = self.out(hidden)
        return logits

#transformer model
class Transformer(nn.Module):
    """Encoder-decoder Transformer that builds its own attention masks.

    Args:
        src_vocab_size: Source vocabulary size (encoder embedding rows).
        tgt_vocab_size: Target vocabulary size (decoder embedding rows and
            output logits).
        d_model: Feature size.
        num_layers: Number of layers in both encoder and decoder.
        num_heads: Attention heads per layer.
        d_ff: Feed-forward hidden size.
        dropout: Dropout probability.
        src_pad_idx: Token id treated as padding in source sequences.
            Defaults to 0, the value previously hard-coded.
        tgt_pad_idx: Token id treated as padding in target sequences.
            Defaults to 0, the value previously hard-coded.

    Callers whose tokenizers do not map ``<pad>`` to 0 should pass the real
    pad ids; otherwise padded positions are attended to as normal tokens.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_layers=6, num_heads=8, d_ff=2048,
                 dropout=0.1, src_pad_idx=0, tgt_pad_idx=0):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx
        self.encoder = Encoder(src_vocab_size, d_model, num_layers, num_heads, d_ff, dropout)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_layers, num_heads, d_ff, dropout)

    def make_subsequent_mask(self, size, device=None):
        """Create a causal mask for self-attention in the decoder.

        Args:
            size: Sequence length.
            device: Optional device on which to allocate the mask.

        Returns:
            Bool tensor of shape (size, size); entry [i, j] is True when
            position i may attend to position j (j <= i).
        """
        return torch.ones(size, size, device=device).tril().bool()

    def make_src_mask(self, src):
        """Create a padding mask for the source sequence.

        Returns:
            Bool tensor of shape (batch, 1, 1, src_len), True at non-padding
            positions.
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_tgt_mask(self, tgt):
        """Create the combined padding and causal mask for the target sequence.

        Returns:
            Bool tensor of shape (batch, 1, tgt_len, tgt_len), True where a
            query position may attend to a non-padding key at or before it.
        """
        seq_len = tgt.size(1)
        pad_mask = (tgt != self.tgt_pad_idx).unsqueeze(1).unsqueeze(2)
        # Built directly on tgt's device to avoid a host-to-device copy.
        causal_mask = self.make_subsequent_mask(seq_len, device=tgt.device)
        return pad_mask & causal_mask

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Return decoder logits for ``tgt`` given ``src``.

        Masks that are not supplied are derived from the token ids using
        the configured padding ids.
        """
        if src_mask is None:
            src_mask = self.make_src_mask(src)
        if tgt_mask is None:
            tgt_mask = self.make_tgt_mask(tgt)

        enc_out = self.encoder(src, mask=src_mask)
        return self.decoder(tgt, enc_out, src_mask, tgt_mask)

#inference (greedy decoding)
def greedy_decode(model, src, sos_idx, eos_idx, max_len=50):
    """Greedily decode one source sequence into target token ids.

    The source padding mask is applied to the encoder and to decoder
    cross-attention, matching what ``Transformer.forward`` does in training.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``make_src_mask``
            and ``make_tgt_mask``.
        src: Source token ids, either 1-D or with a leading batch dimension
            of size 1 (the stop check uses ``.item()``, so only a single
            sequence is supported).
        sos_idx: Id of the start token that seeds the output.
        eos_idx: Id of the end token; decoding stops once it is produced.
        max_len: Maximum number of tokens to generate after the start token.

    Returns:
        1-D tensor of generated ids, including the leading start token.
    """
    model.eval()
    with torch.no_grad():
        if src.dim() == 1:
            src = src.unsqueeze(0)

        # Without this mask padded source positions would be attended to as
        # if they were real tokens.
        src_mask = model.make_src_mask(src)
        enc_out = model.encoder(src, mask=src_mask)

        tgt = torch.tensor([[sos_idx]], device=src.device)
        for _ in range(max_len):
            tgt_mask = model.make_tgt_mask(tgt)
            out = model.decoder(tgt, enc_out, src_mask=src_mask, tgt_mask=tgt_mask)
            # Only the logits at the last position decide the next token.
            next_token = out[:, -1, :].argmax(-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)
            if next_token.item() == eos_idx:
                break
        return tgt.squeeze(0)

In [None]:
#data.py
import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
import pandas as pd

class CSVTranslationDataset(Dataset):
    """Korean-English sentence pairs read from a CSV and tokenised on access.

    The CSV must have ``korean`` and ``english`` columns. Each item is a dict
    with ``"src"`` and ``"tgt"`` long tensors of length ``max_len``, wrapped
    in ``<sos>``/``<eos>`` and right-padded with ``<pad>``.

    Args:
        csv_path: Path to the CSV file. ``None`` falls back to
            ``DEFAULT_CSV_PATH`` (the location this notebook used before the
            argument was honoured).
        src_tokenizer_path: Path to the source tokenizer JSON file.
        tgt_tokenizer_path: Path to the target tokenizer JSON file.
        max_len: Fixed length of every returned sequence; at least 2 so the
            start and end tokens fit.
        limit: Maximum number of rows kept from the CSV.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, or (on access) if a
            tokenizer lacks one of the required special tokens.
    """

    DEFAULT_CSV_PATH = "/content/drive/MyDrive/korean_transformer/opensubs_ko_en.csv"

    def __init__(self, csv_path, src_tokenizer_path, tgt_tokenizer_path, max_len=64, limit=50000):
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2, got {max_len}")

        # Honour the caller's path; previously it was silently ignored.
        path = csv_path if csv_path is not None else self.DEFAULT_CSV_PATH
        df = pd.read_csv(path).dropna()
        self.data = list(zip(df["korean"][:limit], df["english"][:limit]))

        # Load tokenizers
        self.src_tokenizer = Tokenizer.from_file(src_tokenizer_path)
        self.tgt_tokenizer = Tokenizer.from_file(tgt_tokenizer_path)

        self.max_len = max_len

    @staticmethod
    def _special_id(tokenizer, token):
        """Return the id of ``token``, failing loudly if it is missing."""
        token_id = tokenizer.token_to_id(token)
        if token_id is None:
            # A None id would otherwise only surface later as an opaque
            # tensor-construction error.
            raise ValueError(f"Tokenizer has no {token!r} token in its vocabulary")
        return token_id

    def encode(self, text, tokenizer):
        """Tokenise ``text`` and wrap it in ``<sos>`` ... ``<eos>``.

        The body is truncated to ``max_len - 2`` ids so the result never
        exceeds ``max_len``.
        """
        ids = tokenizer.encode(text).ids[:self.max_len - 2]
        sos_id = self._special_id(tokenizer, "<sos>")
        eos_id = self._special_id(tokenizer, "<eos>")
        return [sos_id] + ids + [eos_id]

    def pad(self, ids, pad_id):
        """Right-pad ``ids`` with ``pad_id`` up to ``max_len`` entries."""
        return ids + [pad_id] * (self.max_len - len(ids))

    def __getitem__(self, idx):
        src, tgt = self.data[idx]

        src_ids = self.encode(src, self.src_tokenizer)
        tgt_ids = self.encode(tgt, self.tgt_tokenizer)

        src_pad = self._special_id(self.src_tokenizer, "<pad>")
        tgt_pad = self._special_id(self.tgt_tokenizer, "<pad>")

        return {
            "src": torch.tensor(self.pad(src_ids, src_pad), dtype=torch.long),
            "tgt": torch.tensor(self.pad(tgt_ids, tgt_pad), dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)


def get_dataloader(csv_path, src_tokenizer_path, tgt_tokenizer_path,
                   batch_size=32, max_len=64, limit=50000, split="train"):
    """Build a DataLoader over the train or validation slice of the CSV.

    The first ``limit`` usable rows are split 90/10: ``split="train"`` gets
    the first 90%, ``split="val"` the remaining 10%. Any other ``split``
    value keeps every row of the file (unchanged legacy behaviour).

    Args:
        csv_path: Path to the CSV with ``korean`` and ``english`` columns.
            ``None`` falls back to the notebook's default Drive location.
        src_tokenizer_path: Path to the source tokenizer JSON file.
        tgt_tokenizer_path: Path to the target tokenizer JSON file.
        batch_size: Batch size of the returned loader.
        max_len: Fixed sequence length produced by the dataset.
        limit: Maximum number of rows to use across train and val. Clamped
            to the number of rows actually available.
        split: ``"train"`` or ``"val"``.

    Returns:
        A DataLoader that shuffles only for the training split.
    """
    default_csv = "/content/drive/MyDrive/korean_transformer/opensubs_ko_en.csv"
    csv_file = csv_path if csv_path is not None else default_csv
    df = pd.read_csv(csv_file).dropna().reset_index(drop=True)

    # If the file has fewer rows than `limit`, splitting at 0.9 * limit would
    # give train every row and leave val empty; split what exists instead.
    limit = min(limit, len(df))
    cut = int(0.9 * limit)

    if split == "train":
        df = df[:cut]  # first 90%
    elif split == "val":
        df = df[cut:limit]  # last 10%

    # NOTE: the dataset constructor reads the CSV again; its rows are
    # replaced below with the slice selected here.
    dataset = CSVTranslationDataset(
        csv_path=csv_path,
        src_tokenizer_path=src_tokenizer_path,
        tgt_tokenizer_path=tgt_tokenizer_path,
        max_len=max_len,
        limit=len(df)
    )
    dataset.data = list(zip(df["korean"], df["english"]))  # manually override

    return DataLoader(dataset, batch_size=batch_size, shuffle=(split == "train"))

In [None]:
#utils.py
def greedy_decode(model, src, sos_idx, eos_idx, max_len=50):
    """Greedily decode one source sequence; the start token is stripped.

    Runs on whatever device the model's parameters live on instead of
    requiring CUDA, disables gradient tracking, and applies the same source
    padding mask and causal target mask that the model uses in training.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``make_src_mask``
            and ``make_tgt_mask``.
        src: Source token ids, 1-D or with a leading batch dimension of
            size 1 (the stop check uses ``.item()``).
        sos_idx: Id of the start token that seeds the output.
        eos_idx: Id of the end token; decoding stops once it is produced.
        max_len: Maximum number of tokens to generate.

    Returns:
        1-D tensor of generated ids without the leading start token (the
        end token is included when it was produced).
    """
    model.eval()

    # Follow the model's device; a model without parameters leaves src alone.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        src = src.to(first_param.device)

    with torch.no_grad():
        if src.dim() == 1:
            src = src.unsqueeze(0)

        src_mask = model.make_src_mask(src)
        enc_out = model.encoder(src, mask=src_mask)

        tgt = torch.tensor([[sos_idx]], device=src.device)
        for _ in range(max_len):
            # Without the causal mask earlier positions would see later
            # ones, unlike in training.
            tgt_mask = model.make_tgt_mask(tgt)
            out = model.decoder(tgt, enc_out, src_mask=src_mask, tgt_mask=tgt_mask)
            next_token = out[:, -1, :].argmax(-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)
            if next_token.item() == eos_idx:
                break
    return tgt.squeeze(0)[1:]

In [None]:
#train.py
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm #progrss bars
from tokenizers import Tokenizer

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Embedding-table sizes for the source and target vocabularies.
SRC_VOCAB_SIZE = 32000
TGT_VOCAB_SIZE = 32000


ko_tokenizer = Tokenizer.from_file("/content/drive/MyDrive/korean_transformer/korean-tokenizer.json")
PAD_IDX = ko_tokenizer.token_to_id("<pad>")

# The loss is computed on English target ids, so the padding id to ignore
# must come from the English tokenizer rather than the Korean one.
en_tokenizer = Tokenizer.from_file("/content/drive/MyDrive/korean_transformer/english-tokenizer.json")
TGT_PAD_IDX = en_tokenizer.token_to_id("<pad>")

model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=256,
    num_layers=2,
    num_heads=4,
    d_ff=1024,
    dropout=0.1
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX)

# Both loaders draw from the same first 50,000 rows: the first 90% for
# training and the last 10% for validation.
train_loader = get_dataloader(
    csv_path="/content/drive/MyDrive/korean_transformer/opensubs_ko_en.csv",
    src_tokenizer_path="/content/drive/MyDrive/korean_transformer/korean-tokenizer.json",
    tgt_tokenizer_path="/content/drive/MyDrive/korean_transformer/english-tokenizer.json",
    batch_size=32,
    max_len=64,
    limit=50000,
    split="train"
)

val_loader = get_dataloader(
    csv_path="/content/drive/MyDrive/korean_transformer/opensubs_ko_en.csv",
    src_tokenizer_path="/content/drive/MyDrive/korean_transformer/korean-tokenizer.json",
    tgt_tokenizer_path="/content/drive/MyDrive/korean_transformer/english-tokenizer.json",
    batch_size=32,
    max_len=64,
    limit=50000,
    split="val"
)

def validate_model(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and gradients are disabled. Each
    batch is a dict with ``'src'`` and ``'tgt'`` tensors; the target is
    shifted by one position to form decoder input and expected output.
    """
    model.eval()
    running_loss = 0
    batch_count = 0

    with torch.no_grad():
        for batch_count, batch in enumerate(val_loader, start=1):
            source = batch['src'].to(device)
            target = batch['tgt'].to(device)

            # Teacher forcing: feed all but the last token, predict all but
            # the first.
            decoder_in, expected = target[:, :-1], target[:, 1:]

            scores = model(source, decoder_in)
            vocab = scores.size(-1)
            batch_loss = criterion(scores.reshape(-1, vocab), expected.reshape(-1))

            running_loss += batch_loss.item()

    return running_loss / batch_count

# Training loop with validation.
# Runs up to 10 epochs and stops early once the validation loss has failed to
# improve for `patience` consecutive epochs. The best-scoring weights are
# written to best_model.pth; the weights at the end of training are written to
# final_model.pth.
best_val_loss = float('inf')
patience = 3
patience_counter = 0

for epoch in range(10):
    # Training phase
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)

        # Teacher forcing: the decoder sees tgt without its last token and is
        # scored against tgt without its first token.
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        logits = model(src, tgt_input)
        # Flatten all positions so the criterion sees one row of logits per token.
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Validation phase
    val_loss = validate_model(model, val_loader, criterion, device)

    # Calculate average losses
    avg_train_loss = total_train_loss / len(train_loader)

    print(f"Epoch {epoch + 1}:")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save the best model
        torch.save(model.state_dict(), "/content/drive/MyDrive/korean_transformer/best_model.pth")
        print("  ✅ New best model saved!")
    else:
        patience_counter += 1
        print(f"  No improvement. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("  Early stopping triggered!")
            break

    print("-" * 50)

# Save the final model (the weights from the last epoch that ran, which may
# differ from the best checkpoint saved above).
torch.save(model.state_dict(), "/content/drive/MyDrive/korean_transformer/final_model.pth")
print("✅ Final model saved to Drive!")


It's BLEU evaluation time!

In [None]:
# Install NLTK and download its 'punkt' resource; nltk.word_tokenize is called
# in run_bleu_eval.
!pip install nltk
import nltk
nltk.download('punkt')


In [None]:
# Download the 'punkt_tab' NLTK resource as well.
import nltk
nltk.download('punkt_tab')

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def run_bleu_eval(model, val_loader, tokenizer_src, tokenizer_tgt, sos_idx, eos_idx, max_samples=100):
    """Compute corpus-level BLEU of greedy translations on validation data.

    Args:
        model: Translation model passed to ``greedy_decode``.
        val_loader: Iterable of batches, each a dict with ``'src'`` and
            ``'tgt'`` id tensors.
        tokenizer_src: Source tokenizer (accepted for interface symmetry;
            not used here).
        tokenizer_tgt: Target tokenizer used to turn ids back into text.
        sos_idx: Start-token id handed to ``greedy_decode``.
        eos_idx: End-token id handed to ``greedy_decode``.
        max_samples: Maximum number of sentence pairs to evaluate.

    Returns:
        BLEU score scaled to the 0-100 range. The score is also printed.
    """
    model.eval()
    preds = []
    refs = []

    with torch.no_grad():
        for batch in val_loader:
            # The cap counts sentences, not batches, so the number evaluated
            # no longer depends on the loader's batch size.
            if len(preds) >= max_samples:
                break
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)

            for src_row, tgt_row in zip(src, tgt):
                if len(preds) >= max_samples:
                    break

                # Decode prediction (one sentence at a time)
                output_ids = greedy_decode(model, src_row.unsqueeze(0), sos_idx, eos_idx)
                pred = tokenizer_tgt.decode(output_ids.tolist(), skip_special_tokens=True)

                # Reference
                ref = tokenizer_tgt.decode(tgt_row.tolist(), skip_special_tokens=True)

                # corpus_bleu expects token lists, and a list of references
                # per hypothesis.
                preds.append(nltk.word_tokenize(pred))
                refs.append([nltk.word_tokenize(ref)])

    bleu = corpus_bleu(refs, preds) * 100
    print(f"🔍 BLEU Score: {bleu:.2f}")
    return bleu


In [None]:
# Rebuild the model with the same hyperparameters used for training so the
# saved state dict matches its parameter shapes.
model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=256,
    num_layers=2,
    num_heads=4,
    d_ff=1024,
    dropout=0.1
).to(device)

# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only
# runtime (and vice versa).
model.load_state_dict(
    torch.load("/content/drive/MyDrive/korean_transformer/best_model.pth", map_location=device)
)
model.eval()

# Load tokenizers
from tokenizers import Tokenizer
ko_tokenizer = Tokenizer.from_file("/content/drive/MyDrive/korean_transformer/korean-tokenizer.json")
en_tokenizer = Tokenizer.from_file("/content/drive/MyDrive/korean_transformer/english-tokenizer.json")

# Decoding starts and stops on the target (English) special tokens.
sos_idx = en_tokenizer.token_to_id("<sos>")
eos_idx = en_tokenizer.token_to_id("<eos>")

# Load val loader
val_loader = get_dataloader(
    csv_path='/content/drive/MyDrive/korean_transformer/opensubs_ko_en.csv',
    src_tokenizer_path="/content/drive/MyDrive/korean_transformer/korean-tokenizer.json",
    tgt_tokenizer_path="/content/drive/MyDrive/korean_transformer/english-tokenizer.json",
    split="val",
    limit=50000,
    batch_size=8
)

# Run BLEU
run_bleu_eval(model, val_loader, ko_tokenizer, en_tokenizer, sos_idx, eos_idx, max_samples=100)