<a href="https://colab.research.google.com/github/CrazySoda/Machine-Learning/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
!pip install -U spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


# Helpers.py





In [None]:
%%writefile helpers.py

# helpers.py

import torch
import spacy
import sacrebleu
from typing import List


# -------------------------------------------------
# Load spaCy models ONCE (important for speed)
# -------------------------------------------------
# Module-level pipelines shared by tokenize_de (German) and tokenize_en
# (English) so that each call reuses the already-loaded model.
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_de(text: str):
    """Return the lowercased token strings spaCy's German pipeline finds in ``text``."""
    doc = spacy_ger(text)
    return [token.text.lower() for token in doc]


def tokenize_en(text: str):
    """Return the lowercased token strings spaCy's English pipeline finds in ``text``."""
    doc = spacy_eng(text)
    return [token.text.lower() for token in doc]


# -------------------------------------------------
# Translate a single sentence
# -------------------------------------------------
def translate_sentence(
    model,
    sentence,
    german_vocab,
    english_vocab,
    device,
    max_length: int = 50,
):
    """Greedily decode a translation for a single source sentence.

    Args:
        model: Object exposing ``eval()``, ``encoder(src)`` returning
            ``(hidden, cell)`` and ``decoder(prev, hidden, cell)`` returning
            ``(output, hidden, cell)``. It is switched to eval mode and not
            switched back.
        sentence: Either a raw string (tokenized with ``tokenize_de``) or an
            iterable of token strings, each of which is lowercased.
        german_vocab: Source vocabulary, indexed as ``vocab[token]``.
        english_vocab: Target vocabulary, indexed as ``vocab[token]`` and
            inverted with ``lookup_token(idx)``.
        device: Device the index tensors are moved to.
        max_length: Upper bound on the number of decoding steps.

    Returns:
        The predicted target tokens without the leading ``<sos>``. The
        trailing ``<eos>`` is included when the decoder produced it before
        ``max_length`` steps were used up.
    """
    model.eval()

    if isinstance(sentence, str):
        words = tokenize_de(sentence)
    else:
        words = [word.lower() for word in sentence]

    # Wrap with the boundary markers and map to indices; shape (seq_len, 1).
    source_ids = [german_vocab[word] for word in ["<sos>", *words, "<eos>"]]
    source = torch.LongTensor(source_ids).unsqueeze(1).to(device)

    eos_idx = english_vocab["<eos>"]
    decoded = [english_vocab["<sos>"]]

    with torch.no_grad():
        hidden, cell = model.encoder(source)

        for _ in range(max_length):
            # Feed the most recent prediction back in as the next input.
            step_input = torch.LongTensor([decoded[-1]]).to(device)
            logits, hidden, cell = model.decoder(step_input, hidden, cell)

            next_idx = logits.argmax(1).item()
            decoded.append(next_idx)

            if next_idx == eos_idx:
                break

    # decoded[0] is the seed <sos>, which is not part of the translation.
    return [english_vocab.lookup_token(idx) for idx in decoded[1:]]


# -------------------------------------------------
# BLEU score (modern replacement of torchtext BLEU)
# -------------------------------------------------
def bleu_score_dataset(
    dataset,
    model,
    german_vocab,
    english_vocab,
    device,
    max_length: int = 50,
):
    """Compute corpus-level BLEU of ``model`` translations over ``dataset``.

    Args:
        dataset: Iterable of examples. Each example is either a flat mapping
            with ``"de"`` and ``"en"`` keys, or a mapping whose
            ``"translation"`` entry holds such a mapping.
        model: Seq2seq model passed through to ``translate_sentence``.
        german_vocab: Source vocabulary passed to ``translate_sentence``.
        english_vocab: Target vocabulary passed to ``translate_sentence``.
        device: Device passed to ``translate_sentence``.
        max_length: Maximum number of decoding steps per sentence.

    Returns:
        The sacreBLEU corpus score as a float, or ``0.0`` when ``dataset``
        yields no examples.
    """
    predictions: List[str] = []
    references: List[str] = []

    for example in dataset:
        # Accept both the flat {"de", "en"} layout and the nested
        # {"translation": {"de", "en"}} layout.
        pair = example["translation"] if "translation" in example else example
        src_sentence = pair["de"]
        trg_sentence = pair["en"]

        pred_tokens = translate_sentence(
            model,
            src_sentence,
            german_vocab,
            english_vocab,
            device,
            max_length,
        )

        # Remove <eos> (and anything after it) if present
        if "<eos>" in pred_tokens:
            pred_tokens = pred_tokens[: pred_tokens.index("<eos>")]

        predictions.append(" ".join(pred_tokens))
        references.append(trg_sentence)

    if not predictions:
        return 0.0

    # corpus_bleu expects a list of reference *streams*, each aligned with the
    # hypotheses; there is one reference per sentence, hence one stream.
    # The tokenizers lowercase every token, so predictions are always
    # lowercase; score case-insensitively to avoid penalizing that.
    bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True)
    return bleu.score


# -------------------------------------------------
# Checkpoint helpers (unchanged, still correct)
# -------------------------------------------------
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then write ``state`` to ``filename`` via ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from an already-loaded checkpoint.

    Args:
        checkpoint: Mapping with a ``"state_dict"`` entry for the model and an
            ``"optimizer"`` entry for the optimizer.
        model: Object whose ``load_state_dict`` receives
            ``checkpoint["state_dict"]``.
        optimizer: Object whose ``load_state_dict`` receives
            ``checkpoint["optimizer"]``.

    Raises:
        KeyError: If either expected entry is missing from ``checkpoint``.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])



# Imports

In [None]:
!pip uninstall -y torch torchtext


In [None]:
!pip install torch==2.2.2 torchtext==0.17.2


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import spacy
import numpy as np

from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

from helpers import translate_sentence, bleu_score_dataset, save_checkpoint, load_checkpoint



In [None]:
import sys
print(sys.path)


In [None]:
!pip install sacrebleu


# Tokenizer

In [None]:
# Load the German and English spaCy pipelines once; tokenize_ger and
# tokenize_eng below reuse these module-level objects on every call.
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

def tokenize_ger(text):
    """Return the lowercased token strings spaCy's German pipeline finds in ``text``."""
    doc = spacy_ger(text)
    return [token.text.lower() for token in doc]

def tokenize_eng(text):
    """Return the lowercased token strings spaCy's English pipeline finds in ``text``."""
    doc = spacy_eng(text)
    return [token.text.lower() for token in doc]


# Load Multi30k

In [None]:
from datasets import load_dataset

# Fetch the "bentrevett/multi30k" dataset and bind its three splits to
# module-level names used by the vocabulary, loader and evaluation cells.
dataset = load_dataset("bentrevett/multi30k")

train_data = dataset["train"]
valid_data = dataset["validation"]
test_data  = dataset["test"]


In [None]:
# Special tokens registered in both vocabularies: unknown word, padding,
# start-of-sentence and end-of-sentence markers.
SPECIALS = ["<unk>", "<pad>", "<sos>", "<eos>"]

def yield_tokens(data, lang, tokenizer):
    """Lazily yield ``tokenizer(example[lang])`` for every example in ``data``.

    Args:
        data: Iterable of mappings keyed by language code.
        lang: Key selecting which field of each example to tokenize.
        tokenizer: Callable applied to the selected field.
    """
    yield from (tokenizer(record[lang]) for record in data)

# Source-side vocabulary, built from the German training sentences only.
# Tokens must occur at least twice (min_freq=2) and the vocabulary size is
# capped with max_tokens=10000.
german_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, "de", tokenize_ger),
    specials=SPECIALS,
    min_freq=2,
    max_tokens=10000,
)
# Lookups of tokens that are not in the vocabulary return the <unk> index.
german_vocab.set_default_index(german_vocab["<unk>"])

# Target-side vocabulary, built the same way from the English training
# sentences.
english_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, "en", tokenize_eng),
    specials=SPECIALS,
    min_freq=2,
    max_tokens=10000,
)
# Lookups of tokens that are not in the vocabulary return the <unk> index.
english_vocab.set_default_index(english_vocab["<unk>"])


In [None]:
def collate_fn(batch):
    """Convert a list of ``{"de", "en"}`` examples into padded index tensors.

    Each sentence is tokenized, wrapped in ``<sos>``/``<eos>``, mapped to
    vocabulary indices, and the batch is padded with the ``<pad>`` index of
    the matching vocabulary.

    Returns:
        ``(src_batch, trg_batch)`` as produced by ``pad_sequence`` with its
        default layout (sequence dimension first).
    """

    def encode(text, tokenizer, vocab):
        # One sentence -> 1-D tensor of indices including boundary markers.
        tokens = ["<sos>", *tokenizer(text), "<eos>"]
        return torch.tensor([vocab[token] for token in tokens])

    sources = [encode(item["de"], tokenize_ger, german_vocab) for item in batch]
    targets = [encode(item["en"], tokenize_eng, english_vocab) for item in batch]

    return (
        pad_sequence(sources, padding_value=german_vocab["<pad>"]),
        pad_sequence(targets, padding_value=english_vocab["<pad>"]),
    )


In [None]:
# Number of sentence pairs per batch, shared by all three loaders.
batch_size = 64

# Only the training loader shuffles; validation and test keep dataset order.
# collate_fn tokenizes, numericalizes and pads each batch.
train_loader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)

valid_loader = DataLoader(
    valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

test_loader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)


In [None]:
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that summarizes a source sequence.

    Args:
        input_size: Number of rows in the embedding table (source vocab size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, used on the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.dropout = nn.Dropout(p=p)

    def forward(self, x):
        """Encode index tensor ``x`` and return the final ``(hidden, cell)``.

        The per-step LSTM outputs are discarded; only the final states are
        returned.
        """
        token_vectors = self.embedding(x)
        token_vectors = self.dropout(token_vectors)
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell


In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to vocabulary scores.

    Args:
        input_size: Number of rows in the embedding table (target vocab size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
        output_size: Number of scores produced per step by the final linear layer.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, used on the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=p)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        ``x`` holds one token index per batch element; a leading length-1
        sequence dimension is added before the LSTM and removed from the
        scores afterwards.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has one row
            of ``output_size`` scores per batch element.
        """
        step_input = x.unsqueeze(0)
        embedded = self.embedding(step_input)
        embedded = self.dropout(embedded)
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        scores = self.fc(rnn_out)
        return scores.squeeze(0), hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module mapping a source batch to ``(hidden, cell)``.
        decoder: Module mapping ``(prev_tokens, hidden, cell)`` to
            ``(scores, hidden, cell)``. It must expose its output projection
            as ``fc`` (an ``nn.Linear``), which is used to size the output
            buffer.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step vocabulary scores for ``target``.

        Args:
            source: Source index tensor; its second dimension is the batch size.
            target: Target index tensor; its first dimension is the target
                length and ``target[0]`` seeds the decoder.
            teacher_force_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token instead of the
                decoder's own argmax prediction.

        Returns:
            Tensor of shape ``(trg_len, batch_size, vocab_size)`` on the same
            device as ``source``. Row 0 is left as zeros because decoding
            starts at step 1.
        """
        batch_size = source.shape[1]
        trg_len = target.shape[0]
        # Take the vocabulary size from the decoder itself and the device from
        # the input, rather than from notebook-level globals, so the module
        # works wherever it is imported or moved.
        trg_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            trg_len, batch_size, trg_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)
        x = target[0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Positional arguments after the vocabulary size(s): embedding_size=300,
# hidden_size=1024, num_layers=2, dropout p=0.5. The decoder additionally
# takes len(english_vocab) as its output_size.
encoder = Encoder(len(german_vocab), 300, 1024, 2, 0.5).to(device)
decoder = Decoder(len(english_vocab), 300, 1024, len(english_vocab), 2, 0.5).to(device)

model = Seq2Seq(encoder, decoder).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Target positions holding the <pad> index do not contribute to the loss.
pad_idx = english_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# TensorBoard writer plus a global step counter advanced once per batch by
# the training loop.
writer = SummaryWriter("runs/loss_plot")
step = 0


In [None]:
# Number of full passes over train_loader.
num_epochs = 20

for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    print(f"[Epoch {epoch+1}/{num_epochs}]")

    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)

        output = model(src, trg)
        # Seq2Seq.forward never fills output[0] (decoding starts at t=1), so
        # drop the first step from both scores and targets, then flatten the
        # remaining steps and batch entries into one axis for the loss.
        output = output[1:].reshape(-1, output.shape[2])
        trg = trg[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, trg)
        loss.backward()
        # Clip the total gradient norm to 1 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        # Log the per-batch training loss against the global step counter.
        writer.add_scalar("Loss/train", loss.item(), step)
        step += 1


[Epoch 3/20]


In [None]:
# Slicing a Hugging Face Dataset (test_data[:100]) returns a dict of columns,
# so iterating over it would yield column names instead of examples.
# select() returns a Dataset that still iterates example by example.
num_eval_examples = min(100, len(test_data))
bleu = bleu_score_dataset(
    test_data.select(range(num_eval_examples)),
    model, german_vocab, english_vocab, device
)
print(f"BLEU score: {bleu:.2f}")
