<a href="https://colab.research.google.com/github/CrazySoda/Machine-Learning/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Install Dependencies

In [None]:
!pip install -U spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


# Helpers.py





In [None]:
%%writefile helpers.py

# helpers.py

import torch
import spacy
import sacrebleu
from typing import List


# -------------------------------------------------
# Load spaCy models ONCE (important for speed)
# -------------------------------------------------

#Loads the small German and English spaCy pipelines once, at import time.
#tokenize_de / tokenize_en below read these two module-level objects.
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

#Converts a German sentence to lowercase tokens.
#Calls only the tokenizer component of the loaded pipeline, so the later
#annotation stages (tagging, parsing, ...) are not run just to split text.
def tokenize_de(text: str):
    """Return the lowercase spaCy tokens of the German string `text`."""
    return [tok.text.lower() for tok in spacy_ger.tokenizer(text)]


def tokenize_en(text: str):
    """Return the lowercase spaCy tokens of the English string `text`.

    Only the tokenizer component is invoked; the remaining pipeline stages
    are skipped because nothing here reads their annotations.
    """
    return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]


# -------------------------------------------------
# Translate a single sentence
# -------------------------------------------------
def translate_sentence(
    model,
    sentence,
    german_vocab,
    english_vocab,
    device,
    max_length: int = 50,
):
    """Greedily translate one German sentence into a list of English tokens.

    `sentence` is either a raw string (tokenized with `tokenize_de`) or an
    iterable of token strings (each one is lowercased). The model is put in
    eval mode and left in that mode. Decoding stops after `max_length` steps
    or as soon as "<eos>" is produced.

    Returns the generated tokens without the leading "<sos>"; the final
    "<eos>" is included when the decoder emitted it.
    """
    # Evaluation mode for inference
    model.eval()

    if isinstance(sentence, str):
        words = tokenize_de(sentence)
    else:
        words = [word.lower() for word in sentence]

    # Wrap the sentence in its boundary markers and map tokens to ids
    source_ids = [german_vocab[word] for word in ["<sos>", *words, "<eos>"]]

    # Column of ids, shape (seq_len, 1): a batch holding one sentence
    source = torch.LongTensor(source_ids).unsqueeze(1).to(device)

    eos_id = english_vocab["<eos>"]
    # Decoding starts from the <sos> id; every step appends one more id
    generated = [english_vocab["<sos>"]]

    with torch.no_grad():
        # Encoder returns the (hidden, cell) states that seed the decoder
        hidden, cell = model.encoder(source)

        for _ in range(max_length):
            # Feed back the most recently generated id
            last_id = torch.LongTensor([generated[-1]]).to(device)
            scores, hidden, cell = model.decoder(last_id, hidden, cell)

            # Greedy choice: highest-scoring vocabulary entry
            next_id = scores.argmax(1).item()
            generated.append(next_id)

            if next_id == eos_id:
                break

    # Skip the initial <sos> and turn the remaining ids back into words
    return [english_vocab.lookup_token(token_id) for token_id in generated[1:]]


# -------------------------------------------------
# BLEU score (modern replacement of torchtext BLEU)
# -------------------------------------------------
def bleu_score_dataset(
    dataset,
    model,
    german_vocab,
    english_vocab,
    device,
    max_length: int = 50,
):
    """Translate every example of `dataset` and return the corpus BLEU score.

    Each example is indexed with "de" (German source) and "en" (English
    reference). Sources are translated with `translate_sentence`; everything
    from the first "<eos>" onward is dropped from each prediction.

    Returns:
        The `score` attribute of the sacreBLEU result, or 0.0 when `dataset`
        yields no examples.
    """
    # Model translations, one string per example
    predictions = []
    # Ground-truth translations, one string per example (same order)
    references = []

    for example in dataset:
        pred_tokens = translate_sentence(
            model,
            example["de"],
            german_vocab,
            english_vocab,
            device,
            max_length,
        )

        # Keep only the tokens generated before <eos>
        if "<eos>" in pred_tokens:
            pred_tokens = pred_tokens[: pred_tokens.index("<eos>")]

        predictions.append(" ".join(pred_tokens))
        references.append(example["en"])

    # Nothing to score; avoid handing sacreBLEU an empty corpus
    if not predictions:
        return 0.0

    # sacreBLEU takes a list of reference *sets*, each a list aligned with the
    # hypotheses. There is one reference per sentence here, so the single set
    # is wrapped in an outer list. Passing one single-item list per sentence
    # instead would be read as many reference sets of length one.
    # NOTE(review): references are passed with their original casing; if the
    # model vocabulary is lowercase-only, consider `lowercase=True` - confirm.
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score



# -------------------------------------------------
# Checkpoint helpers (unchanged, still correct)
# -------------------------------------------------

#Checkpoint functions save the model and optimizer state so training can be resumed or the trained model can be reused later.
#Saves a snapshot of training to disk
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialize `state` to `filename`.

    `filename` is handed straight to `torch.save`.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)

#Restores model weights and optimizer state from an already-loaded checkpoint
#so that training can continue from where the checkpoint was taken
def load_checkpoint(checkpoint, model, optimizer):
    """Copy the saved states from `checkpoint` into `model` and `optimizer`.

    `checkpoint` is a mapping with the keys "state_dict" (model weights) and
    "optimizer" (optimizer state). Both objects are updated in place; nothing
    is returned.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])



Overwriting helpers.py


# Imports

In [None]:
!pip uninstall -y torch torchtext


In [None]:
!pip install torch==2.2.2 torchtext==0.17.2


In [None]:
import torch
import torch.nn as nn
#optim is an optimizer used to update model parameters (weights & biases) during training so that loss decreases
import torch.optim as optim
import random
import spacy
import numpy as np
#sacrebleu is a library that computes BLEU, the metric used here to judge translation quality
import sacrebleu

from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

from helpers import translate_sentence, bleu_score_dataset, save_checkpoint, load_checkpoint



In [None]:
#Debug aid: show the directories Python searches when importing modules
import sys
print(sys.path)


In [None]:
!pip install sacrebleu




#Tokenizer

In [None]:
#Loads the pretrained small German and English spaCy pipelines;
#tokenize_ger / tokenize_eng below read these two objects
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

#tokenizes
def tokenize_ger(text):
    """Return the lowercase spaCy tokens of the German string `text`.

    Only the tokenizer component is called, so the other pipeline stages are
    not run for every sentence when building the vocabulary and batches.
    """
    return [tok.text.lower() for tok in spacy_ger.tokenizer(text)]

def tokenize_eng(text):
    """Return the lowercase spaCy tokens of the English string `text`.

    Only the tokenizer component is called, so the other pipeline stages are
    not run for every sentence when building the vocabulary and batches.
    """
    return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]


#Load Multi30k

In [None]:
#Fetch the Multi30k German-English corpus through the `datasets` library
#(load_dataset is imported in the imports cell)
dataset = load_dataset("bentrevett/multi30k")

#Pull out the three splits in one go
train_data, valid_data, test_data = (
    dataset[split] for split in ("train", "validation", "test")
)


In [None]:
#Special tokens passed as `specials` to both vocabularies:
#unknown word, padding, start of sentence, end of sentence
SPECIALS = ["<unk>", "<pad>", "<sos>", "<eos>"]

#iterates over the dataset, extracts the sentence and tokenizes it
def yield_tokens(data, lang, tokenizer):
    """Lazily yield `tokenizer(example[lang])` for every example in `data`."""
    yield from (tokenizer(example[lang]) for example in data)
#Settings shared by both vocabularies: reserve the special tokens, keep only
#words seen at least twice in training, and cap the size at 10000 entries
vocab_options = {"specials": SPECIALS, "min_freq": 2, "max_tokens": 10000}

#German vocabulary: every kept training-set token gets an integer id
german_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, "de", tokenize_ger),
    **vocab_options,
)
#Words that are not in the vocabulary map to <unk>
german_vocab.set_default_index(german_vocab["<unk>"])

#English vocabulary, built the same way
english_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, "en", tokenize_eng),
    **vocab_options,
)
english_vocab.set_default_index(english_vocab["<unk>"])


#Collate function(Replaces BucketIterators)

In [None]:
#collate_fn converts a batch of raw German–English sentence pairs into padded
#numerical tensors suitable for batch training in a Seq2Seq model.

def collate_fn(batch):
    """Turn a list of {"de", "en"} examples into two padded id tensors.

    Every sentence is tokenized, wrapped in "<sos>" / "<eos>" and mapped to
    vocabulary ids. The sequences are then padded with the "<pad>" id of their
    language so that each side forms one rectangular tensor.

    Returns:
        (src_batch, trg_batch) as produced by `pad_sequence`.
    """

    def encode(text, tokenize, vocab):
        # sentence -> tensor of ids, including the boundary markers
        tokens = ["<sos>", *tokenize(text), "<eos>"]
        return torch.tensor([vocab[token] for token in tokens])

    # Sequences still have different lengths at this point
    src_seqs = [encode(ex["de"], tokenize_ger, german_vocab) for ex in batch]
    trg_seqs = [encode(ex["en"], tokenize_eng, english_vocab) for ex in batch]

    # Pad to the longest sequence of the batch
    src_batch = pad_sequence(src_seqs, padding_value=german_vocab["<pad>"])
    trg_batch = pad_sequence(trg_seqs, padding_value=english_vocab["<pad>"])

    return src_batch, trg_batch

    #returns (src_batch, trg_batch); each is padded to the longest sentence in the batch, shape (max_len, batch_size)


#Data Loaders

In [None]:
batch_size = 64

#Arguments common to all three loaders
loader_options = {"batch_size": batch_size, "collate_fn": collate_fn}

#Only the training split is shuffled; validation and test keep their order
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_data, shuffle=False, **loader_options)
test_loader = DataLoader(test_data, shuffle=False, **loader_options)


#Model

#Encoder
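Neural networks cannot translate raw text directly, so each sentence is first converted into token ids.
The encoder reads the source (German) sentence token by token.
It learns the context and the order of the words.
It stores the meaning of the sentence in its final hidden and cell states.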
### different parameters for encoder
input_size : Size of German vocabulary

embedding_size : Dimension of word embeddings

hidden_size : LSTM hidden state size

num_layers : Number of stacked LSTM layers

p : Dropout probability

In [None]:
#pytorch neural network module
class Encoder(nn.Module):
    """LSTM encoder that reduces a source sequence to its final LSTM states.

    Args:
        input_size: number of entries in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: width of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        # Lookup table: token id -> dense vector
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        # Sequence-first LSTM (batch_first is left at its default)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        # Regularization applied to the embedded tokens
        self.dropout = nn.Dropout(p=p)

    def forward(self, x):
        """Encode `x`, a (seq_length, batch_size) tensor of token ids.

        Returns the final `(hidden, cell)` states of the LSTM; the per-step
        outputs are discarded.
        """
        token_vectors = self.embedding(x)
        _, final_states = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_states
        return hidden, cell


#Decoder
##The decoder generates the target (English) sentence one word at a time using the encoder’s final hidden state and the previously generated word.

#Different Params
input_size->English vocabulary size

embedding_size->Word embedding dimension

hidden_size->LSTM hidden state size

output_size->English vocabulary size

num_layers->Number of LSTM layers

p->Dropout probability

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores every word of the output vocabulary.

    Args:
        input_size: number of entries in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: width of the LSTM hidden and cell states.
        output_size: number of scores produced per step.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        # Lookup table: token id -> dense vector
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        # Projects each LSTM output onto one logit per vocabulary entry
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=p)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        `x` holds one token id per batch element, shape (batch_size,);
        `hidden` and `cell` are the LSTM states from the previous step.

        Returns `(predictions, hidden, cell)` where `predictions` has one row
        of vocabulary scores per batch element.
        """
        # Add a length-1 time axis: (batch_size) -> (1, batch_size)
        step_input = x[None]
        # (1, batch_size, embedding_size)
        embedded = self.dropout(self.embedding(step_input))
        step_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Remove the time axis again after projecting to vocabulary scores
        predictions = self.fc(step_output).squeeze(0)
        return predictions, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that produces the target one step at a time.

    The encoder's final states initialize the decoder. At each step the
    decoder is fed either the ground-truth token (teacher forcing) or its own
    previous best guess.

    The decoder must expose its output projection as `decoder.fc`; the size of
    the score dimension is read from it rather than from a global vocabulary.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return decoder scores for every target position.

        Args:
            source: (src_len, batch_size) tensor of source token ids.
            target: (trg_len, batch_size) tensor of target token ids; row 0 is
                used as the first decoder input.
            teacher_force_ratio: probability, drawn once per step for the whole
                batch, of feeding the ground-truth token instead of the
                decoder's own prediction.

        Returns:
            (trg_len, batch_size, vocab_size) tensor. Row 0 stays all zeros
            because decoding starts at position 1.
        """
        batch_size = source.shape[1]
        trg_len = target.shape[0]
        # Taken from the model itself so the class does not depend on
        # notebook-level variables being defined
        trg_vocab_size = self.decoder.fc.out_features

        # Allocate on the same device as the input batch
        outputs = torch.zeros(
            trg_len, batch_size, trg_vocab_size, device=source.device
        )

        # Summarize the source sentence into the initial decoder states
        hidden, cell = self.encoder(source)

        # First decoder input: row 0 of the target (the <sos> tokens)
        x = target[0]

        for t in range(1, trg_len):
            # One LSTM step: scores for position t plus updated states
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            # Teacher forcing: sometimes feed the truth, sometimes the guess
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


#Training Setup

In [1]:
#Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Model hyperparameters
embedding_size = 300   #dimension of the word embeddings
hidden_size = 1024     #LSTM hidden state size
num_layers = 2         #number of stacked LSTM layers
dropout_p = 0.5        #dropout probability

#The encoder embeds German ids; the decoder embeds and predicts English ids
encoder = Encoder(
    len(german_vocab), embedding_size, hidden_size, num_layers, dropout_p
).to(device)
decoder = Decoder(
    len(english_vocab),
    embedding_size,
    hidden_size,
    len(english_vocab),
    num_layers,
    dropout_p,
).to(device)

#Combine both halves into the full translation model
model = Seq2Seq(encoder, decoder).to(device)

#Adam optimizer over all model parameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

#Cross-entropy loss that skips padded target positions
pad_idx = english_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#TensorBoard logger and the global batch counter used as its x-axis
writer = SummaryWriter("runs/loss_plot")
step = 0


SyntaxError: invalid syntax (ipython-input-454012560.py, line 3)

#Training Loop

In [None]:
num_epochs = 20

for epoch in range(num_epochs):
    #Switch the model to training mode at the start of every epoch
    model.train()
    print(f"[Epoch {epoch+1}/{num_epochs}]")

    #One optimization step per padded batch from the DataLoader
    for src, trg in train_loader:
        src = src.to(device)
        trg = trg.to(device)

        #Reset the gradients left over from the previous batch
        optimizer.zero_grad()

        #Forward pass: scores of shape (trg_len, batch_size, vocab_size)
        output = model(src, trg)

        #Position 0 is never predicted, so drop it and merge the time and
        #batch axes to get one row of scores per target token
        output = output[1:].flatten(0, 1)
        trg = trg[1:].flatten()

        loss = criterion(output, trg)
        #Backpropagate through the whole model
        loss.backward()
        #Rescale the gradients when their total norm exceeds 1
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        #Apply the parameter update
        optimizer.step()

        #Log the per-batch loss against the global step counter
        writer.add_scalar("Loss/train", loss.item(), step)
        step += 1


[Epoch 3/20]
[Epoch 4/20]
[Epoch 5/20]
[Epoch 6/20]
[Epoch 7/20]
[Epoch 8/20]
[Epoch 9/20]
[Epoch 10/20]
[Epoch 11/20]
[Epoch 12/20]
[Epoch 13/20]
[Epoch 14/20]
[Epoch 15/20]
[Epoch 16/20]
[Epoch 17/20]
[Epoch 18/20]
[Epoch 19/20]
[Epoch 20/20]


#BLEU evaluation

In [None]:
#Score only the first 100 test examples to keep the evaluation quick
eval_subset = test_data.select(range(100))

bleu = bleu_score_dataset(eval_subset, model, german_vocab, english_vocab, device)
print(f"BLEU score: {bleu:.2f}")



BLEU score: 38.14


#Little Debugging(Ignore)

In [None]:
!sed -n '1,200p' helpers.py



# helpers.py

import torch
import spacy
import sacrebleu
from typing import List


# -------------------------------------------------
# Load spaCy models ONCE (important for speed)
# -------------------------------------------------
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_de(text: str):
    return [tok.text.lower() for tok in spacy_ger(text)]


def tokenize_en(text: str):
    return [tok.text.lower() for tok in spacy_eng(text)]


# -------------------------------------------------
# Translate a single sentence
# -------------------------------------------------
def translate_sentence(
    model,
    sentence,
    german_vocab,
    english_vocab,
    device,
    max_length: int = 50,
):
    model.eval()

    # Tokenize input
    if isinstance(sentence, str):
        tokens = tokenize_de(sentence)
    else:
        tokens = [tok.lower() for tok in sentence]

    # Add <sos> and <eos>
    tokens = ["<sos>"] + tokens + ["<eos>"]



In [None]:
#Reload helpers.py so that changes written by the %%writefile cell take effect,
#then re-import the names so they point at the reloaded definitions
import importlib
import helpers
importlib.reload(helpers)

from helpers import translate_sentence, bleu_score_dataset, save_checkpoint, load_checkpoint
