In [1]:
import re
import random
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Vocab:
    def __init__(self, tokens, min_freq=1):
        counter = Counter(tokens)
        self.token_to_idx = {
            '<pad>': 0,
            '<sos>': 1,
            '<eos>': 2,
            '<unk>': 3
        }
        for token, freq in counter.items():
            if freq >= min_freq and token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.token_to_idx)
        self.idx_to_token = {idx: tok for tok, idx in self.token_to_idx.items()}

    def __len__(self):
        return len(self.token_to_idx)

    def encode(self, tokens):
        return [self.token_to_idx.get(tok, self.token_to_idx['<unk>']) for tok in tokens]

    def decode(self, indices):
        return [self.idx_to_token.get(idx, '<unk>') for idx in indices]


def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", r" ", sentence)
    sentence = re.sub(r"\s+", " ", sentence).strip()
    return word_tokenize(sentence)


def load_dataset(file_path, num_samples=None):
    with open(file_path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    if num_samples:
        lines = lines[:num_samples]

    input_sentences = []
    target_sentences = []

    for line in lines:
        en, es = line.split('\t')
        en_tokens = ['<sos>'] + preprocess_sentence(en) + ['<eos>']
        es_tokens = ['<sos>'] + preprocess_sentence(es) + ['<eos>']
        input_sentences.append(en_tokens)
        target_sentences.append(es_tokens)

    return input_sentences, target_sentences


def build_dataset(file_path='spa.txt', num_samples=10000):
    inputs, targets = load_dataset(file_path, num_samples)

    input_vocab = Vocab([tok for sent in inputs for tok in sent])
    target_vocab = Vocab([tok for sent in targets for tok in sent])

    input_indices = [torch.tensor(input_vocab.encode(s)) for s in inputs]
    target_indices = [torch.tensor(target_vocab.encode(s)) for s in targets]

    input_padded = pad_sequence(input_indices, batch_first=True, padding_value=input_vocab.token_to_idx['<pad>'])
    target_padded = pad_sequence(target_indices, batch_first=True, padding_value=target_vocab.token_to_idx['<pad>'])

    total = len(input_padded)
    train_end = int(0.8 * total)
    val_end = int(0.9 * total)

    inp_train = input_padded[:train_end]
    targ_train = target_padded[:train_end]

    inp_val = input_padded[train_end:val_end]
    targ_val = target_padded[train_end:val_end]

    inp_test = input_padded[val_end:]
    targ_test = target_padded[val_end:]

    return inp_train, inp_val, inp_test, targ_train, targ_val, targ_test, input_vocab, target_vocab

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [2]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers, dropout=dropout, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, num_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))  # [batch_size, output_dim]
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.shape
        trg_vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[:, 0]  # <sos>

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output

            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm


def train_model(model, dataloader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0

    for src, trg in tqdm(dataloader, desc="Training"):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        output = model(src, trg)  # [batch_size, trg_len, output_dim]
        output_dim = output.shape[-1]

        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [4]:
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0.0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [5]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize


def compute_bleu(model, data_loader, input_vocab, target_vocab):
    model.eval()
    total_score = 0
    count = 0
    smoothie = SmoothingFunction().method4

    for src, trg in data_loader:
        src, trg = src.to(device), trg.to(device)
        for i in range(src.size(0)):
            src_sentence = input_vocab.decode(src[i].tolist())
            trg_sentence = target_vocab.decode(trg[i].tolist())

            trg_sentence = [tok for tok in trg_sentence if tok not in ['<pad>', '<sos>', '<eos>']]
            predicted = translate_sentence(model, ' '.join(src_sentence[1:-1]), input_vocab, target_vocab, device)
            predicted_tokens = word_tokenize(predicted)

            score = sentence_bleu(
                [trg_sentence],
                predicted_tokens,
                smoothing_function=smoothie,
                weights=(0.25, 0.25, 0.25, 0.25)
            )
            total_score += score
            count += 1

    return total_score / count if count > 0 else 0

In [6]:
def translate_sentence(model, sentence, input_vocab, target_vocab, device, max_len=50):
    model.eval()
    tokens = ['<sos>'] + preprocess_sentence(sentence) + ['<eos>']
    numericalized = [input_vocab.token_to_idx.get(tok, input_vocab.token_to_idx['<unk>']) for tok in tokens]
    tensor = torch.tensor(numericalized, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(tensor)

    outputs = []
    input_tok = torch.tensor([target_vocab.token_to_idx['<sos>']], dtype=torch.long).to(device)

    for _ in range(max_len):
        with torch.no_grad():
            output, hidden, cell = model.decoder(input_tok, hidden, cell)
            top1 = output.argmax(1)
            outputs.append(top1.item())

            if top1.item() == target_vocab.token_to_idx['<eos>']:
                break
            input_tok = top1

    return ' '.join(target_vocab.decode(outputs))

In [7]:
# Load data with test split
inp_train, inp_val, inp_test, targ_train, targ_val, targ_test, input_vocab, target_vocab = build_dataset(
    '/kaggle/input/seq2seqdata/spa.txt', num_samples=10000
)

# DataLoaders
BATCH_SIZE = 64
train_dl = DataLoader(TensorDataset(inp_train, targ_train), batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(TensorDataset(inp_val, targ_val), batch_size=BATCH_SIZE)
test_dl = DataLoader(TensorDataset(inp_test, targ_test), batch_size=1)  # For BLEU, use batch_size=1

# Model Setup
INPUT_DIM = len(input_vocab)
OUTPUT_DIM = len(target_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab.token_to_idx['<pad>'])

# Train
N_EPOCHS = 25
for epoch in range(N_EPOCHS):
    train_loss = train_model(model, train_dl, optimizer, criterion, clip=1, device=device)
    val_loss = evaluate_model(model, val_dl, criterion, device=device)
    print(f"Epoch {epoch+1}/{N_EPOCHS} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

# ✅ Evaluate on the test set
bleu = compute_bleu(model, test_dl, input_vocab, target_vocab)
print(f"\n📏 BLEU score on test set: {bleu:.4f}")

# Translate
print("\n Sample Translations:")
sample_sentences = ["Hello.", "I am fine.", "How are you?", "Where is the bathroom?"]
for sentence in sample_sentences:
    translation = translate_sentence(model, sentence, input_vocab, target_vocab, device)
    print(f"{sentence} → {translation}")

Training: 100%|██████████| 125/125 [00:04<00:00, 27.26it/s]


Epoch 1/25 | Train Loss: 4.616 | Val Loss: 4.731


Training: 100%|██████████| 125/125 [00:03<00:00, 32.30it/s]


Epoch 2/25 | Train Loss: 3.774 | Val Loss: 4.419


Training: 100%|██████████| 125/125 [00:03<00:00, 32.40it/s]


Epoch 3/25 | Train Loss: 3.309 | Val Loss: 4.251


Training: 100%|██████████| 125/125 [00:03<00:00, 32.73it/s]


Epoch 4/25 | Train Loss: 2.956 | Val Loss: 4.075


Training: 100%|██████████| 125/125 [00:03<00:00, 31.96it/s]


Epoch 5/25 | Train Loss: 2.605 | Val Loss: 4.062


Training: 100%|██████████| 125/125 [00:03<00:00, 32.33it/s]


Epoch 6/25 | Train Loss: 2.318 | Val Loss: 4.086


Training: 100%|██████████| 125/125 [00:03<00:00, 32.90it/s]


Epoch 7/25 | Train Loss: 2.055 | Val Loss: 4.021


Training: 100%|██████████| 125/125 [00:03<00:00, 32.55it/s]


Epoch 8/25 | Train Loss: 1.807 | Val Loss: 4.040


Training: 100%|██████████| 125/125 [00:03<00:00, 32.64it/s]


Epoch 9/25 | Train Loss: 1.598 | Val Loss: 4.100


Training: 100%|██████████| 125/125 [00:03<00:00, 32.53it/s]


Epoch 10/25 | Train Loss: 1.385 | Val Loss: 4.136


Training: 100%|██████████| 125/125 [00:03<00:00, 32.36it/s]


Epoch 11/25 | Train Loss: 1.188 | Val Loss: 4.184


Training: 100%|██████████| 125/125 [00:03<00:00, 32.37it/s]


Epoch 12/25 | Train Loss: 1.035 | Val Loss: 4.233


Training: 100%|██████████| 125/125 [00:03<00:00, 31.30it/s]


Epoch 13/25 | Train Loss: 0.913 | Val Loss: 4.215


Training: 100%|██████████| 125/125 [00:03<00:00, 32.65it/s]


Epoch 14/25 | Train Loss: 0.784 | Val Loss: 4.289


Training: 100%|██████████| 125/125 [00:03<00:00, 32.30it/s]


Epoch 15/25 | Train Loss: 0.690 | Val Loss: 4.364


Training: 100%|██████████| 125/125 [00:03<00:00, 32.35it/s]


Epoch 16/25 | Train Loss: 0.632 | Val Loss: 4.348


Training: 100%|██████████| 125/125 [00:03<00:00, 32.35it/s]


Epoch 17/25 | Train Loss: 0.571 | Val Loss: 4.400


Training: 100%|██████████| 125/125 [00:03<00:00, 32.24it/s]


Epoch 18/25 | Train Loss: 0.522 | Val Loss: 4.493


Training: 100%|██████████| 125/125 [00:03<00:00, 32.16it/s]


Epoch 19/25 | Train Loss: 0.506 | Val Loss: 4.540


Training: 100%|██████████| 125/125 [00:03<00:00, 32.34it/s]


Epoch 20/25 | Train Loss: 0.476 | Val Loss: 4.509


Training: 100%|██████████| 125/125 [00:03<00:00, 31.64it/s]


Epoch 21/25 | Train Loss: 0.466 | Val Loss: 4.536


Training: 100%|██████████| 125/125 [00:03<00:00, 32.10it/s]


Epoch 22/25 | Train Loss: 0.440 | Val Loss: 4.580


Training: 100%|██████████| 125/125 [00:03<00:00, 31.84it/s]


Epoch 23/25 | Train Loss: 0.443 | Val Loss: 4.589


Training: 100%|██████████| 125/125 [00:03<00:00, 31.94it/s]


Epoch 24/25 | Train Loss: 0.432 | Val Loss: 4.573


Training: 100%|██████████| 125/125 [00:03<00:00, 31.69it/s]


Epoch 25/25 | Train Loss: 0.398 | Val Loss: 4.648

📏 BLEU score on test set: 0.0807

 Sample Translations:
Hello. → pagar . <eos>
I am fine. → soy muy carro . <eos>
How are you? → ¿ c mo est is ? <eos>
Where is the bathroom? → ¿ d nde est par s ? <eos>
