In [7]:
file_path = "Sentence pairs in English-Spanish.tsv"

# Collect [english, spanish] pairs. A row is kept only when it splits into
# exactly four tab-separated fields and both sentence fields (index 1 for
# English, index 3 for Spanish) are non-empty after trimming whitespace.
valid_rows = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) != 4:
            # Wrong number of columns: skip the row.
            continue
        en, es = parts[1].strip(), parts[3].strip()
        if not (en and es):
            # One side of the pair is blank: skip the row.
            continue
        valid_rows.append([en, es])

print(f"Loaded {len(valid_rows):,} English-Spanish sentence pairs.")

Loaded 265,512 English-Spanish sentence pairs.


In [8]:
import pandas as pd

# One row per sentence pair: column "en" holds the English text and
# column "es" holds the Spanish text.
df = pd.DataFrame(data=valid_rows, columns=["en", "es"])

# Preview the first five pairs.
df.head(5)

Unnamed: 0,en,es
0,Let's try something.,¡Intentemos algo!
1,I have to go to sleep.,Tengo que irme a dormir.
2,Today is June 18th and it is Muiriel's birthday!,¡Hoy es 18 de junio y es el cumpleaños de Muir...
3,Today is June 18th and it is Muiriel's birthday!,¡Hoy es el 18 de junio y es el cumpleaños de M...
4,Muiriel is 20 now.,"Ahora, Muiriel tiene 20 años."


In [9]:
# Convert the sentence pairs into token-ID sequences for the transformer
from collections import Counter
import torch

def build_vocab(sentences, max_vocab=5000):
    """Build a token -> integer-id vocabulary from whitespace-split text.

    Sentences are lower-cased and split on whitespace. Ids 0-3 are reserved
    for the special tokens ``<pad>``, ``<unk>``, ``<bos>`` and ``<eos>``;
    the remaining ids are handed out to the most frequent tokens, in
    descending frequency order, starting at 4.

    Args:
        sentences: Iterable of strings.
        max_vocab: Upper bound on the vocabulary size *including* the four
            special tokens. Values below 4 yield only the special tokens.

    Returns:
        dict mapping each token string to a unique, contiguous integer id.
    """
    vocab = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3}
    reserved = frozenset(vocab)

    counter = Counter()
    for sent in sentences:
        # A corpus token that happens to spell a reserved name (e.g. the
        # literal text "<PAD>") must not be counted: it would otherwise be
        # assigned a fresh id below and overwrite the reserved one.
        counter.update(tok for tok in sent.lower().split() if tok not in reserved)

    # Number of slots left for real words; never negative.
    word_budget = max(0, max_vocab - len(vocab))
    for i, (word, _) in enumerate(counter.most_common(word_budget), len(vocab)):
        vocab[word] = i
    return vocab

def encode_sentence(sentence, vocab):
    """Turn a sentence into a list of token ids framed by <bos> and <eos>.

    The sentence is lower-cased and split on whitespace; any token missing
    from ``vocab`` is replaced by the id of ``<unk>``.

    Args:
        sentence: Text to encode.
        vocab: Mapping from token string to integer id; must contain the
            ``<bos>`` and ``<eos>`` entries, and ``<unk>`` whenever the
            sentence has at least one token.

    Returns:
        list of ids: ``[<bos>, token ids..., <eos>]``.
    """
    words = sentence.lower().split()
    body = []
    for word in words:
        body.append(vocab.get(word, vocab["<unk>"]))
    return [vocab["<bos>"], *body, vocab["<eos>"]]

# Each language gets its own vocabulary, built from its own column.
en_vocab = build_vocab(df["en"])
es_vocab = build_vocab(df["es"])

# One (english_ids, spanish_ids) tuple per row, in DataFrame order.
encoded_pairs = [
    (encode_sentence(en_text, en_vocab), encode_sentence(es_text, es_vocab))
    for en_text, es_text in zip(df["en"], df["es"])
]



In [10]:
#Small Transformer to stay within 100 neural unit limit
import torch.nn as nn

class SmallTransformer(nn.Module):
    """Minimal encoder-decoder Transformer for sequence-to-sequence tasks.

    Source and target token ids are embedded separately, given sinusoidal
    positional encodings, passed through ``nn.Transformer`` and projected
    onto the target vocabulary.

    Args:
        src_vocab_size: Number of distinct source token ids.
        tgt_vocab_size: Number of distinct target token ids (also the size
            of the output logits).
        emb_size: Embedding / model width (``d_model``).
        nhead: Number of attention heads; must divide ``emb_size``.
        dim_feedforward: Hidden width of the feed-forward sub-layers.
        num_layers: Number of encoder layers and of decoder layers.

    Note:
        Padding positions are not masked; every position in ``src`` and
        ``tgt`` takes part in attention.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, emb_size=32, nhead=2, dim_feedforward=64, num_layers=1):
        super().__init__()
        self.emb_size = emb_size
        self.src_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_emb = nn.Embedding(tgt_vocab_size, emb_size)
        # batch_first=True lets forward() work on (batch, seq, emb) tensors
        # directly instead of permuting to and from sequence-first layout.
        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead,
                                          num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers,
                                          dim_feedforward=dim_feedforward,
                                          batch_first=True)
        self.fc = nn.Linear(emb_size, tgt_vocab_size)

    def _positional_encoding(self, length, device):
        """Return a (length, emb_size) sinusoidal position table.

        Computed on the fly so the module gains no parameters or buffers
        and sequences of any length are supported.
        """
        position = torch.arange(length, dtype=torch.float32, device=device).unsqueeze(1)
        even_dims = torch.arange(0, self.emb_size, 2, dtype=torch.float32, device=device)
        angles = position / torch.pow(10000.0, even_dims / self.emb_size)
        encoding = torch.zeros(length, self.emb_size, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd column than even ones.
        encoding[:, 1::2] = torch.cos(angles)[:, : self.emb_size // 2]
        return encoding

    def forward(self, src, tgt):
        """Compute target-vocabulary logits.

        Args:
            src: Integer tensor of source token ids, shape (batch, src_len).
            tgt: Integer tensor of target token ids, shape (batch, tgt_len).

        Returns:
            Tensor of shape (batch, tgt_len, tgt_vocab_size).
        """
        # Only the decoder is causally masked. The encoder must be able to
        # attend to the whole source sentence, so no src_mask is passed.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        src_vec = self.src_emb(src)
        tgt_vec = self.tgt_emb(tgt)
        # Attention is order-agnostic by itself; inject token positions.
        src_vec = src_vec + self._positional_encoding(src.size(1), src.device).to(src_vec.dtype)
        tgt_vec = tgt_vec + self._positional_encoding(tgt.size(1), tgt.device).to(tgt_vec.dtype)

        output = self.transformer(src_vec, tgt_vec, tgt_mask=tgt_mask)
        return self.fc(output)