<a href="https://colab.research.google.com/github/BraedenTd/CMPS385SpringSemester/blob/main/FinalProjectTranslator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
!pip install numpy



In [78]:
pip install torch




In [97]:
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np # Import numpy

# Hyperparameters
EMBED_SIZE = 32
NUM_HEADS = 2
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
FFN_HIDDEN = 64
VOCAB_SIZE_EN = 100  # Adjust based on tokenizer
VOCAB_SIZE_ES = 100
MAX_LEN = 30
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TinyTransformer(nn.Module):
    def __init__(self, vocab_size_en, vocab_size_es, embed_size, num_heads, hidden_dim):
        super().__init__()
        self.encoder_embedding = nn.Embedding(vocab_size_en, embed_size)
        self.decoder_embedding = nn.Embedding(vocab_size_es, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(MAX_LEN, embed_size))

        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim)
        decoder_layer = nn.TransformerDecoderLayer(embed_size, num_heads, hidden_dim)

        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=NUM_ENCODER_LAYERS)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=NUM_DECODER_LAYERS)

        self.fc_out = nn.Linear(embed_size, vocab_size_es)

    def forward(self, src, tgt):
        src_seq_len = src.size(0)
        tgt_seq_len = tgt.size(0)

        src_embed = self.encoder_embedding(src) + self.positional_encoding[:src_seq_len]
        tgt_embed = self.decoder_embedding(tgt) + self.positional_encoding[:tgt_seq_len]

        memory = self.encoder(src_embed)
        output = self.decoder(tgt_embed, memory)

        return self.fc_out(output)

In [80]:
pip install torch torchtext sentencepiece




In [98]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import sentencepiece as spm
import os

# Use torchtext tokenizers (word-based for simplicity here)
tokenizer_en = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_es = get_tokenizer("spacy", language="es_core_news_sm")

# Train subword tokenizer using SentencePiece (in real use, load pre-trained or use full dataset)
def train_sentencepiece(corpus_path, model_prefix, vocab_size=1000):
    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        bos_id=1, eos_id=2, pad_id=0,
        unk_id=3,
        model_type='bpe'
    )

# Simulate corpus files for demo
with open("corpus.en", "w") as f:
    f.write("hello world\nhello\nworld")
with open("corpus.es", "w") as f:
    f.write("hola mundo\nhola\nmundo")

train_sentencepiece("corpus.en", "spm_en", vocab_size=32)
train_sentencepiece("corpus.es", "spm_es", vocab_size=32)

# Load trained SentencePiece models
sp_en = spm.SentencePieceProcessor(model_file="spm_en.model")
sp_es = spm.SentencePieceProcessor(model_file="spm_es.model")

# Tokenize + numericalize
def tokenize_en(text):
    return [1] + sp_en.encode(text, out_type=int) + [2]  # <sos> and <eos>

def tokenize_es(text):
    return [1] + sp_es.encode(text, out_type=int) + [2]

def detokenize_es(tokens):
    return sp_es.decode(tokens[1:-1])  # Remove <sos> and <eos>


In [99]:
import random
from torch.utils.data import Dataset, DataLoader

# Dummy paired sentences (for real use, load from dataset)
dummy_data = [
    ("hello world", "hola mundo"),
    ("hello", "hola"),
    ("world", "mundo")
]

# Dummy Spanish tokenizer
word2idx_es = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "hola": 3, "mundo": 4}
idx2word_es = {v: k for k, v in word2idx_es.items()}


def tokenize_es(sentence):
    return [1] + [word2idx_es.get(w, 0) for w in sentence.lower().split()] + [2]  # <sos> ... <eos>


class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_text, tgt_text = self.data[idx]
        src_tokens = tokenize_en(src_text)
        tgt_tokens = tokenize_es(tgt_text)
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)


def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_lens = [len(x) for x in src_batch]
    tgt_lens = [len(x) for x in tgt_batch]
    src_padded = nn.utils.rnn.pad_sequence(src_batch, padding_value=0)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=0)
    return src_padded, tgt_padded


# DataLoader
dataset = TranslationDataset(dummy_data)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)


In [100]:
def train(model, dataloader, optimizer, criterion, epochs=20):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for src, tgt in dataloader:
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)
            tgt_input = tgt[:-1, :]
            tgt_target = tgt[1:, :]

            optimizer.zero_grad()
            output = model(src, tgt_input)
            output = output.reshape(-1, output.shape[-1])
            tgt_target = tgt_target.reshape(-1)

            loss = criterion(output, tgt_target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")


In [None]:
model = TinyTransformer(VOCAB_SIZE_EN, VOCAB_SIZE_ES, EMBED_SIZE, NUM_HEADS, FFN_HIDDEN).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

train(model, loader, optimizer, criterion, epochs=50)


In [107]:
print(translate(model, "hello world"))  # Should now (hopefully) output: "hola mundo"


RuntimeError: shape '[4, 2, 16]' is invalid for input of size 512