In [1]:
import torch, torchdata, torchtext
import torch.nn as nn
import torch.nn.functional as F

import random, math, time

from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from datasets import load_dataset, DatasetDict
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Make our work comparable after a kernel restart: seed every RNG imported above.
SEED = 1234
random.seed(SEED)  # Python's own RNG; it was imported but previously left unseeded
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Only the final bare expression of a cell is echoed as its output, so the
# torchtext version is the one displayed below.
torch.__version__
torchtext.__version__

cpu


'0.16.2+cpu'

In [None]:
# 1. Data Loading
# Fetch the complete "en-si" configuration of the "opus100" dataset
dataset = load_dataset("opus100", "en-si")
print("Original size of dataset: ", dataset)

# Keep only a small slice so the assignment can be finished and submitted:
# the first 10,000 training pairs plus the first 2,000 test and validation
# pairs (each evaluation split is 0.2 of the training size).
small_train_dataset, small_test_dataset, small_validation_dataset = (
    dataset[split].select(range(count))
    for split, count in (("train", 10000), ("test", 2000), ("validation", 2000))
)

# Replace the full corpus with the reduced splits (same key order as before)
dataset = DatasetDict({
    "train": small_train_dataset,
    "test": small_test_dataset,
    "validation": small_validation_dataset,
})

# Confirm the reduced split sizes
print(dataset)

In [None]:
# 2. Preprocessing: Tokenization and Numericalization
# Language codes; they double as keys into each sample's "translation" dict
SRC_LANGUAGE, TRG_LANGUAGE = "en", "si"

# Registry of per-language tokenizers; the source side uses the spaCy
# 'en_core_web_sm' pipeline obtained through torchtext's get_tokenizer
token_transform = {
    SRC_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
}

def sinhala_tokenizer(text):
    """Tokenize ``text`` with Indic NLP's trivial tokenizer, using language code 'si'.

    Returns whatever ``indic_tokenize.trivial_tokenize`` produces for the input.
    """
    tokens = indic_tokenize.trivial_tokenize(text, lang='si')
    return tokens

# Register the Sinhala tokenizer for the target language
token_transform.update({TRG_LANGUAGE: sinhala_tokenizer})

# Function to yield tokenized sentences from training data
def yield_tokens(data, language):
    """Lazily produce one tokenized sentence per sample in ``data``.

    Each sample is read at ``sample["translation"][language]`` and passed
    through the tokenizer registered for ``language`` in ``token_transform``.
    """
    yield from (
        token_transform[language](sample["translation"][language])
        for sample in data
    )

# Special tokens: the index constants follow the order of this list
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = range(len(special_symbols))

# One vocabulary per language, built from the training split only
# (min_freq=2, special symbols placed first)
vocab_transform = {
    ln: build_vocab_from_iterator(
        yield_tokens(dataset["train"], ln),
        specials=special_symbols,
        special_first=True,
        min_freq=2,
    )
    for ln in (SRC_LANGUAGE, TRG_LANGUAGE)
}

# Unknown words fall back to the <unk> index
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    vocab_transform[ln].set_default_index(UNK_IDX)

# Quick look at the Sinhala tokenizer on one training sentence
sample_si = dataset["train"][10]["translation"]["si"]
print("Raw Sinhala:", sample_si)
print("Tokenized:", sinhala_tokenizer(sample_si))

# Persist each vocabulary to its own file
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    torch.save(vocab_transform[ln], f'vocab_{ln}.pt')
    print(f"Saved {ln} vocabulary to 'vocab_{ln}.pt'")


# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose ``transforms`` into a single callable applied left to right.

    The returned function feeds its argument to the first transform, passes
    each result on to the next one, and returns the final value. With no
    transforms it returns its argument unchanged.
    """
    def apply_pipeline(txt_input):
        current = txt_input
        for stage in transforms:
            current = stage(current)
        return current
    return apply_pipeline

# function to add SOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    """Wrap a sequence of token ids with SOS/EOS markers as a 1-D integer tensor.

    Args:
        token_ids: list of integer vocabulary indices (may be empty).

    Returns:
        A ``torch.long`` tensor laid out as ``[SOS_IDX, *token_ids, EOS_IDX]``.
    """
    # The dtype is pinned on every piece: torch.tensor([]) defaults to float32,
    # so an empty sentence could otherwise turn the concatenated result into a
    # float tensor instead of integer indices.
    return torch.cat((torch.tensor([SOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipelines that convert a raw string into a tensor of token indices
text_transform = {
    ln: sequential_transforms(
        token_transform[ln],  # Tokenization
        vocab_transform[ln],  # Numericalization
        tensor_transform,     # Add SOS/EOS and create tensor
    )
    for ln in (SRC_LANGUAGE, TRG_LANGUAGE)
}


# function to collate data samples into batch tensors
def collate_batch(batch):
    """Turn a list of translation samples into padded source/target tensors.

    Each sample's source and target strings are read from its "translation"
    entry, stripped of trailing newlines, and encoded with ``text_transform``.

    Returns:
        A tuple ``(src, src_lengths, trg)`` where ``src`` and ``trg`` are
        batch-first tensors padded with ``PAD_IDX`` and ``src_lengths`` is an
        int64 tensor holding each encoded source length before padding.
    """
    src_tensors, trg_tensors = [], []

    for example in batch:
        pair = example["translation"]
        src_text, trg_text = pair[SRC_LANGUAGE], pair[TRG_LANGUAGE]

        src_tensors.append(text_transform[SRC_LANGUAGE](src_text.rstrip("\n")))
        trg_tensors.append(text_transform[TRG_LANGUAGE](trg_text.rstrip("\n")))

    # Lengths are taken from the unpadded source tensors
    src_lengths = [encoded.size(0) for encoded in src_tensors]

    padded_src = pad_sequence(src_tensors, batch_first=True, padding_value=PAD_IDX)
    padded_trg = pad_sequence(trg_tensors, batch_first=True, padding_value=PAD_IDX)

    return padded_src, torch.tensor(src_lengths, dtype=torch.int64), padded_trg



# Create data loaders: one per split, all batched through collate_batch;
# only the training loader shuffles its samples
BATCH_SIZE = 32
train_loader, valid_loader, test_loader = (
    DataLoader(
        dataset[split],
        batch_size=BATCH_SIZE,
        shuffle=(split == "train"),
        collate_fn=collate_batch,
    )
    for split in ("train", "validation", "test")
)
# Number of batches in the training loader
train_loader_length = len(train_loader)