In [2]:
import torch, torchdata, torchtext
import torch.nn as nn
import torch.nn.functional as F

import random, math, time

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed every RNG this notebook imports so runs stay comparable after a kernel restart
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Only the last bare expression of a cell is echoed, so the torch version has to be
# printed explicitly; the torchtext version is still shown as the cell's result.
print("torch:", torch.__version__)
torchtext.__version__

cpu


'0.16.2+cpu'

1. ETL: Loading the dataset

In [8]:
from datasets import load_dataset

# OPUS-100, English-Sinhala configuration (translation pairs)
dataset = load_dataset(path="opus100", name="en-si")

print(dataset)


Generating test split: 100%|██████████| 2000/2000 [00:00<00:00, 25017.10 examples/s]
Generating train split: 100%|██████████| 979109/979109 [00:00<00:00, 2446920.17 examples/s]
Generating validation split: 100%|██████████| 2000/2000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 979109
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})





In [15]:
# credits for the dataset 
# Dataset Source: OPUS-100 (https://huggingface.co/datasets/opus100)
# Configuration: English-Sinhala (en-si)
from datasets import load_dataset_builder

# Fetch only the dataset metadata (features, split sizes, checksums) for attribution
builder = load_dataset_builder("opus100", "en-si")
dataset_info = builder.info
print(dataset_info)

DatasetInfo(description='', citation='', homepage='', license='', features={'translation': Translation(languages=['en', 'si'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='opus100', config_name='en-si', version=0.0.0, splits={'test': SplitInfo(name='test', num_bytes=271735, num_examples=2000, shard_lengths=None, dataset_name='opus100'), 'train': SplitInfo(name='train', num_bytes=114950891, num_examples=979109, shard_lengths=None, dataset_name='opus100'), 'validation': SplitInfo(name='validation', num_bytes=271236, num_examples=2000, shard_lengths=None, dataset_name='opus100')}, download_checksums={'hf://datasets/opus100@805090dc28bf78897da9641cdf08b61287580df9/en-si/test-00000-of-00001.parquet': {'num_bytes': 154795, 'checksum': None}, 'hf://datasets/opus100@805090dc28bf78897da9641cdf08b61287580df9/en-si/train-00000-of-00001.parquet': {'num_bytes': 65815918, 'checksum': None}, 'hf://datasets/opus100@805090dc28bf78897da9641cdf08b61287580df9

2. EDA - simple investigation

In [19]:
# Inspect a single English/Sinhala pair from the training split
example_pair = dataset['train'][333]['translation']
print(example_pair)


{'en': 'Okay.', 'si': 'හරි, ඔයාලා වැඩකරගෙන යන මොකක් වුණත්...'}


In [23]:
# Double-check the number of rows in each split (test, train, validation)
test_size, train_size, validation_size = (
    dataset[split].num_rows for split in ("test", "train", "validation")
)
print(test_size, train_size, validation_size, sep="\n")

2000
979109
2000


3. Preprocessing

Tokenizing

In [28]:
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# (Re)load the OPUS-100 English-Sinhala corpus so this cell can run on its own
dataset = load_dataset("opus100", "en-si")

# Language codes: English is the source side, Sinhala the target side
SRC_LANGUAGE, TRG_LANGUAGE = "en", "si"

# Tokenizers keyed by language code; English goes through spaCy's small model,
# the Sinhala entry is registered once its tokenizer function is defined.
token_transform = {
    SRC_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
}

def sinhala_tokenizer(text):
    """Split Sinhala ``text`` into tokens using Indic NLP's trivial tokenizer."""
    tokens = indic_tokenize.trivial_tokenize(text, lang='si')
    return tokens


# Register the Sinhala tokenizer next to the English one
token_transform[TRG_LANGUAGE] = sinhala_tokenizer

# Function to yield tokenized sentences from training data
def yield_tokens(data, language):
    """Lazily tokenize the ``language`` side of every sample in ``data``.

    Args:
        data: iterable of samples shaped like ``{"translation": {lang: sentence}}``.
        language: language code selecting both the sentence and its tokenizer.

    Yields:
        The token list produced by ``token_transform[language]`` for each sample.
    """
    for record in data:
        tokenize = token_transform[language]
        sentence = record["translation"][language]
        yield tokenize(sentence)

# Reserved vocabulary entries; each *_IDX constant is the position of its symbol in the list
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = range(len(special_symbols))



Text to integers (Numericalization)

In [29]:
# One vocabulary per language, fitted on the training split only.
# min_freq=2 is the minimum count a token needs to be kept; special symbols take the first ids.
vocab_transform = {}
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    token_stream = yield_tokens(dataset["train"], ln)
    vocab_transform[ln] = build_vocab_from_iterator(
        token_stream, min_freq=2, specials=special_symbols, special_first=True
    )

# Anything outside the vocabulary falls back to the <unk> id
for ln in vocab_transform:
    vocab_transform[ln].set_default_index(UNK_IDX)

# Walk one training pair through tokenization and numericalization, source side first
sample_train = dataset["train"][300]["translation"]
for heading, lang in (("English Sentence: ", SRC_LANGUAGE), ("Sinhala Sentence: ", TRG_LANGUAGE)):
    sentence = sample_train[lang]
    tokens = token_transform[lang](sentence)
    print(heading, sentence)
    print("Tokenized: ", tokens)
    print("Numericalized: ", vocab_transform[lang](tokens))


English Sentence:  -Okay.
Tokenized:  ['-Okay', '.']
Numericalized:  [5199, 4]
Sinhala Sentence:  -හරි.ඔයා දැන් යන්න ඕනේ...
Tokenized:  ['-', 'හරි', '.', 'ඔයා', 'දැන්', 'යන්න', 'ඕනේ', '.', '.', '.']
Numericalized:  [7, 23, 4, 9, 30, 26, 47, 4, 4, 4]


In [30]:
# Numericalize a hand-made token list; 'unknownword' should fall back to the <unk> id
probe_tokens = ['here', 'is', 'a', 'unknownword', 'a']
vocab_transform[SRC_LANGUAGE](probe_tokens)

[45, 18, 13, 0, 13]

In [41]:

# Reverse lookup: index 45 is the id the English vocabulary returned for 'here'.
# lookup_token reads that single entry instead of materializing the whole
# index-to-token list with get_itos() just to index into it once.
mapping = vocab_transform[SRC_LANGUAGE].lookup_token(45)

print(mapping)

here


In [42]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Number of samples per mini-batch; passed as batch_size to every DataLoader below
BATCH_SIZE = 64

# Compose several single-argument transforms into one callable
def sequential_transforms(*transforms):
    """Return a function that feeds its input through ``transforms`` left to right.

    Each transform receives the previous transform's output; with no transforms
    the returned function hands its input back unchanged.
    """
    def func(txt_input):
        value = txt_input
        for apply_step in transforms:
            value = apply_step(value)
        return value
    return func

# Function to add SOS/EOS tokens and convert to tensor
def tensor_transform(token_ids):
    """Wrap ``token_ids`` in SOS/EOS markers and return them as a 1-D int64 tensor.

    Args:
        token_ids: sequence of integer vocabulary ids (may be empty).

    Returns:
        Tensor ``[SOS_IDX, *token_ids, EOS_IDX]`` with dtype ``torch.long``.
    """
    # The dtype is pinned explicitly: torch.tensor([]) is inferred as float32, and
    # torch.cat would then promote the whole result to float for an empty sentence,
    # which is not a valid index tensor.
    return torch.cat((torch.tensor([SOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipeline: raw sentence -> tokens -> vocabulary ids -> tensor wrapped in SOS/EOS
text_transform = {
    ln: sequential_transforms(token_transform[ln], vocab_transform[ln], tensor_transform)
    for ln in (SRC_LANGUAGE, TRG_LANGUAGE)
}

# Collate raw dataset samples into padded batch tensors
def collate_batch(batch):
    """Convert a list of translation samples into padded, batch-first tensors.

    Each sample's source and target sentences lose trailing newlines and are
    passed through the matching ``text_transform`` pipeline.

    Args:
        batch: list of samples shaped like ``{"translation": {lang: sentence}}``.

    Returns:
        A tuple ``(src, src_lengths, trg)``: the source tensors padded with
        ``PAD_IDX``, an int64 tensor holding each source tensor's length before
        padding, and the target tensors padded with ``PAD_IDX``.
    """
    src_tensors, trg_tensors = [], []

    for sample in batch:
        pair = sample["translation"]
        src_sentence = pair[SRC_LANGUAGE].rstrip("\n")
        trg_sentence = pair[TRG_LANGUAGE].rstrip("\n")

        src_tensors.append(text_transform[SRC_LANGUAGE](src_sentence))
        trg_tensors.append(text_transform[TRG_LANGUAGE](trg_sentence))

    padded_src = pad_sequence(src_tensors, padding_value=PAD_IDX, batch_first=True)
    padded_trg = pad_sequence(trg_tensors, padding_value=PAD_IDX, batch_first=True)

    # Lengths are taken from the unpadded tensors, one entry per sample
    src_lengths = torch.tensor([ids.size(0) for ids in src_tensors], dtype=torch.int64)

    return padded_src, src_lengths, padded_trg




In [43]:
# One DataLoader per split; only the training split is shuffled
train_loader, valid_loader, test_loader = (
    DataLoader(
        dataset[split],
        batch_size=BATCH_SIZE,
        shuffle=(split == "train"),
        collate_fn=collate_batch,
    )
    for split in ("train", "validation", "test")
)



In [44]:
# Pull a single training batch to sanity-check the tensor shapes
en_batch, _, si_batch = next(iter(train_loader))
print("English shape: ", en_batch.shape)  # (batch_size, seq_len)
print("Sinhala shape: ", si_batch.shape)  # (batch_size, seq_len)

English shape:  torch.Size([64, 33])
Sinhala shape:  torch.Size([64, 26])
