In [1]:
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
import pandas as pd 
from datasets import load_dataset
from numpy.random import default_rng
# import re
# import chardet
# import datasets
# import gc
# import os

import random, math, time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# make our work comparable if restarted the kernel
SEED = 122
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
# versions
torch.__version__, torchtext.__version__, pd.__version__, np.__version__

('2.3.0', '0.18.0', '2.2.2', '1.24.3')

# ETL: Loading the dataset

For this assignment, the language I chose is Nepali. And the dataset is taken from HuggingFace: https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ne

In [5]:
dataset = load_dataset("opus100", "en-ne")

In [6]:
# information about the dataset
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 406381
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [7]:
# preview
dataset['train'][8]

{'translation': {'en': 'S_ubject:', 'ne': 'विषय:'}}

# EDA - simple investigation

In [8]:
dataset['train']

Dataset({
    features: ['translation'],
    num_rows: 406381
})

In [9]:
# looking at training example
preview = next(iter(dataset['train']))
preview

{'translation': {'en': '_Inv', 'ne': 'Inv'}}

In [10]:
train_size = len(list(iter(dataset['train'])))
train_size

406381

In [11]:
dataset['train'][0]['translation']['en']

'_Inv'

I feel this object has too deeply nested structure. Reducing the dataset size and changing the structure for efficiency and code readability.

In [12]:
# random numnber generator
rand = default_rng(SEED)

# random index from the dataset
# reducing the size of the dataset to 200k
random_index = rand.choice(len(dataset['train']), 200000, replace=False)

# dataset['train'] = dataset['train'].filter(lambda data, index: index in random_index, with_indices=True)


In [13]:
dataset['train'] = dataset['train'].select(random_index)

In [14]:
# Extracts translation col for specific language
# eg: data['translation']['ne']
new_col_gen = lambda data, lang: {lang: data['translation'][lang]}

# New column for Nepali translation
dataset = dataset.map(new_col_gen, fn_kwargs={'lang': "ne"})

# New column for English translation
# Deleting the translation column
dataset = dataset.map(new_col_gen, remove_columns=['translation'], fn_kwargs={'lang': "en"})

In [15]:
# Rechecking the dataset
sample = next(iter(dataset['train']))
sample

{'ne': 'यस पत्रचार सूचीहरूको लागि सन्देशहरू फिल्टर गर्न एक नियम सिर्जना गर्नुहोस्',
 'en': 'Create a rule to filter messages to this mailing list'}

In [16]:
train_size = len(list(iter(dataset['train'])))
train_size

200000

In [17]:
dataset # easier to work with

DatasetDict({
    test: Dataset({
        features: ['ne', 'en'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['ne', 'en'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['ne', 'en'],
        num_rows: 2000
    })
})

# Preprocessing

## Tokenizing

**Note**: the models must first be downloaded using the following on the command line: 
```
python3 -m spacy download en_core_web_sm
```

First, since we have two languages, let's create some constants to represent that.  Also, let's create two dicts: one for holding our tokenizers and one for holding all the vocabs with assigned numbers for each unique word

In [18]:
# Place-holders
token_transform = {}
vocab_transform = {}

SRC_LANG= 'en'
TARG_LANG = 'ne'

In [19]:
from torchtext.data.utils import get_tokenizer
from nepalitokenizers import WordPiece



<div style="background-color: #d9edf7; border: 1px solid #bce8f1; padding: 10px; border-radius: 5px; color: black;">
    <strong>INFO:</strong> NEED TO WRITE BOUT WORDPIECE
</div>

In [20]:
token_transform["en"] = get_tokenizer('spacy', language='en_core_web_sm')
token_transform["ne"] = WordPiece()

In [21]:
print("Sentence: ", dataset['train']['ne'][2])
print("Tokenization: ", token_transform['ne'].encode(dataset['train']['ne'][2]).tokens)

Sentence:  फेस ब्राउजर सँग सादा
Tokenization:  ['[CLS]', 'फेस', 'ब्रा', '##उ', '##जर', 'सँग', 'सादा', '[SEP]']


In [22]:
def get_data_token(data, lang):
    '''Tokenize data for a given language'''
    if lang == 'ne':
        return {lang: token_transform[lang].encode(data[lang].lower()).tokens}
    else:
        return {lang: token_transform[lang](data[lang].lower())}
    # try:
    #     # Tokenize the data using the specified language tokenizer
    #     return {lang: token_transform[lang](data[lang].lower())}
    # except:
    #     # If an exception occurs, use WordPiece tokenizer for Nepali language
    #     return {lang: token_transform[lang].encode(data[lang].lower()).tokens}

In [23]:
tokenized_dataset = dataset.map(get_data_token, remove_columns=[SRC_LANG], fn_kwargs={'lang': SRC_LANG})

tokenized_dataset = tokenized_dataset.map(get_data_token, remove_columns=[TARG_LANG], fn_kwargs={'lang': TARG_LANG})

Map: 100%|██████████| 2000/2000 [00:00<00:00, 12975.90 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 12632.65 examples/s]
Map: 100%|██████████| 200000/200000 [00:12<00:00, 15394.38 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 12585.13 examples/s]


In [24]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

## Text to integers (Numericalization)

Next we gonna create function (torchtext called vocabs) that turn these tokens into integers.  Here we use built in factory function <code>build_vocab_from_iterator</code> which accepts iterator that yield list or iterator of tokens.

In [25]:
from torchtext.vocab import build_vocab_from_iterator

for ln in [SRC_LANG, TARG_LANG]:
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(tokenized_dataset['train'][ln],
                                                    min_freq=2,   #if not, everything will be treated as UNK
                                                    specials=special_symbols,
                                                    special_first=True) #indicates whether to insert symbols at the beginning or at the end
# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANG, TARG_LANG]:
    vocab_transform[ln].set_default_index(UNK_IDX)



In [26]:
# saveing vocab
torch.save(vocab_transform, 'model/vocab')

In [27]:
#see some example
vocab_transform[SRC_LANG](['here', 'is', 'a', 'unknownword', 'a'])

[234, 20, 11, 0, 11]

In [28]:
#we can reverse it....
mapping = vocab_transform[SRC_LANG].get_itos()

#print 1816, for example
mapping[1891]

'average'

In [29]:
#let's try unknown vocab
mapping[0]
#they will all map to <unk> which has 0 as integer

'<unk>'

In [30]:
#let's try special symbols
mapping[1], mapping[2], mapping[3]

('<pad>', '<sos>', '<eos>')

In [31]:
#check unique vocabularies
len(mapping)

17142

<div style="background-color: #d9edf7; border: 1px solid #bce8f1; padding: 10px; border-radius: 5px; color: black;">
    <strong>INFO:</strong> NEED TO WRITE
</div>

# Preparing the dataloader

In [32]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# Define a helper function to combine sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        # Iterate over the provided transformations
        for transform in transforms:
            try:
                # Apply the transformation to the text input
                txt_input = transform(txt_input)
            except:
                # If an exception occurs (e.g., if the transform is an encoding operation), catch it
                txt_input = transform.encode(txt_input).tokens
        # Return the transformed text input
        return txt_input
    # Return the function that applies sequential transformations
    return func


# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    return torch.cat((torch.tensor([SOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and trg language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANG, TARG_LANG]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_batch(batch):
    src_batch, src_len_batch, trg_batch = [], [], []
    for src_sample, trg_sample in batch:
        processed_text = text_transform[SRC_LANG](src_sample.rstrip("\n"))
        src_batch.append(processed_text)
        trg_batch.append(text_transform[TARG_LANG](trg_sample.rstrip("\n")))
        src_len_batch.append(processed_text.size(0))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first = True) #<----need this because we use linear layers mostly
    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX, batch_first = True)
    return src_batch, torch.tensor(src_len_batch, dtype=torch.int64), trg_batch

Create train, val, and test dataloaders

In [33]:
# Set the batch size for data loaders
batch_size = 64

# Creating DataLoader for the training, validation, test set
train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(dataset['validation'], batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [34]:
for ne, _, en in train_loader:
    break

In [35]:
print("English shape: ", en.shape)  # (batch_size, seq len)
print("Nepali shape: ", ne.shape)   # (batch_size, seq len)

English shape:  torch.Size([64, 5])
Nepali shape:  torch.Size([64, 3])


# Designing the model

## Encoder Layer

In [36]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, attn_variant, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm        = nn.LayerNorm(hid_dim)
        self.self_attention       = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.feedforward          = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout              = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len]   #if the token is padding, it will be 1, otherwise 0
        _src, _ = self.self_attention(src, src, src, src_mask)
        src     = self.self_attn_layer_norm(src + self.dropout(_src))
        #src: [batch_size, src len, hid dim]

        _src    = self.feedforward(src)
        src     = self.ff_layer_norm(src + self.dropout(_src))
        #src: [batch_size, src len, hid dim]

        return src

## Encoder

In [37]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length = 100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers        = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device)
                                           for _ in range(n_layers)])
        self.dropout       = nn.Dropout(dropout)
        self.scale         = torch.sqrt(torch.FloatTensor([hid_dim])).to(self.device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len    = src.shape[1]
        
        pos        = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        #pos: [batch_size, src_len]
        
        src        = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        #src: [batch_size, src_len, hid_dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
        #src: [batch_size, src_len, hid_dim]
        
        return src

# Training