### Dislaimer
This notebook requires knowledge in:
* Python
* Neural Networks
* Pytorch Datasets and Modules
* Machine Learning Process Understanding

### Word Embeddings Example
The goal of this notebook is to have a hands-on experience of words embeddings.\
We will do the following:
* Load a set of Arabic text as trigrams
* Build a simple neural network
* Train the network to **predict masked words**
* Use the network weights as embeddings

#### Build Dataset

To train an embedding, we will use the dataset we used in the previous notebook

In the previous notebook we added the token `<UNKOWN>` to be a surregate for any unkown word\
that might be entered to the model during testing or deployment. For training by masking we will\
add two more tokens `<PAD>` and `<MASK>`.

Unlike in the next word prediction task, masked word prediction\
task take a sequence of tokens of a specific length `max_length`. Some squences will be shorter\
than the maximum length, hence the usage of `<PAD>` to fill in the remaining slots in the\
input sequence. Some of these tokens will be masked and the the model will have to predict\
them, hence the usage of `<MASK>`.

During the dataset initlization we will load the entire dataset in `alaraby1k.json` and then\
generate sequnces based on sentences in the data set. We will split the text by `[! ? ، .]`\
and consider each sentence a sequence. If the sequence is longer than `max_length` we wil discard it.\
We set `max_length` to 100.

In [1]:
from torch.utils.data import Dataset
from torch import as_tensor
from collections import defaultdict
from sklearn.model_selection import train_test_split
import json, re, math
import numpy as np

class MyDataset(Dataset):

    def __init__(self, alaraby_filepath, max_length, is_train):
        self.raw_data = [article["text"] for article in json.load(open(alaraby_filepath, "r"))]
        self.train_raw_data, self.test_raw_data = train_test_split(self.raw_data, test_size=0.1, random_state=42)
        
        self.max_length = max_length
        self.SPECIAL_TOKENS_IDS = {"<UNKOWN>": 0, "<PAD>": 1, "<MASK>": 2}
        self.SPECIAL_IDS_TOKENS = {0: "<UNKOWN>", 1: "<PAD>", 2: "<MASK>"}

        self.train_sequences =  self.__generate_seq__(self.train_raw_data, self.max_length)
        self.test_sequences =  self.__generate_seq__(self.test_raw_data, self.max_length)
        self.vocab, self.id_to_word, self.word_to_id = self.__compute_vocab__(self.train_raw_data)


        self.is_train = is_train
    
    def __generate_seq__(self, texts, max_length):
        sentences = [re.split('\.|،|!|؟', text) for text in texts]
        sequences = [s for sequences in sentences for s in sequences]
        sequences = [s.split() for s in sequences]
        sequences = [s for s in sequences if len(s) <= max_length and len(s)>0]
        return sequences
    
    def __compute_vocab__(self, texts):
        words = set()
        for text in texts:
            words.update(set(text.split()))
        words_list = list(self.SPECIAL_TOKENS_IDS.keys()) + list(words)
        id_to_word = defaultdict(lambda: "<UNKNOWN>", {idx: value for idx, value in enumerate(words_list)})
        word_to_id = defaultdict(lambda: 0, {value: idx for idx, value in enumerate(words_list)})
        return words, id_to_word, word_to_id

    def __len__(self):
        return len(self.train_sequences) if self.is_train else len(self.test_sequences)

    def __getitem__(self, idx, mask_prob=15):
        mask_prob = 0.15
        sequence = self.train_sequences[idx]

        masked_indicies = np.random.choice(len(sequence), math.ceil(len(sequence)*mask_prob))
        mask = [i in masked_indicies for i,_ in enumerate(sequence)]
        masked_sequence = list(np.where(mask, "<MASK>", sequence))
        masked_sequence = ["<PAD>"]*(self.max_length - len(masked_sequence)) + masked_sequence
        masked_sequence = [ self.word_to_id[token] for token in masked_sequence]

        target = list(np.where(np.invert(mask), "<PAD>", sequence))
        target = ["<PAD>"]*(self.max_length - len(target)) + target
        target = [ self.word_to_id[token] for token in target]

        return as_tensor(masked_sequence), as_tensor(target)
    
    def get_word_from_id(self, idx):
        return self.id_to_word[idx]
    
    def get_word_id(self, word):
        return self.word_to_id[word]
    
    def get_vocab_size(self):
        return len(self.vocab) 

In [2]:
max_length = 100
train_dataset = MyDataset("../Dataset/alaraby1k.json", max_length, is_train= True)
test_dataset = MyDataset("../Dataset/alaraby1k.json", max_length, is_train= False)

In [3]:
print(f"Id of unkown word: {train_dataset.get_word_id('<UNKNOWN>')}")
print(f"ًWord of unkown id: {train_dataset.get_word_from_id(-1)}")
print(f"Vocab size: {train_dataset.get_vocab_size()}")

item = 0
seq = train_dataset.__getitem__(item)
print(f"Item {item} in ids: {str(seq)}")
print(f"Item {item} in words: [{str([train_dataset.get_word_from_id(t) for t in seq])}")

item = 33
seq = train_dataset.__getitem__(item)
print(f"Item {item} in ids: {str(seq)}")
print(f"Item {item} in words: [{str([train_dataset.get_word_from_id(t) for t in seq])}")

item = 1200
seq = train_dataset.__getitem__(item)
print(f"Item {item} in ids: {str(seq)}")
print(f"Item {item} in words: [{str([train_dataset.get_word_from_id(t) for t in seq])}")

Id of unkown word: 0
ًWord of unkown id: <UNKNOWN>
Vocab size: 58912
Item 0 in ids: (tensor([    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1, 25891,     2, 36154, 21778, 52419,
        27231, 21778,     2, 39289, 14038,  1474, 54317, 55068, 36243, 57838,
        34624, 40957, 30341, 36154,  5164, 36482, 40957, 46165, 32343, 44012,
        17995,  9434, 58731,     2, 17162,     2, 36154,  5955,     2, 36443,
        57299, 36819, 10247, 17162, 52138, 40957, 43168, 44585,     2, 52138]), tensor([    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1

#### Build Neural Network

The network task is framed as a straightforward classification task. Consider the `max_length`\
of the input sequence. The input size is `max_length x 1`. Where one correspondes to the token\
index. We are predicting masked words so the output has the same size as the input. We need to\
predict the same sequence arn't we. The logits size will be also `max_length x vocab_size` then.\
We will then compute pytorch `cross_entropy` loss. Which will map the logits into the classification\
target `max_length x 1`.

However, asking the model to predict the same output except for some masked words is reduntand.\
We will be computing loss over output tokens that we already know. To solve the problem \
we will do the following. We will compute the `cross_entropy` loss with a an altered verions\
of the input. Consider the following example:


Input  | Being | strong | is       | all | what      | matters | `<PAD>`
--     |    -- |     -- |       -- |  -- |        -- |      -- | --
Masked | Being | strong | `<MASK>` | all | `<MASK>`  | matters | `<PAD>`
Embed  | 22    | 433    | 1        | 333 | 1         | 331     | 2
Target | 2     | 2      | 190      | 2   | 355       | 2       | 2

As you can see we mapped all non-masked words in the target vecotr into `<PAD>`.\
And we keep the class indices of masked words. We then ignore the class index of\
`<PAD>` when computing the loss. By doing this we compute the loss and optimize\
the model weights based only on the masked words predictions. We can achive this\
behaviour by using the `ignore-index` parameter of [Pytorch](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) `CrossEntropyLoss`

In [33]:
import torch
import torch.nn as nn

class MaskedWordPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(MaskedWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(max_length * embedding_dim, vocab_size)
        self.vocab_size = vocab_size
        
    def forward(self, x):
        embedded = self.embedding(x) # (batch_size, max_length, embedding_size)
        embedded = embedded.reshape(x.size()[0], -1) # (batch_size, max_length * embedding_size)
        output = self.linear(embedded) # (batch_size, max_length * vocab_size)
        output = output.reshape(x.size()[0], self.vocab_size, -1)
        return output

    def embed(self, x):
        return self.embedding(x)

    def word(self, x):
      distance = torch.norm(self.embedding.weight.data - x, dim=1)
      nearest = torch.argmin(distance)
      return nearest

#### Train Embedding

In this code chunk we simply train the network given a number of hyperparameter.

In [None]:
from torch.utils.data import DataLoader
from torch.optim import SGD
import os

device = torch.device("cuda:0" ) if torch.cuda.is_available() else torch.device("cpu" )

batch_size= 1500
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

vocab_size = 60_000
embbeding_dim = 1024
model = MaskedWordPredictor(vocab_size, embbeding_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01)

chkpnt_path = f"masked_word_predictor_{vocab_size}_{embbeding_dim}.chk"
if os.path.exists(chkpnt_path):
  model.load_state_dict(torch.load(chkpnt_path, map_location=device))


progress_path = "/masked_word_predictor_progress.json"
if os.path.exists(progress_path):
  progress = json.load(open(progress_path, "r"))
else:
  progress = {"chkpnt" : 0, "progress" : []}

# Training loop
epochs = 100
for epoch in range(progress["chkpnt"], epochs):
  for i, batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    x1, x2, target = batch
    x1 = x1.to(device)
    x2 = x2.to(device)
    target = target.to(device)
    output = model(x1, x2)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if(i % 10==0):
      print(f"Epoch {epoch } Batch {i}, Loss: {loss.item()}")
  progress["progress"].append({"epoch": epoch, "loss": loss.item()})
  progress["chkpnt"] = epoch   
  json.dump(progress, open(progress_path, "w"))
  torch.save(model.state_dict(), chkpnt_path)