### Disclaimer
This notebook requires knowledge in:
* Python
* Neural Networks
* Pytorch Datasets and Modules
* Machine Learning Process Understanding

### Word Embeddings Example
The goal of this notebook is to get hands-on experience with word embeddings.\
We will do the following:
* Load a set of Arabic text as short sentences (at most `max_length` words each)
* Build a simple neural network
* Train the network to **predict masked words**
* Use word embeddings, positional embeddings, and attention

#### Build Dataset

To train an embedding, we will use the dataset we used in the previous notebook

In [1]:
from torch.utils.data import Dataset
from torch import as_tensor
from collections import defaultdict
from sklearn.model_selection import train_test_split
import json, re, math
import numpy as np

class MyDataset(Dataset):
    """Masked-word prediction dataset over sentences from a JSON article file.

    The file at ``alaraby_filepath`` must contain a JSON list of objects that
    each have a ``"text"`` key.  The texts are split 90/10 in file order (no
    shuffling) into a train and a test part.  Each part is cut into sentences
    on ``.``, ``،``, ``!`` and ``؟`` and tokenised on whitespace; only
    non-empty sentences of at most ``max_length`` tokens are kept.  The
    vocabulary is built from the training texts only.

    Every item is a pair ``(masked_sequence, target)`` of id tensors of length
    ``max_length``.  In ``masked_sequence`` a random subset of tokens is
    replaced by ``<MASK>``; ``target`` holds the original token id at the
    masked positions and the ``<PAD>`` id everywhere else.  Both are padded on
    the right so that position ``i`` of one corresponds to position ``i`` of
    the other.

    Args:
        alaraby_filepath: Path of the JSON file to load.
        max_length: Maximum sentence length in tokens; also the item length.
        is_train: Serve the training sentences if true, else the test ones.
    """

    def __init__(self, alaraby_filepath, max_length, is_train):
        # Explicit UTF-8 (the corpus is Arabic) and a context manager so the
        # file handle is closed deterministically.
        with open(alaraby_filepath, "r", encoding="utf-8") as handle:
            self.raw_data = [article["text"] for article in json.load(handle)]
        self.train_raw_data, self.test_raw_data = train_test_split(
            self.raw_data, test_size=0.1, random_state=42, shuffle=False
        )

        self.max_length = max_length
        # NOTE(review): the special token is spelled "<UNKOWN>" here while the
        # fallback of ``id_to_word`` is "<UNKNOWN>".  Both resolve to id 0, so
        # the spelling is kept as-is to stay compatible with existing users.
        self.SPECIAL_TOKENS_IDS = {"<UNKOWN>": 0, "<PAD>": 1, "<MASK>": 2}
        self.SPECIAL_IDS_TOKENS = {0: "<UNKOWN>", 1: "<PAD>", 2: "<MASK>"}

        self.train_sequences = self.__generate_seq__(self.train_raw_data, self.max_length)
        self.test_sequences = self.__generate_seq__(self.test_raw_data, self.max_length)
        self.vocab, self.id_to_word, self.word_to_id = self.__compute_vocab__(self.train_raw_data)

        self.is_train = is_train

    def __generate_seq__(self, texts, max_length):
        """Split ``texts`` into tokenised sentences of 1..``max_length`` tokens."""
        sequences = []
        for text in texts:
            # Raw string: "\." in a normal string literal is an invalid escape.
            for sentence in re.split(r'\.|،|!|؟', text):
                tokens = sentence.split()
                if 0 < len(tokens) <= max_length:
                    sequences.append(tokens)
        return sequences

    def __compute_vocab__(self, texts):
        """Build the vocabulary and both id mappings from ``texts``.

        Returns:
            ``(words, id_to_word, word_to_id)`` where ``words`` is an
            insertion-ordered dict of the distinct whitespace tokens (special
            tokens excluded) and the two mappings cover special tokens first,
            then the words.  Unknown words map to id 0; unknown ids map to
            ``"<UNKNOWN>"``.
        """
        # NOTE(review): tokens here come from the raw text, so punctuation
        # stays attached to words, whereas __generate_seq__ strips it first.
        # Sentence tokens that only occur next to punctuation therefore fall
        # back to the unknown id.  Left unchanged to keep word ids stable.
        words = {}
        for text in texts:
            for word in text.split():
                words[word] = None
        words_list = list(self.SPECIAL_TOKENS_IDS.keys()) + list(words)
        id_to_word = defaultdict(lambda: "<UNKNOWN>", dict(enumerate(words_list)))
        word_to_id = defaultdict(lambda: 0, {value: idx for idx, value in enumerate(words_list)})
        return words, id_to_word, word_to_id

    def __len__(self):
        return len(self.train_sequences) if self.is_train else len(self.test_sequences)

    def __getitem__(self, idx, mask_prob=15):
        """Return the ``(masked_sequence, target)`` id tensors for item ``idx``.

        Args:
            idx: Index into the split selected by ``is_train``.
            mask_prob: Percentage of the tokens to mask (default 15, i.e.
                15%).  The count is rounded up, so a non-empty sentence always
                gets at least one mask when ``mask_prob`` is positive.
        """
        # Serve the split this instance was created for (matches __len__).
        sequences = self.train_sequences if self.is_train else self.test_sequences
        sequence = sequences[idx]

        n_tokens = len(sequence)
        n_masked = max(0, min(n_tokens, math.ceil(n_tokens * mask_prob / 100)))
        # replace=False: draw distinct positions so exactly n_masked are masked.
        masked_positions = set(np.random.choice(n_tokens, n_masked, replace=False).tolist())

        # Input and target share the same right padding so positions line up.
        padding = ["<PAD>"] * (self.max_length - n_tokens)
        masked_sequence = [
            "<MASK>" if position in masked_positions else token
            for position, token in enumerate(sequence)
        ] + padding
        target = [
            token if position in masked_positions else "<PAD>"
            for position, token in enumerate(sequence)
        ] + padding

        # .get() instead of [] so unknown tokens do not grow the defaultdict.
        masked_sequence = [self.word_to_id.get(token, 0) for token in masked_sequence]
        target = [self.word_to_id.get(token, 0) for token in target]

        return as_tensor(masked_sequence), as_tensor(target)

    def get_word_from_id(self, idx):
        """Return the word for ``idx`` or ``"<UNKNOWN>"`` if the id is not known."""
        return self.id_to_word.get(idx, "<UNKNOWN>")

    def get_word_id(self, word):
        """Return the id of ``word`` or 0 if the word is not in the vocabulary."""
        return self.word_to_id.get(word, 0)

    def get_vocab_size(self):
        """Return the number of ids in use: special tokens plus vocabulary words."""
        return len(self.SPECIAL_TOKENS_IDS) + len(self.vocab)

In [2]:
# Upper bound on sentence length (in tokens) handed to the dataset.
max_length = 10
# Build the two dataset objects from the same file, training instance first;
# they differ only in the `is_train` flag passed to MyDataset.
train_dataset, test_dataset = (
    MyDataset("../Dataset/alaraby1k.json", max_length, is_train=train_flag)
    for train_flag in (True, False)
)

In [None]:
# Quick sanity check of the vocabulary helpers and of one dataset item.
print(f"Id of unknown word: {train_dataset.get_word_id('<UNKNOWN>')}")
print(f"Word of unknown id: {train_dataset.get_word_from_id(-1)}")
print(f"Vocab size: {train_dataset.get_vocab_size()}")

# Clamp the sample index so the demo also runs on datasets that hold fewer
# than 3101 sequences.
item = min(3100, len(train_dataset) - 1)
seq, target = train_dataset[item]
print(f"\nItem {item} seq: ")
# Print the (masked and padded) input sequence one token per line.
for word in (train_dataset.get_word_from_id(t.item()) for t in seq):
    print(word)

#### Build Neural Network

We will use the same network we used in previous notebooks. However, this\
time we add a new layer ``Attention``.

In [5]:
import torch
from torch import nn

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three weight
    matrices of shape ``(input_dim, output_dim)``; the output is the
    attention-weighted sum of the values.

    The projection matrices are ``nn.Parameter`` objects so that they are
    returned by ``parameters()`` (and therefore optimised), follow the module
    in ``.to(device)`` and are stored in ``state_dict()``.  Checkpoints written
    before this change do not contain ``wq``/``wk``/``wv``.

    Args:
        sequence_size: Sequence length; stored but not used in the computation.
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the query/key/value projections and of the output.
    """

    def __init__(self, sequence_size, input_dim, output_dim):
        super(Attention, self).__init__()
        self.sequence_size = sequence_size
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.wq = nn.Parameter(torch.rand(input_dim, output_dim))
        self.wk = nn.Parameter(torch.rand(input_dim, output_dim))
        self.wv = nn.Parameter(torch.rand(input_dim, output_dim))
        # Normalise over the key axis (last dim) so that each query's
        # attention weights sum to one.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor whose last two dimensions are ``(sequence, input_dim)``.

        Returns:
            Tensor with the same leading dimensions and last dimension
            ``output_dim``.
        """
        K = torch.matmul(x, self.wk)
        Q = torch.matmul(x, self.wq)
        V = torch.matmul(x, self.wv)

        # Scale by sqrt of the key width (output_dim) to keep logits bounded.
        similarity_matrix = torch.matmul(Q, K.transpose(-2, -1)) / (self.output_dim ** 0.5)
        similarity_matrix = self.softmax(similarity_matrix)

        return torch.matmul(similarity_matrix, V)

In [4]:
import torch
import torch.nn as nn

class MaskedWordPredictor(nn.Module):
    """Predict a vocabulary distribution for every position of a token sequence.

    Pipeline: word embedding + fixed positional embedding -> attention ->
    flatten -> one linear layer producing ``max_length * vocab_size`` logits,
    reshaped to ``(batch_size, max_length, vocab_size)``.

    The positional embedding is randomly initialised once and never trained.
    It is registered as a buffer so that it moves with the module in
    ``.to(device)`` and is saved in / restored from ``state_dict()``; a
    reloaded model therefore uses the same positions it was trained with.
    Because of that, no custom ``to`` override is needed and the inherited
    ``nn.Module.to`` returns the module, so ``Model(...).to(device)`` can be
    chained.

    Args:
        vocab_size: Number of token ids the embedding and the output cover.
        embedding_dim: Size of the word and positional embeddings.
        max_length: Fixed sequence length the model operates on.
    """

    def __init__(self, vocab_size, embedding_dim, max_length):
        super(MaskedWordPredictor, self).__init__()

        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(max_length * embedding_dim, max_length * vocab_size)
        self.attention = Attention(max_length, embedding_dim, embedding_dim)

        self.register_buffer(
            "positional_embedding",
            self.__generate_positional_embedding__(max_length, embedding_dim),
        )

    def forward(self, x):
        """Return logits of shape ``(batch_size, max_length, vocab_size)``.

        Args:
            x: Integer token ids of shape ``(batch_size, max_length)``.
        """
        batch_size = x.size()[0]
        output = self.embedding(x)  # (batch_size, max_length, embedding_size)
        # The (max_length, embedding_size) buffer broadcasts over the batch.
        output = output + self.positional_embedding
        output = self.attention(output)  # (batch_size, max_length, embedding_size)
        output = output.reshape(batch_size, -1)  # (batch_size, max_length * embedding_size)
        output = self.linear(output)  # (batch_size, max_length * vocab_size)
        output = output.reshape(batch_size, self.max_length, self.vocab_size)  # (batch_size, max_length, vocab_size)
        return output

    def embed(self, x):
        """Return the word embeddings of the token ids ``x``."""
        return self.embedding(x)

    def word(self, x):
        """Return the id whose embedding is closest (L2 distance) to ``x``."""
        distance = torch.norm(self.embedding.weight.data - x, dim=1)
        nearest = torch.argmin(distance)
        return nearest

    def __generate_positional_embedding__(self, max_length, embedding_dim):
        """Create a random, detached ``(max_length, embedding_dim)`` position table."""
        layer = nn.Embedding(max_length, embedding_dim)
        pos_seq = torch.arange(start=0, end=max_length, requires_grad=False)
        embedding = layer(pos_seq).detach()
        return embedding

#### Train Embedding

In this code chunk we simply train the network given a number of hyperparameters.

In [None]:
from torch.utils.data import DataLoader
from torch.optim import SGD
import os

# Train the masked-word predictor, checkpointing the weights and a small
# progress log after every epoch so an interrupted run can be resumed.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

batch_size = 10
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

vocab_size = 60_000
embbeding_dim = 128
model = MaskedWordPredictor(vocab_size, embbeding_dim, max_length)
model.to(device)

# Targets hold the <PAD> id at every position that was not masked; ignoring
# that id makes the loss measure only the prediction of the masked words.
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.get_word_id("<PAD>"))
optimizer = SGD(model.parameters(), lr=0.001)

chkpnt_path = f"masked_word_predictor_{vocab_size}_{embbeding_dim}.chk"
if os.path.exists(chkpnt_path):
    model.load_state_dict(torch.load(chkpnt_path, map_location=device))

# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative checkpoint path above -- confirm that this is intended.
progress_path = "/masked_word_predictor_progress.json"
if os.path.exists(progress_path):
    with open(progress_path, "r") as progress_file:
        progress = json.load(progress_file)
else:
    progress = {"chkpnt": 0, "progress": []}

# Training loop
epochs = 100
for epoch in range(progress["chkpnt"], epochs):
    last_loss = None
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        seq, target = batch
        seq = seq.to(device)
        target = target.to(device)
        output = model(seq).transpose(1, 2)  # cross entropy loss expects shape [batches, nb_classes, dim1]
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        last_loss = loss.item()
        if i % 10 == 0:
            print(f"Epoch {epoch } Batch {i}, Loss: {last_loss}")
    # last_loss stays None only if the dataloader yielded no batch.
    progress["progress"].append({"epoch": epoch, "loss": last_loss})
    # Store the NEXT epoch to run so a resumed run does not repeat this one.
    progress["chkpnt"] = epoch + 1
    with open(progress_path, "w") as progress_file:
        json.dump(progress, progress_file)
    torch.save(model.state_dict(), chkpnt_path)

In [None]:
import torch 
import numpy as np

# Load a trained checkpoint and print the three most likely words for the
# <MASK> token of a hand-written example sentence.
chkpnt = "/content/masked_word_predictor_60000_128.chk"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

max_length = 10

# Position of the "<MASK>" token inside the example below.
mask = 4
seq = ["هل", "فرش",  "+ت", "ال+", "<MASK>", "يوما"]
# Pad with the dataset's "<PAD>" token; np.pad's default constant fill is 0,
# which is not the padding token the model was trained with.
seq = seq + ["<PAD>"] * (max_length - len(seq))
seq = [train_dataset.word_to_id[w] for w in seq]
seq = torch.as_tensor(seq).view(1, -1).to(device)

vocab_size = 60_000
embbeding_dim = 128
inference = MaskedWordPredictor(vocab_size, embbeding_dim, max_length)
# Move in place rather than chaining, so this also works when the model's
# `to` does not return the module.
inference.to(device)
inference.load_state_dict(torch.load(chkpnt, map_location=device))
inference.eval()

with torch.no_grad():
    output = inference(seq)
# argsort is ascending: the last three ids are the best ones; reverse them so
# that "Word 1" is the most likely prediction.
top_3 = np.argsort(output.detach().cpu().numpy(), axis=2)[:, :, -3:]
top_3 = top_3[0, mask, ::-1]

print(f"Word 1: {train_dataset.id_to_word[int(top_3[0])]}")
print(f"Word 2: {train_dataset.id_to_word[int(top_3[1])]}")
print(f"Word 3: {train_dataset.id_to_word[int(top_3[2])]}")