<p style = "font-size: 37px">Phases</p>
<br>
<ul style = "font-size : 25px;">
<li ><a href = "#Loading the data">Loading the data</a></li>
<br/>    
<li><a href = "#Data preprocessing">Data preprocessing</a></li>
<br/>      
<li><a href = "#Model Building">Model Building</a></li>       
</ul>   

# Loading the Data
<p id = "Loading the data"></p>

In [None]:
import re                                                  # use regular expression to deal with text
from nltk.tokenize import sent_tokenize, word_tokenize     # for word and sentence tokenization
from nltk.corpus import stopwords                          # to help removing stopping words
from nltk.stem import WordNetLemmatizer                    # to lemmatize the words
from string import punctuation                             # to remove punctuation like ? ! . etc
import io                                                  # to read embeddings from text file
import math                                                # to get mathmatical functions sin cos etc
import pandas as pd                                        # to deal with dataframes    
import numpy as np                                         # to deal with numbers and dataframes  
import tensorflow as tf                                    # the main package to build the transformer
import sys                                                 # good printing tool
import pickle                                              # importing the parameters

In [None]:
# read the data as a dataframe
df = pd.read_csv(r"train.csv")

In [None]:
# let's load the embeddings as a dictionary where each key is a word and each value is its corresponding embedding
def load_vectors(fname):
    """
    Load fastText-style text embeddings into a dictionary.

    The first line of the file must hold two integers (number of words and
    embedding dimension); every following line holds a word followed by its
    space-separated vector components.

    input :
    fname (String) ------> file path
    output :
    data (dictionary)----> dictionary to access the embeddings with words as keys;
                           each value is the list of components exactly as read
                           from the file (strings), plus a "<PAD>" key mapped to
                           a zero vector of the file's embedding dimension
    """
    # the context manager guarantees the file handle is closed, even on error
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # header line: vocabulary size (unused) and embedding dimension
        _, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = tokens[1:]
    # add pad token as zero vector; its length follows the dimension declared
    # in the file header so it always matches the real embeddings
    data["<PAD>"] = [0] * d
    return data

In [None]:
# build the word -> embedding dictionary from the embeddings text file
dic = load_vectors("wiki.simple.vec.text")

# Data preprocessing
<p id = "Data preprocessing"></p>

In [None]:
def text_cleansing(text,maximum_len = 20):
    """
    Clean a raw string and return the surviving words joined by single spaces.

    This function performs the following steps:

    1. Lowercases the text
    2. Removes HTTP and WWW links
    3. Removes non-ASCII characters
    4. Removes digits
    5. Tokenizes the text
    6. Drops stop words and tokens longer than the specified maximum length
    7. Strips leading/trailing punctuation and drops tokens left empty
    8. Lemmatizes the remaining tokens

    inputs :
    text (String) -----------> String to perform the cleaning
    maximum_len(int)---------> The maximum length of tokens to keep
    output:
    cleaned_tokens(text)-----> A string containing the cleaned text
    """
    # lower case the text
    text = text.lower()
    # replace the text that has the pattern of an http link with an empty string
    text = re.sub(r"(https?[\s+]?:?[\s+]?[(\w+|.)/?(\w+|.)?]+[\s]?)","",text)
    # replace the text that has the pattern of a www. link only
    text = re.sub(r"(\w+[\s+]?:[\s+]?www.\w+.\w+)","",text)
    # remove non-ASCII characters like "| ¡ ¿ † ‡ ↔ ↑ ↓ • ¶"
    text = re.sub(r'[^\x00-\x7f]','',text)
    # remove digits from the text
    text = re.sub(r"\d","",text)
    # turn the text into a list of words
    tokens = word_tokenize(text)
    # a set gives constant-time membership tests inside the loop below
    stop_words = set(stopwords.words("english"))
    # initialize a lemmatizer
    lemmatizer = WordNetLemmatizer()
    # collect the cleaned tokens
    cleaned_tokens = []
    for token in tokens:
        # skip over-long tokens and stop words (checked before punctuation is stripped)
        if len(token) > maximum_len or token in stop_words:
            continue
        # strip surrounding punctuation; pure-punctuation tokens become empty
        s = token.strip(punctuation)
        if s:
            cleaned_tokens.append(lemmatizer.lemmatize(s))

    return " ".join(cleaned_tokens)

In [None]:
# apply the cleaning to every row of the second column (position 1) of the dataframe
df.iloc[:,1] = df.iloc[:,1].apply(text_cleansing)

In [None]:
def tokenizer(text, max_len = 50):
    """
    Tokenize a sentence and force it to a fixed number of tokens.

    Sentences longer than max_len are truncated; shorter ones are
    right-padded with the "<PAD>" token.

    input :
    text (String) ------> text of the sentence
    max_len (int) ------> number of tokens in the returned list (default 50)
    output:
    tokens (list) ------> list of exactly max_len words 'tokens'
    """
    # slicing is a no-op when the sentence is already short enough
    tokens = word_tokenize(text)[:max_len]
    # the padding length is zero when the sentence already has max_len tokens
    tokens.extend(["<PAD>"] * (max_len - len(tokens)))
    return tokens

In [None]:
# apply the tokenizer to each sentence in the second column (position 1) of the dataframe
df.iloc[:,1] = df.iloc[:,1].apply(tokenizer)

# Model Building
<p id = "Model Building"></p>

In [None]:
class Model:
    def __init__(self, dic, n = 50, d = 300, h = 2, k = 300, c = 1200, m = 2):
        """
        Store the hyperparameters and create the model parameters.

        inputs :
        dic (dictionary)----> word -> embedding lookup used by input_embeddings
        n (int) ------------> number of tokens per sentence
        d (int) ------------> embedding dimension
        h (int) ------------> number of attention heads
        k (int) ------------> dimension of the Query/Key projections
        c (int) ------------> hidden size of the feed-forward layer
        m (int) ------------> number of stacked encoder layers
        """
        self.dic = dic
        # hyperparameters, grouped in pairs
        self.n, self.d = n, d
        self.h, self.k = h, k
        self.c, self.m = c, m
        # the builders below read the hyperparameters, so they must run after them
        self.positional_encoding = self.positional_embeddings()
        self.encoder_parameters = self.Encoder_parameters_intializer()
        self.head_parameters = self.Head_parameters_intializer()
        # single flat tuple: encoder parameters first, then head parameters
        self.parameters = self.encoder_parameters + self.head_parameters
        
    def import_parameters(self,pname):
        with open(pname, 'rb') as file:
            # Call load method to deserialze
            self.parameters = pickle.load(file)
        self.encoder_parameters = self.parameters[:8]
        self.head_parameters    = self.parameters[8:]
        
    def input_embeddings(self, sentence):
        """
        Look up the embedding of every word and stack them into a [d, n] tensor.

        d is the embedding length
        n is the sentence length
        Words missing from the dictionary use the embedding of the special
        token '</s>'.

        input :
        sentence (list) ------> list of words
        output :
        tokens (Tensor)-------> tensor of size [d , n]
        """
        lookup = self.dic
        # out-of-vocabulary words fall back to the special token </s>
        vectors = [lookup[word] if word in lookup else lookup['</s>'] for word in sentence]
        # one row per word after stacking, so transpose to get one column per word
        matrix = np.array(vectors, dtype = np.float32).T
        return tf.Variable(matrix)
    def positional_embeddings(self, c = 10000):
        """
        Build the sinusoidal positional table for n positions and d dimensions.

        Entry (i, position) is sin(position / c**(i/d)) when i is even and
        cos(position / c**(i/d)) when i is odd.

        inputs :
        c (int) -------> base of the wavelength term, by default 10000
        output :
        tokens (Tensor)------> positional embeddings of size [d, n]
        """
        d, n = self.d, self.n

        def entry(position, i):
            # same angle formula for both parities; only the wave function changes
            angle = position / math.pow(c, i/d)
            return math.sin(angle) if i%2 == 0 else math.cos(angle)

        # one row per position, one column per embedding dimension
        table = [[entry(position, i) for i in range(d)] for position in range(n)]
        # transpose so that every column holds the encoding of one position
        return tf.Variable(np.array(table).T, dtype = tf.float32, shape = [d,n] )
    def padding_mask(self, X):
        """
        Create a mask which marks the zero padding tokens in the input by a 1
        this mask is used to force the model to ignore paddings

        A token is treated as padding when its whole column in X is zero.
        The result is a column vector of padding flags repeated n times, so
        mask[i, j] == 1 exactly when token i is padding. MHSA adds
        mask * -1e9 to its attention scores and applies the softmax along
        axis 1 (the row index of this mask), so the flag must vary along the
        rows; a flag that is constant along the softmax axis would only shift
        every score by the same amount and exclude nothing.

        inputs:
        X (Tensor) -------> matrix of embeddings of size [d,n]
        output:
        mask (Tensor)----> float32 mask of size [number of columns of X, n]
        """
        n = self.n

        # boolean vector: True where every entry of the token's column is zero
        is_padding = tf.reduce_all(tf.math.equal(X, 0), axis = 0)
        # cast the vector from boolean to float32 then reshape it as a column vector
        mask = tf.cast(is_padding, tf.float32)
        mask = tf.reshape(mask, (-1,1))
        # repeat that column n times, side by side
        mask = tf.concat([mask]*n, axis = 1)
        return mask
    def MHSA(self, X, U_q, U_k, V, W):
        """
        Compute multi head self attention for the input matrix X.

        inputs :
        X   (Tensor)----> input matrix of size [d,n]
        U_q (Tensor)---> matrix of size [h,k,d] is used to get Query matrix
        U_k (Tensor)---> matrix of size [h,k,d] is used to get Key matrix
        V   (Tensor)---> projection matrix of size [h,d,d]
        W   (Tensor)---> weight matrix with size [d,h*d] to project the concatenated heads back to X size
        output :
        mhsa (Tensor)----> matrix of size [d,n]
        """
        h, d, n, k = self.h, self.d, self.n, self.k

        # per-head query and key projections of the input
        queries = tf.matmul(U_q, X)
        keys    = tf.matmul(U_k, X)
        # large negative penalty wherever padding_mask flags a position
        penalty = self.padding_mask(X) * -1e9
        # raw scores Q^T K per head, penalised and scaled by sqrt(k)
        scores    = tf.matmul(queries, keys, transpose_a = True) + penalty
        attention = tf.nn.softmax(scores / np.sqrt(k), axis= 1)
        # weight the projected inputs by the attention matrix
        heads = V @ X @ attention
        # stack the heads along the feature axis and project back with W
        return W @ tf.reshape(heads, [h*d,n])
    def Add_Norm(self, x1, x2):
        """
        Add two tensors and standardize the sum along axis 1.

        Every row of the sum has its own mean subtracted and is divided by
        its own standard deviation plus 1e-6.

        inputs :
        x1 (Tensor)-----> tensor of size [d,n]
        x2 (Tensor)-----> tensor of size [d,n]
        output:
        Normalized_X (Tensor)-----> normalized tensor of size [d,n]
        """
        total = x1 + x2
        # per-row statistics reshaped to columns so they broadcast across each row
        row_mean = tf.reshape(tf.math.reduce_mean(total,axis = 1), [-1,1])
        # the small constant avoids a division by zero on constant rows
        row_std  = tf.reshape(tf.math.reduce_std(total,axis = 1), [-1,1]) +1e-6
        return (total - row_mean) / row_std
    def feed_forward_NN(self,X, W1, W2, b1, b2):
        """
        Two-layer feed-forward network: ReLU hidden layer, dropout, linear output.

        inputs :
        X  (Tensor)-----> input matrix
        W1 (Tensor)-----> weights of the hidden layer
        W2 (Tensor)-----> weights of the output layer
        b1 (Tensor)-----> bias of the hidden layer
        b2 (Tensor)-----> bias of the output layer
        output:
        output (Tensor)-> W2 @ dropout(relu(W1 @ X + b1)) + b2
        """
        # c is approximately 4 * d
        # in the paper d = 512 and c = 2048
        activated = tf.nn.relu(W1 @ X + b1)
        # dropout with rate 0.5 is applied on every call
        kept = tf.nn.dropout(activated,0.5)
        return W2 @ kept + b2
    def Encoder_parameters_intializer(self):
        """
        Create the trainable variables of one encoder layer.

        output:
        (U_q, U_k, V, W, W1, W2, b1, b2) (tuple of tf.Variable) with sizes
        [h,k,d], [h,k,d], [h,d,d], [d,h*d], [c,d], [d,c], [c,1], [d,1]
        """
        # All the distributions have been chosen based on a lot of experiments
        h, k, d, c = self.h, self.k, self.d, self.c

        def unit_uniform(shape):
            # variable drawn uniformly between 0 and 1
            return tf.Variable(tf.random.uniform(shape, minval = 0, maxval = 1))

        # attention parameters (drawn in the same order they are returned)
        U_q = unit_uniform((h,k,d))
        U_k = unit_uniform((h,k,d))
        V   = unit_uniform((h,d,d))
        W   = unit_uniform((d,h*d))
        # feed-forward weights use a normal distribution
        W1  = tf.Variable(tf.random.normal((c,d)))
        W2  = tf.Variable(tf.random.normal((d,c)))
        # feed-forward biases
        b1  = tf.Variable(tf.random.uniform((c,1)))
        b2  = tf.Variable(tf.random.uniform((d,1)))
        return U_q,U_k,V,W,W1,W2,b1,b2
    
    def Encoder(self, X):
        """
        Run one encoder layer on X and return the result.

        Steps: add the positional encoding, apply dropout (rate 0.1), multi
        head self attention followed by add & norm, then the feed-forward
        network followed by add & norm.

        input :
        X (Tensor)-----> tensor of the embeddings of size [d,n]
        output:
        X (Tensor)-----> encoded tensor, same size as the input
        """
        U_q,U_k,V,W,W1,W2,b1,b2 = self.encoder_parameters
        X    = X + self.positional_encoding
        X    = tf.nn.dropout(X,0.1)
        # NOTE(review): MHSA derives its padding mask from all-zero columns of
        # the X it receives, but X already has the positional encoding added
        # here, so padded columns are presumably no longer all zero - confirm
        # whether the mask should be computed from the raw embeddings instead.
        mhsa = self.MHSA(X, U_q, U_k, V, W)
        X    = self.Add_Norm(X, mhsa)
        fnn  = self.feed_forward_NN(X, W1, W2, b1, b2)
        X    = self.Add_Norm(X, fnn)
        # the result must be returned: Transformer_block feeds it to the next layer
        return X
        
    def Transformer_block(self, X):
        """
        this function combines multiple layers of the encoder to get the transformer output

        The same encoder parameters (self.encoder_parameters) are used by every
        layer, and dropout with rate 0.1 follows each layer. At least one layer
        always runs; self.m - 1 further layers follow it.

        inputs :
        X          (Tensor)-----> tensor of the embeddings of size [d,n]
        output:
        encoder_output (Tensor)---> the output of the transformer which has the same size as X
        """
        # define one layer outside the loop then use the output as input
        # repeat the process m-1 times
        # don't forget the dropout to prevent overfitting
        encoder_output = self.Encoder(X)
        encoder_output = tf.nn.dropout(encoder_output,0.1)
        for _ in range(self.m-1):
            encoder_output = self.Encoder(encoder_output)
            encoder_output = tf.nn.dropout(encoder_output,0.1)
        # hand the final representation back to the caller, as the docstring promises
        return encoder_output
