<p style = "font-size: 37px">Phases</p>
<br>
<ul style = "font-size : 25px;">
<li ><a href = "#Loading the data">Loading the data</a></li>
<br/>    
<li><a href = "#Data preprocessing">Data preprocessing</a></li>
<br/>      
<li><a href = "#Model Building">Model Building</a></li>       
</ul>   

# Loading the Data
<p id = "Loading the data"></p>

In [None]:
import re                                                  # use regular expression to deal with text
from nltk.tokenize import sent_tokenize, word_tokenize     # for word and sentence tokenization
from nltk.corpus import stopwords                          # to help removing stopping words
from nltk.stem import WordNetLemmatizer                    # to lemmatize the words
from string import punctuation                             # to remove punctuation like ? ! . etc
import io                                                  # to read embeddings from text file
import math                                                # to get mathmatical functions sin cos etc
import pandas as pd                                        # to deal with dataframes    
import numpy as np                                         # to deal with numbers and dataframes  
import tensorflow as tf                                    # the main package to build the transformer
import sys                                                 # good printing tool
import pickle                                              # importing the parameters

In [None]:
# Read train.csv into a pandas DataFrame
df = pd.read_csv(r"train.csv")

In [None]:
# Load the embeddings as a dictionary where each key is a word and each value is its embedding
def load_vectors(fname):
    """
    Read a fastText-style text embeddings file into a dictionary.

    The first line of the file is a header holding two integers (number of
    words and embedding dimension). Every following line holds a word followed
    by its space-separated vector components.

    input :
    fname (String) ------> file path
    output :
    data (dictionary)----> dictionary to access the embeddings with words as keys;
                           each value is the list of vector components kept as the
                           strings read from the file. The extra "<PAD>" key maps
                           to a list of integer zeros whose length is the
                           embedding dimension given in the header.
    """
    # the context manager closes the file even if parsing fails part-way
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # header: vocabulary size (not needed here) and embedding dimension
        _, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = tokens[1:]
    # add the pad token as a zero vector, sized from the header instead of a
    # hard-coded 300 so that files with other dimensions stay consistent
    data["<PAD>"] = [0] * d
    return data

In [None]:
# Load the embeddings file into a word -> vector dictionary
dic = load_vectors("wiki.simple.vec.text")

# Data preprocessing
<p id = "Data preprocessing"></p>

In [None]:
def text_cleansing(text,maximum_len = 20):
    """
    Clean a raw text string and return the surviving words joined by spaces.

    The steps are applied in this order:

    1. Lowercases the text
    2. Removes HTTP(S) links and "<word>: www..." links
    3. Removes non-ASCII characters
    4. Removes digits
    5. Tokenizes the text with word_tokenize
    6. Drops English stop words and tokens longer than maximum_len
       (over-long tokens are discarded entirely, not shortened)
    7. Strips leading/trailing punctuation from each remaining token and
       drops tokens that become empty
    8. Lemmatizes the remaining tokens

    inputs :
    text (String) -----------> String to perform the cleaning on
    maximum_len(int)---------> The maximum length of a token to keep
    output:
    cleaned_text (String)----> The cleaned tokens joined with single spaces
    """
    # lower case the text
    text = text.lower()
    # remove anything that matches the http(s) link pattern
    text = re.sub(r"(https?[\s+]?:?[\s+]?[(\w+|.)/?(\w+|.)?]+[\s]?)","",text)
    # remove "<word>: www..." links
    # NOTE: the dots in this pattern are unescaped, so they match any
    # character; the pattern is kept as-is to preserve existing behavior
    text = re.sub(r"(\w+[\s+]?:[\s+]?www.\w+.\w+)","",text)
    # remove non-ASCII characters like "| ¡ ¿ † ‡ ↔ ↑ ↓ • ¶"
    text = re.sub(r'[^\x00-\x7f]','',text)
    # remove digits from the text
    text = re.sub(r"\d","",text)
    # turn the text into a list of words
    tokens = word_tokenize(text)
    # a set makes the per-token stop-word check a constant-time lookup
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token in tokens:
        # the length and stop-word checks run on the raw token,
        # before any punctuation is stripped
        if len(token) > maximum_len or token in stop_words:
            continue
        stripped = token.strip(punctuation)
        # tokens made only of punctuation become empty and are dropped
        if stripped:
            cleaned_tokens.append(lemmatizer.lemmatize(stripped))

    return " ".join(cleaned_tokens)

In [None]:
# Apply the cleaning function to every value in the second column (position 1)
df.iloc[:,1] = df.iloc[:,1].apply(text_cleansing)

In [None]:
def tokenizer(text, max_len = 50, pad_token = "<PAD>"):
    """
    Split a sentence into word tokens and force the result to a fixed length.

    Sentences with more than max_len tokens are truncated to the first
    max_len tokens; shorter sentences are right-padded with pad_token.

    input :
    text (String) ------> text of the sentence
    max_len (int) ------> number of tokens in the returned list (default 50)
    pad_token (String)--> token appended to short sentences (default "<PAD>")
    output:
    tokens (list) ------> list of exactly max_len words 'tokens'
    """
    tokens = word_tokenize(text)
    if len(tokens) > max_len:
        return tokens[:max_len]
    # pads with zero items when the sentence already has max_len tokens
    tokens.extend([pad_token] * (max_len - len(tokens)))
    return tokens

In [None]:
# Apply the tokenizer (truncate/pad to a fixed length) to every value in the second column
df.iloc[:,1] = df.iloc[:,1].apply(tokenizer)

# Model Building
<p id = "Model Building"></p>

In [None]:
class Model:
    def __init__(self, dic, n = 50, d = 300, h = 2, k = 300, c = 1200, m = 2):
        """
        Store the embeddings dictionary and the hyperparameters, then build
        the positional encoding and the initial parameters.

        inputs :
        dic (dictionary) ----> word -> embedding lookup used by input_embeddings
        n (int) -------------> number of tokens per sentence
        d (int) -------------> dimension of the embeddings
        h, k, c, m (int) ----> further hyperparameters kept on the instance
                               NOTE(review): their meaning is set by the parameter
                               initializers, which are defined elsewhere in the
                               class - confirm there
        """
        # keep the lookup table and every hyperparameter on the instance
        self.dic = dic
        self.n, self.d = n, d
        self.h, self.k = h, k
        self.c, self.m = c, m

        # positional encoding depends only on self.d and self.n, so build it once
        self.positional_encoding = self.positional_embeddings()

        # encoder parameters are created (and stored) before the head parameters
        self.encoder_parameters = self.Encoder_parameters_intializer()
        self.head_parameters = self.Head_parameters_intializer()

        # all parameters: encoder entries first, head entries after
        self.parameters = self.encoder_parameters + self.head_parameters
        
    def import_parameters(self,pname):
        with open(pname, 'rb') as file:
            # Call load method to deserialze
            self.parameters = pickle.load(file)
        self.encoder_parameters = self.parameters[:8]
        self.head_parameters    = self.parameters[8:]
        
    def input_embeddings(self, sentence):
        """
        Build the input embeddings matrix for one sentence.

        Each word is looked up in self.dic; a word that is missing from the
        dictionary uses the vector stored under the '</s>' key instead.

        input :
        sentence (list) ------> list of words
        output :
        tokens (Tensor)-------> tensor of size [d , n], where d is the
                                embeddings length and n is the sentence length
        """
        lookup = self.dic

        # one vector per word, falling back to '</s>' for out-of-vocab words
        vectors = [
            lookup[word] if word in lookup else lookup['</s>']
            for word in sentence
        ]

        # rows are words at this point; transpose to get size (d, n)
        matrix = np.array(vectors, dtype = np.float32).T
        return tf.Variable(matrix)
    def positional_embeddings(self, c = 10000):
        """
        Compute the sinusoidal positional embeddings of size [d, n].

        For position p and embedding index i the value is
        sin(p / c**(i/d)) when i is even and cos(p / c**(i/d)) when i is odd.
        d and n are read from self.d and self.n.

        inputs :
        c (int) -------> base of the frequency term, by default 10000
                         (this argument is separate from the self.c attribute)
        output :
        tokens (Tensor)------> positional embeddings for the tokens, size [d, n]
        """
        dim = self.d
        seq_len = self.n

        rows = []
        for pos in range(seq_len):
            row = []
            for idx in range(dim):
                # even and odd indices share the same angle; only the
                # trigonometric function differs
                angle = pos / math.pow(c, idx / dim)
                if idx % 2 == 0:
                    row.append(math.sin(angle))
                else:
                    row.append(math.cos(angle))
            rows.append(row)

        # rows are positions here; transpose so the result is [d, n]
        return tf.Variable(np.array(rows).T, dtype = tf.float32, shape = [dim, seq_len])
    