# Word Embeddings 

## imports

If you don't have `python-bidi` or `arabic-reshaper` installed, install them first (uncomment the two cells below); they are needed to render Arabic words correctly in matplotlib.

In [None]:
# ! pip install python-bidi

In [None]:
# ! pip install arabic-reshaper

In [None]:
import numpy as np
from utils2 import sigmoid, get_batches, get_dict_mod, normalize_arabic, remove_diacritics, get_arabic_and_full_stop
import nltk

import re
import os

from matplotlib import pyplot
from bidi.algorithm import get_display
import matplotlib.pyplot as plt
import arabic_reshaper

from sklearn.decomposition import PCA

## Process text

In [None]:
%%time

# Flat list that accumulates the cleaned tokens of every file in the corpus.
data = []
# Build the corpus path with os.path.join so it resolves on every OS
# (this yields '.\\data\\Khaleej-2004\\Economy' on Windows, as before).
root = os.path.join('.', 'data', 'Khaleej-2004', 'Economy')
# get all files (recursively) below the corpus root
files_paths = [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]

# loop over files and process them, then append them to the data list
for file_path in files_paths:

    # The context manager closes the file even if a cleaning step raises.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = remove_diacritics(line)
            text = normalize_arabic(text)
            # NOTE(review): presumably returns a list of tokens, so `+=`
            # extends `data` token by token -- confirm in utils2.
            text = get_arabic_and_full_stop(text)
            data += text

In [None]:
%%time
# Report how many tokens were collected and show the first 50 as a sanity check.
print("Number of tokens:", len(data),'\n', data[:50])               #  print data sample

## Filter wrong words (tokens with fewer than two distinct characters)

In [None]:
%%time
# Keep the full stop and every token that contains at least two distinct
# characters; tokens made of a single (possibly repeated) character are dropped.
data = list(filter(lambda token: len(set(token)) >= 2 or token == '.', data))

In [None]:
# Number of tokens left after the filtering step above.
len(data)

## compute frequency

In [None]:
%%time
# Compute the frequency distribution of the words in the dataset (vocabulary).
# `fdist` maps each distinct token to its number of occurrences in `data`;
# it is reused by the rare-word filter further down.
fdist = nltk.FreqDist(data)
print("Size of vocabulary: ",len(fdist) )
print("Most frequent tokens: ",fdist.most_common(20)) # print the 20 most frequent words and their freq.

## Filter rare words

In [None]:
%%time
# Drop rare tokens: only tokens whose count in `fdist` is greater than 2
# (i.e. seen at least three times) stay in the corpus.
data = list(filter(lambda token: fdist[token] > 2, data))

In [None]:
# Number of tokens left after removing the rare words.
len(data)

## Mapping words to indices and indices to words

In [None]:
%%time
# Build the word -> index mapping from the filtered corpus.
# NOTE(review): get_dict_mod comes from utils2; presumably it assigns one
# index per distinct token -- confirm there.
word2Ind = get_dict_mod(data)
# V is the vocabulary size, used below as a weight-matrix dimension.
V = len(word2Ind)
print("Size of vocabulary: ", V)

In [None]:
# Example lookup: show the index assigned to the word 'مصر'.
# NOTE(review): presumably raises KeyError if the word was filtered out of the vocabulary.
word2Ind['مصر']

<a name='2'></a>
# 2 Training the Model

###  Initializing the model

In [None]:
def initialize_model(N,V, random_seed=1):
    """Create the randomly initialised parameters of the two-layer model.

    The global NumPy random generator is re-seeded with ``random_seed`` and
    the four arrays are then drawn, in this order, with ``np.random.rand``
    (uniform samples in [0, 1)).

    Args:
        N: size of the hidden layer.
        V: size of the vocabulary.
        random_seed: seed passed to ``np.random.seed``.

    Returns:
        Tuple ``(W1, W2, b1, b2)`` with shapes (N, V), (V, N), (N, 1), (V, 1).
    """
    np.random.seed(random_seed)

    # Shapes listed in draw order: W1, W2, b1, b2.
    shapes = ((N, V), (V, N), (N, 1), (V, 1))
    W1, W2, b1, b2 = (np.random.rand(*shape) for shape in shapes)

    return W1, W2, b1, b2

<a name='2.1'></a>
### 2.1 Softmax
Before we can start training the model, we need to implement the softmax function as defined in equation 5:  

<br>
$$ \text{softmax}(z_i) = \frac{e^{z_i} }{\sum_{j=0}^{V-1} e^{z_j} }  \tag{5} $$

In [None]:
def softmax(z):
    """Compute a numerically stable softmax along axis 0.

    For a 2-D input every column is normalised independently.

    Args:
        z: array-like of scores.

    Returns:
        Array with the same shape as ``z`` whose entries are non-negative
        and sum to 1 along axis 0.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; return an empty float array of the same shape.
        return np.exp(z)

    # Subtracting the maximum along axis 0 leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would turn
    # the division into inf/inf = nan).
    shifted = z - np.max(z, axis=0, keepdims=True)
    e_z = np.exp(shifted)
    yhat = e_z / np.sum(e_z, axis=0, keepdims=True)

    return yhat

<a name='2.2'></a>
### 2.2 Forward propagation

<a name='ex-03'></a>
### Exercise 03
Implement the forward propagation $z$ according to equations (1) to (3). <br>

\begin{align}
 h &= W_1 \  X + b_1  \tag{1} \\
 a &= ReLU(h)  \tag{2} \\
 z &= W_2 \  a + b_2   \tag{3} \\
\end{align}

For that, you will use as activation the Rectified Linear Unit (ReLU) given by:

$$f(h)=\max (0,h) \tag{6}$$

In [None]:
def forward_prop(x, W1, W2, b1, b2):
    """Run the forward pass of the two-layer network.

    Args:
        x: input matrix, multiplied on the left by ``W1``.
        W1: first-layer weights.
        W2: second-layer weights.
        b1: first-layer bias, added to ``W1 . x``.
        b2: second-layer bias, added to ``W2 . h``.

    Returns:
        Tuple ``(z, h)``: the output scores ``z = W2 . h + b2`` and the
        hidden activation ``h = ReLU(W1 . x + b1)`` (already rectified).
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden = np.maximum(0, np.dot(W1, x) + b1)

    # Output layer: affine transform of the rectified hidden activation.
    scores = np.dot(W2, hidden) + b2

    return scores, hidden

<a name='2.3'></a>
## 2.3 Cost function

In [None]:
# compute_cost: cross-entropy cost function
def compute_cost(y, yhat, batch_size):
    """Compute the cross-entropy cost of a batch of predictions.

    The cost is ``-1/batch_size * sum(y*log(yhat) + (1-y)*log(1-yhat))``.

    Args:
        y: array of target values, same shape as ``yhat``.
        yhat: array of predicted probabilities.
        batch_size: number of examples the sum is averaged over.

    Returns:
        The cost as a 0-d NumPy value (result of ``np.squeeze``).
    """
    y = np.asarray(y)
    yhat = np.asarray(yhat)

    # Keep probabilities strictly inside (0, 1): a prediction saturated at
    # exactly 0 or 1 would give log(0) = -inf and 0 * -inf = nan, poisoning
    # the whole sum. Values already inside the interval are left untouched.
    float_type = yhat.dtype if np.issubdtype(yhat.dtype, np.floating) else np.float64
    eps = np.finfo(float_type).eps
    yhat = np.clip(yhat, eps, 1 - eps)

    # cost function
    logprobs = np.multiply(np.log(yhat), y) + np.multiply(np.log(1 - yhat), 1 - y)
    cost = - 1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

<a name='2.4'></a>
## 2.4 Training the Model - Backpropagation

![Backpropagation](imgs/back_porp.png)

In [None]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    """Compute the gradients of the two-layer network for one batch.

    ``yhat - y`` is used as the error at the output scores (the gradient of
    the softmax cross-entropy ``-sum(y * log(yhat))`` with respect to ``z``).

    Args:
        x: input matrix, one column per example.
        yhat: predicted probabilities, one column per example.
        y: target matrix, same shape as ``yhat``.
        h: hidden activation after ReLU, one column per example (the second
            value returned by ``forward_prop``).
        W1: first-layer weights (unused, kept for interface compatibility).
        W2: second-layer weights.
        b1: first-layer bias (unused, kept for interface compatibility).
        b2: second-layer bias (unused, kept for interface compatibility).
        batch_size: number of examples the gradients are averaged over.

    Returns:
        Tuple ``(grad_W1, grad_W2, grad_b1, grad_b2)`` with the shapes of
        ``W1``, ``W2``, ``b1`` and ``b2`` respectively.
    """
    # Error at the output layer.
    dz = yhat - y

    # Back-propagate the error to the hidden layer.
    l1 = np.dot(W2.T, dz)
    # Derivative of ReLU: the gradient only flows through hidden units that
    # were active in the forward pass (h > 0). Clipping l1 itself at zero
    # would discard every negative gradient instead.
    l1 = l1 * (h > 0)

    # Compute the gradient of W1
    grad_W1 = (1/batch_size)*np.dot(l1, x.T)
    # Compute the gradient of W2
    grad_W2 = (1/batch_size)*np.dot(dz, h.T)
    # Bias gradients are the errors summed over the batch axis; they must not
    # be derived from the weight gradients.
    # Compute the gradient of b1
    grad_b1 = (1/batch_size)*np.sum(l1, axis=1, keepdims=True)
    # Compute the gradient of b2
    grad_b2 = (1/batch_size)*np.sum(dz, axis=1, keepdims=True)

    return grad_W1, grad_W2, grad_b1, grad_b2

<a name='2.5'></a>
## Train

In [None]:
def train(data, word2Ind, N, V, num_iters, C=2, batch_size=128, alpha=0.03):
    """Train the model with mini-batch gradient descent.

    Parameters are initialised with ``initialize_model`` (seed 282) and
    updated once per batch produced by ``get_batches``. The cost is printed
    on every second iteration, and the learning rate is multiplied by 0.66
    after every 100th iteration.

    Args:
        data: corpus forwarded to ``get_batches``.
        word2Ind: word-to-index mapping forwarded to ``get_batches``.
        N: size of the hidden layer.
        V: size of the vocabulary.
        num_iters: training stops after this many iterations.
        C: forwarded to ``get_batches``.
            NOTE(review): presumably the context half-window -- confirm in utils2.
        batch_size: number of examples per batch.
        alpha: initial learning rate.

    Returns:
        Tuple ``(W1, W2, b1, b2)`` with the trained parameters.
    """
    W1, W2, b1, b2 = initialize_model(N, V, random_seed=282)

    batches = get_batches(data, word2Ind, V, C, batch_size)
    # `step` is the 1-based number of the iteration being executed.
    for step, (x, y) in enumerate(batches, start=1):

        # Forward pass: scores, hidden activation, predicted probabilities.
        z, h = forward_prop(x, W1, W2, b1, b2)
        yhat = softmax(z)

        # Cost of the current batch, reported on every second iteration.
        cost = compute_cost(y, yhat, batch_size)
        if step % 2 == 0:
            print(f"iters: {step} cost: {cost:.6f}")

        # Backward pass.
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)

        # Gradient-descent step on weights and biases (in place).
        W1 -= alpha*grad_W1
        W2 -= alpha*grad_W2
        b1 -= alpha*grad_b1
        b2 -= alpha*grad_b2

        if step == num_iters:
            break
        # Decay the learning rate after every 100th iteration.
        if step % 100 == 0:
            alpha *= 0.66

    return W1, W2, b1, b2

In [None]:
%%time
# Hyper-parameters for the training run.
# C is forwarded to get_batches by train().
# NOTE(review): presumably the context half-window -- confirm in utils2.
C = 2
batch_size = 128
# Vocabulary size, recomputed from the word -> index mapping.
V = len(word2Ind)
# Size of the hidden layer, i.e. the dimension of the learned word vectors.
N = 300
# Number of gradient-descent iterations to run.
num_iters = 20
print("Call gradient_descent")
# Train and keep the learned parameters for the visualisation cells below.
W1, W2, b1, b2 = train(data, word2Ind, N, V, num_iters, C, batch_size)

<a name='3'></a>
## 3.0 Visualizing the word vectors

In [None]:
# visualizing the word vectors here
from matplotlib import pyplot
%config InlineBackend.figure_format = 'svg'
# Words whose learned vectors will be plotted.
words = ['ملك', 
         'ملكه',
         'رجل',
         'امراه',
         'طفل',
         'طفله',
         'حرب',
         'سلام',
         'الارض',
         'السماء',
         'الكواكب',
         'النجوم',
         'القمر',
        ]

# Vocabulary index of every selected word.
# NOTE(review): presumably raises KeyError for a word that is not in word2Ind
# (e.g. one removed by the rare-word filter) -- verify the list against the corpus.
idx = [word2Ind[word] for word in words]

# A word's embedding is the average of its column in W1 (row of W1.T) and its row in W2.
embs = (W1.T[idx, :] + W2[idx, :])/2.0
 
X = embs
print(X.shape, idx)

In [None]:
# Reshape and reorder each Arabic label so matplotlib renders it correctly.
# `words` is overwritten, so run this cell only once per word list.
words = [get_display(arabic_reshaper.reshape(word)) for word in words]

In [None]:
# Project the selected embeddings onto their first two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# The second component goes on the horizontal axis, the first on the vertical.
horizontal, vertical = result[:, 1], result[:, 0]

pyplot.scatter(horizontal, vertical)
# Label every point with its (already reshaped) word.
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(horizontal[i], vertical[i]))
pyplot.show()