# Word Embeddings

## Loading Datasets & Importing Modules

In [3]:
import sys
sys.path.append('./Files')
sys.path.append('./Datasets')

In [5]:
import nltk
import gensim
from gensim.models import Word2Vec
import pandas as pd
from Sequencer import Sequencer
from helper_fns import write_dict
from helper_fns import read_dict
import numpy as np
import matplotlib.pyplot as plt
import json
import string

In [6]:
import warnings
warnings.filterwarnings(action = 'ignore')

## Word2Vec

### CBOW

In [None]:
fpath = "Text Preprocessing Experimentations/exp9_train_rtp_rl_re_cat_rl.csv"
name = "exp9"
mc = 5
hc = 2
vs = 100

In [None]:
df = pd.read_csv(fpath)
df.dropna(inplace=True)

In [None]:
tweets = []
for i in df['tweet']:
    tweet = nltk.word_tokenize(i)
    tweets.append(tweet)

#### Implementation

In [13]:
tweets_size = len(tweets)
tweets_size

6637

In [14]:
flat_tweets = [item for sublist in tweets for item in sublist]
vocab = set(flat_tweets)

In [15]:
vocab_size = len(vocab)
embed_dim = 10
context_size = 2
vocab_size

18283

In [16]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

In [17]:
data = []
for i in range(tweets_size):
    for j in range(2, len(tweets[i]) - 2):
        context = [tweets[i][j - 2], tweets[i][j - 1], tweets[i][j + 1], tweets[i][j + 2]]
        target = tweets[i][j]
        data.append((context, target))
print(data[:7])
len(data)

[(['اهنئ', 'احمد', 'دين', 'قيادي'], 'جمال'), (['احمد', 'جمال', 'قيادي', 'بحزب'], 'دين'), (['جمال', 'دين', 'بحزب', 'مصر'], 'قيادي'), (['دين', 'قيادي', 'مصر', 'بمناسب'], 'بحزب'), (['قيادي', 'بحزب', 'بمناسب', 'صدر'], 'مصر'), (['بحزب', 'مصر', 'صدر', 'روايت'], 'بمناسب'), (['برادعي', 'يستقو', 'مرهاخر', 'يرسل'], 'بامريكا')]


35262

In [18]:
embeddings =  np.random.random_sample((vocab_size, embed_dim))
print(embeddings[:2])
len(embeddings)

[[0.14277518 0.81119964 0.00769278 0.3773396  0.13513933 0.82835643
  0.57586812 0.14529619 0.78881317 0.75376777]
 [0.78276127 0.17575534 0.47901921 0.83521343 0.27718658 0.23803615
  0.1102499  0.85755252 0.52170385 0.43743006]]


18283

In [19]:
def linear(m, theta):
    w = theta
    return m.dot(w)

def log_softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.log(e_x / e_x.sum())

def NLLLoss(logs, targets):
    out = logs[range(len(targets)), targets]
    return -out.sum()/len(out)

def log_softmax_crossentropy_with_logits(logits,target):

    out = np.zeros_like(logits)
    out[np.arange(len(logits)),target] = 1
    
    softmax = np.exp(logits) / np.exp(logits).sum(axis=-1,keepdims=True)
    
    return (- out + softmax) / logits.shape[0]

def forward(context_idxs, theta):
    m = embeddings[context_idxs].reshape(1, -1)
    n = linear(m, theta)
    o = log_softmax(n)
    
    return m, n, o

def backward(preds, theta, target_idxs):
    m, n, o = preds
    
    dlog = log_softmax_crossentropy_with_logits(n, target_idxs)
    dw = m.T.dot(dlog)
    
    return dw

def optimize(theta, grad, lr=0.03):
    theta -= grad * lr
    return theta

In [20]:
theta = np.random.uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))

In [21]:
epoch_losses = {}

for epoch in range(30):

    losses =  []

    for context, target in data:
        context_idxs = np.array([word_to_ix[w] for w in context])
        preds = forward(context_idxs, theta)

        target_idxs = np.array([word_to_ix[target]])
        loss = NLLLoss(preds[-1], target_idxs)

        losses.append(loss)

        grad = backward(preds, theta, target_idxs)
        theta = optimize(theta, grad, lr=0.03)
        
    epoch_losses[epoch] = losses

In [23]:
np.savetxt("Word Embeddings/embeddings.txt", embeddings)

#### Gensim Library

In [30]:
CBoW = Word2Vec(tweets,
                vector_size=50,
                window=2,
                min_count=2)

In [31]:
CBoW_dict = {}
for word in CBoW.wv.key_to_index:
    CBoW_dict[word] = CBoW.wv[word].tolist()

In [32]:
len(CBoW.wv.key_to_index)

7020

In [33]:
CBoW.save("Word Embeddings/cb_{}_{}_{}_{}.model".format(name, mc, hc, vs))

In [34]:
write_dict("Word Embeddings/cb_dict_{}_{}_{}_{}.txt".format(name, mc, hc, vs), CBoW_dict)

### Skip-Gram

In [8]:
fpath = "Text Preprocessing Experimentations/exp9_train_rtp_rl_re_cat_rl.csv"
name = "exp9"
mc = 5
hc = 2
vs = 100

In [9]:
df = pd.read_csv(fpath)
df.dropna(inplace=True)

In [10]:
tweets = []
for i in df['tweet']:
    tweet = nltk.word_tokenize(i)
    tweets.append(tweet)

In [11]:
SkipGram = Word2Vec(tweets,
                    min_count=mc,
                    window=hc,
                    vector_size=vs,
                    sorted_vocab=1,
                    sg=1)

In [12]:
skipgram_dict = {}
for word in SkipGram.wv.key_to_index:
    skipgram_dict[word] = SkipGram.wv[word].tolist()

In [13]:
len(SkipGram.wv.key_to_index)

2238

In [20]:
SkipGram.save("Word Embeddings/sg_{}_{}_{}_{}.model".format(name, mc, hc, vs))

In [21]:
write_dict("Word Embeddings/sg_dict_{}_{}_{}_{}.txt".format(name, mc, hc, vs), skipgram_dict)

## GloVe