# Word Embeddings

## Loading Datasets & Importing Modules

In [6]:
import os
import subprocess
import sys
# Extend the module search path with the project folders; the project-local
# modules imported later (dataCleaner, Sequencer, helper_fns) presumably live there.
sys.path.extend(['./Files', './Datasets'])

In [7]:
# Install nltk with the kernel's own interpreter so the package lands in the
# environment this notebook actually runs in. check_call returns 0 on success
# and raises CalledProcessError when pip fails, instead of silently continuing.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'nltk'])

0

In [8]:
# Install gensim with the kernel's own interpreter so the package lands in the
# environment this notebook actually runs in. check_call returns 0 on success
# and raises CalledProcessError when pip fails, instead of silently continuing.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gensim'])

0

In [9]:
import nltk
import gensim
from gensim.models import Word2Vec
import pandas as pd
from dataCleaner import preProcess
from Sequencer import Sequencer
from helper_fns import write_dict
from helper_fns import read_dict
import numpy as np
import matplotlib.pyplot as plt
import json
import string

In [10]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [11]:
# Load the merged training set, standardise the text column name to "tweet",
# and discard the "Unnamed: 0" column if the CSV carries one (typically a
# leftover index column -- TODO confirm against the dataset).
# errors='ignore' turns the drop into a no-op when that column is absent,
# without hiding unrelated failures the way a bare `except: pass` would.
df_train = (
    pd.read_csv('Datasets/merged_train_datasets.csv')
    .rename(columns={"text": "tweet"})
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

## Word2Vec

### Preparing Data

In [12]:
# Clean the tweets with the project's preProcess helper. Its return value is
# discarded, so it presumably modifies df_train in place -- TODO confirm in dataCleaner.
preProcess(df_train)
# Preview the first rows after cleaning.
df_train.head()

Unnamed: 0,tweet,sentiment
0,اهنئ احمد جمال دين قيادي بحزب مصر بمناسب صدر ر...,positive
1,برادعي يستقو بامريكا مرهاخر يرسل عصام عريان اش...,negative
2,والده اقول بخاطر حشيش تضحك بس اقول كيلك تعطين ...,neutral
3,انتخبوا العرص انتخبوا البرص مرسي رئيس اين رئيس...,neutral
4,يتقال ستريكر صريح كاريوك السكه شمال,positive


In [13]:
# Split every tweet into a list of tokens with NLTK's word tokenizer.
tweets = [nltk.word_tokenize(text) for text in df_train['tweet']]

### CBOW

In [14]:
# Number of tokenised tweets in the training corpus.
tweets_size = len(tweets)
tweets_size

6637

In [15]:
# Concatenate the per-tweet token lists into one long token list ...
flat_tweets = []
for tokens in tweets:
    flat_tweets.extend(tokens)
# ... and keep each distinct token once.
vocab = set(flat_tweets)

In [16]:
# Number of distinct tokens in the corpus.
vocab_size = len(vocab)
# Length of each hand-trained word vector.
embed_dim = 10
# The output weights (theta) are sized as 2 * context_size * embed_dim,
# i.e. context_size words on each side of the target.
context_size = 2
vocab_size

18283

In [17]:
# Build the word <-> index lookup tables from the *sorted* vocabulary.
# Iterating a set of strings directly follows hash order, which changes between
# interpreter sessions (string hashing is randomised), so the row index of a
# word in the embedding matrix would not be reproducible after a kernel restart.
# Sorting makes the mapping depend only on the vocabulary itself.
ix_to_word = dict(enumerate(sorted(vocab)))
word_to_ix = {word: i for i, word in ix_to_word.items()}

In [18]:
# Build the CBOW training pairs: for every position that has `context_size`
# neighbours on both sides, the context is those 2 * context_size neighbours
# (left ones first, then right ones) and the target is the word in the middle.
# The window is driven by `context_size` rather than a literal 2, so it always
# agrees with the input width used for theta (2 * context_size * embed_dim).
# Tweets shorter than 2 * context_size + 1 tokens contribute no pairs.
data = []
for tokens in tweets:
    for j in range(context_size, len(tokens) - context_size):
        context = tokens[j - context_size:j] + tokens[j + 1:j + context_size + 1]
        target = tokens[j]
        data.append((context, target))
# Show a few pairs as a sanity check, then the total number of pairs.
print(data[:7])
len(data)

[(['اهنئ', 'احمد', 'دين', 'قيادي'], 'جمال'), (['احمد', 'جمال', 'قيادي', 'بحزب'], 'دين'), (['جمال', 'دين', 'بحزب', 'مصر'], 'قيادي'), (['دين', 'قيادي', 'مصر', 'بمناسب'], 'بحزب'), (['قيادي', 'بحزب', 'بمناسب', 'صدر'], 'مصر'), (['بحزب', 'مصر', 'صدر', 'روايت'], 'بمناسب'), (['برادعي', 'يستقو', 'مرهاخر', 'يرسل'], 'بامريكا')]


35262

In [19]:
# Initialise one embed_dim-long vector per vocabulary entry with uniform
# samples from [0, 1).
# NOTE(review): no random seed is set, so these values differ on every run.
embeddings =  np.random.random_sample((vocab_size, embed_dim))
# Show the first two rows as a sanity check, then the number of rows.
print(embeddings[:2])
len(embeddings)

[[0.92131889 0.20200601 0.87526611 0.57946518 0.30870287 0.37040282
  0.08550405 0.35468319 0.28631841 0.92645871]
 [0.43604782 0.05923583 0.5999066  0.18521912 0.94554717 0.84428838
  0.42003081 0.32195936 0.28631063 0.26762193]]


18283

In [20]:
def linear(m, theta):
    """Bias-free linear layer: return the matrix product of ``m`` and ``theta``."""
    return m.dot(theta)

def log_softmax(x):
    """Return the log-softmax of ``x`` along its last axis.

    Uses the identity ``log_softmax(x) = (x - max) - log(sum(exp(x - max)))``
    so that very small probabilities come out as large negative numbers
    instead of ``log(0) = -inf``. Normalisation is per row (last axis), which
    matches the softmax used for the gradient; for a single-row input this is
    the same as normalising over the whole array.

    Parameters
    ----------
    x : numpy array of scores; the last axis indexes the classes.

    Returns
    -------
    numpy array of the same shape as ``x`` holding log-probabilities.
    """
    # Shifting by the row maximum keeps exp() from overflowing.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

def NLLLoss(logs, targets):
    """Mean negative log-likelihood of the target classes.

    ``logs`` holds one row of log-probabilities per sample and ``targets``
    the class index to read from each row; the result is minus the average
    of the selected entries.
    """
    row_ids = np.arange(len(targets))
    picked = logs[row_ids, targets]
    return -picked.sum() / len(picked)

def log_softmax_crossentropy_with_logits(logits,target):
    """Gradient of the mean softmax cross-entropy loss with respect to ``logits``.

    Parameters
    ----------
    logits : 2-D numpy array, one row of raw scores per sample.
    target : integer class index (or array of indices) for each row.

    Returns
    -------
    Array shaped like ``logits`` equal to ``(softmax(logits) - one_hot(target))``
    divided by the number of rows.
    """
    one_hot = np.zeros_like(logits)
    one_hot[np.arange(len(logits)), target] = 1

    # Subtract the row maximum before exponentiating: softmax is unchanged by
    # the shift, but exp() can no longer overflow to inf (which gave inf/inf = nan).
    exps = np.exp(logits - logits.max(axis=-1, keepdims=True))
    softmax = exps / exps.sum(axis=-1, keepdims=True)

    return (softmax - one_hot) / logits.shape[0]

def forward(context_idxs, theta):
    """Run one forward pass for a single training example.

    Looks up the rows of the module-level ``embeddings`` matrix selected by
    ``context_idxs``, flattens them into one row vector, projects it through
    ``theta`` and applies log-softmax.

    Returns the tuple ``(flattened embeddings, logits, log-probabilities)``.
    """
    hidden = embeddings[context_idxs].reshape(1, -1)
    logits = linear(hidden, theta)
    return hidden, logits, log_softmax(logits)

def backward(preds, theta, target_idxs):
    """Return the loss gradient with respect to ``theta``.

    ``preds`` is the 3-tuple produced by ``forward``; only its first two
    entries (layer input and logits) are used. ``theta`` is accepted but
    not read.
    """
    hidden, logits, _ = preds
    return hidden.T.dot(log_softmax_crossentropy_with_logits(logits, target_idxs))

def optimize(theta, grad, lr=0.03):
    """Take one gradient-descent step on ``theta``.

    The update is applied with ``-=`` (in place for numpy arrays) and the
    updated ``theta`` is returned.
    """
    step = grad * lr
    theta -= step
    return theta

In [21]:
# Output-layer weights, drawn uniformly from [-1, 1): one row per component of
# the flattened context embeddings (2 * context_size words, embed_dim values
# each) and one column per vocabulary word.
theta = np.random.uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))

In [22]:
# Train the hand-written CBOW model with plain SGD, one (context, target) pair
# at a time, and record the per-example losses of every epoch.
#
# Both parameter sets are updated: the output weights `theta` AND the rows of
# `embeddings` belonging to the context words. Without the second update the
# embedding matrix would keep its random initial values and the file written
# from it afterwards would contain no learned information.
epoch_losses = {}
learning_rate = 0.03

for epoch in range(30):

    losses = []

    for context, target in data:
        context_idxs = np.array([word_to_ix[w] for w in context])
        preds = forward(context_idxs, theta)

        target_idxs = np.array([word_to_ix[target]])
        loss = NLLLoss(preds[-1], target_idxs)

        losses.append(loss)

        # Gradient w.r.t. the flattened context embeddings. It must be computed
        # before optimize() runs, because optimize() modifies theta in place.
        dlogits = log_softmax_crossentropy_with_logits(preds[1], target_idxs)
        dhidden = dlogits.dot(theta.T)

        grad = backward(preds, theta, target_idxs)
        theta = optimize(theta, grad, lr=learning_rate)

        # One gradient row per context word. ufunc.at accumulates correctly
        # when the same word occurs more than once in the context, whereas
        # `embeddings[idx] -= ...` would apply only one of the updates.
        np.subtract.at(
            embeddings,
            context_idxs,
            learning_rate * dhidden.reshape(len(context_idxs), -1),
        )

    epoch_losses[epoch] = losses

In [29]:
# Write the embedding matrix as plain text, one row per vocabulary index
# (row i belongs to ix_to_word[i]).
# NOTE(review): the index -> word mapping itself is not written by this cell.
np.savetxt("Word Embeddings/embeddings.txt", embeddings)

In [47]:
# Train gensim's Word2Vec on the tokenised tweets: 50-dimensional vectors,
# a window of 2, and only words seen at least twice. `sg` is not passed, so
# gensim's default training algorithm (CBOW) is used.
CBoW = Word2Vec(sentences=tweets, min_count=2, window=2, vector_size=50)

In [48]:
# Map every word in the CBoW vocabulary to its vector as a plain Python list.
CBoW_dict = {word: CBoW.wv[word].tolist() for word in CBoW.wv.key_to_index}

In [54]:
# Number of words kept in the CBoW model's vocabulary.
len(CBoW.wv.key_to_index)

{'في': [-0.04620405286550522, 0.019379625096917152, -0.002160284435376525, 0.025486275553703308, -0.02310458943247795, -0.10414595901966095, 0.10601409524679184, 0.15778449177742004, -0.1254119873046875, -0.0462932325899601, 0.007496371865272522, -0.10605614632368088, -0.046377282589673996, 0.060796741396188736, -0.06623806059360504, -0.0089451614767313, 0.03704259172081947, -0.009101580828428268, -0.10597638040781021, -0.07713925838470459, 0.033455464988946915, 0.09876830130815506, 0.15924811363220215, -0.05050913989543915, 0.09825316071510315, 0.034055445343256, -0.06988176703453064, -0.0006449068896472454, -0.14034615457057953, -0.01611592434346676, 0.04270974174141884, -0.01669662445783615, -0.032750971615314484, 0.04232356697320938, -0.045849431306123734, 0.06689079850912094, 0.07891196757555008, 0.03231113776564598, 0.06136718764901161, -0.08011683821678162, 0.062217168509960175, -0.02289152517914772, -0.03745679184794426, 0.006214838940650225, 0.15706923604011536, 0.034161422401

7020

In [50]:
# Persist the trained CBoW model; the file name encodes vector_size=50, window=2, min_count=2.
CBoW.save("Word Embeddings/cbow_50_2_2.model")

In [51]:
# Store the word -> vector dictionary with the project's write_dict helper
# (the on-disk format is defined in helper_fns, not visible here).
write_dict("Word Embeddings/cbow-dict_50_2_2.txt", CBoW_dict)

### Skip-Gram

In [35]:
# Train a second Word2Vec model on the same tokenised tweets, this time with
# sg=1 (skip-gram): 50-dimensional vectors, a window of 2, words seen at least
# three times, vocabulary sorted by frequency.
SkipGram = Word2Vec(
    sentences=tweets,
    vector_size=50,
    window=2,
    min_count=3,
    sg=1,
    sorted_vocab=1,
)

In [36]:
# Map every word in the SkipGram vocabulary to its vector as a plain Python list.
embeddings_dict = {word: SkipGram.wv[word].tolist() for word in SkipGram.wv.key_to_index}

In [37]:
# freq = {}
# for i, word in enumerate(nltk.tokenize.wordpunct_tokenize(" ".join(df_train['tweet'].to_numpy().flatten()))):
#     try:
#         freq[word] += 1
#     except:
#         freq[word] = 1
# sorted_freq = {key: val for key, val in sorted(freq.items(), key = lambda ele: ele[1], reverse=1)}

In [38]:
# Number of words kept in the SkipGram model's vocabulary.
len(SkipGram.wv.key_to_index)

4349

In [39]:
# Build the project's Sequencer from the tweets and the SkipGram word vectors.
# NOTE(review): 1000 is presumably the fixed output length used by padder() -- confirm in Sequencer.
sequencer = Sequencer(df_train['tweet'], embeddings_dict, 1000)

In [40]:
# Show the first tweet (row 0, column 0) used in the sanity checks below.
df_train.iloc[0,0]

'اهنئ احمد جمال دين قيادي بحزب مصر بمناسب صدر روايت'

In [41]:
# Sanity check: count how many of the first 50 values of the first tweet's
# vector equal the SkipGram vector of its first word; 50 means all of them match.
(sequencer.text_to_vec(df_train.iloc[0,0])[0:50] == SkipGram.wv["اهنئ"]).sum()

50

In [42]:
# Shape of the un-padded vector produced for the first tweet.
(sequencer.text_to_vec(df_train.iloc[0,0])).shape

(400,)

In [43]:
# Shape of the same vector after it has been passed through the sequencer's padder.
(sequencer.padder(sequencer.text_to_vec(df_train.iloc[0,0]))).shape

(1000,)

In [44]:
# Persist the trained SkipGram model; the file name encodes min_count=3, window=2, vector_size=50.
SkipGram.save("Word Embeddings/sg_3_2_50.model")

In [45]:
# Store the SkipGram word -> vector dictionary with the project's write_dict helper
# (the on-disk format is defined in helper_fns, not visible here).
write_dict("Word Embeddings/dict_3_2_50.txt", embeddings_dict)

## GloVe