In [18]:
import os
import json

import torch
import numpy as np
import pandas as pd

from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm

# Setup

In [231]:
data_dir = os.path.join(os.curdir, "data")
vocab_path = os.path.join(data_dir, "word-level-vocab.json")
dataset_path = os.path.join(data_dir, "clean-tweets.tsv")

with open(vocab_path, "rt") as f:
    vocab = json.load(f)
    
dataset = pd.read_csv(filepath_or_buffer=dataset_path, sep="\t")

In [232]:
tweets = dataset["clean_text"].tolist()

In [233]:
OOV_TOKEN = "[OOV]"
PAD_TOKEN = "[PAD]"

OOV_INDEX = vocab.get(OOV_TOKEN)

print(f"Vocab Size = {len(vocab)}")

Vocab Size = 10998


In [234]:
tokenized_tweets = [[vocab.get(token) for token in tweet.split(" ") if token in vocab] for tweet in tweets]

In [235]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Skip Gram with Negative Sampling

In [236]:
tokenized_tweets

[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 31, 31, 31, 31],
 [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
 [49, 50, 51, 52, 53, 54, 55, 56, 52, 57, 58, 59, 52, 60, 61, 62],
 [63, 64, 65, 66, 67, 68, 69, 70],
 [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82],
 [83, 84, 85, 86, 87, 88, 89],
 [90, 91, 92],
 [93,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  61,
  104,
  105,
  106,
  107,
  108,
  109],
 [57,
  110,
  111,
  112,
  113,
  61,
  75,
  114,
  115,
  116,
  117,
  61,
  118,
  119,
  120,
  121,
  122],
 [123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139],
 [140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153],
 [154, 155, 156, 157, 158, 159, 160, 161, 23, 161],
 [162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175],
 [176,
  177,
  178,
  179,
  180

In [266]:
WINDOW_SIZE = 4
NEGATIVE_SAMPLES_COUNT = 5

In [267]:
samples = []
actual_tokens = sorted(vocab.values())[2:]

for tweet in tqdm(tokenized_tweets):
    for index in range(WINDOW_SIZE, len(tweet) - WINDOW_SIZE):
        target = tweet[index]
        context = tweet[index - WINDOW_SIZE: index] + tweet[index + 1: index + WINDOW_SIZE + 1]
        samples.extend([(target, c, 1) for c in context])
        
        negative_samples = np.random.choice(a=actual_tokens, size=WINDOW_SIZE* 2, replace=False)
        samples.extend([(target, n, 0) for n in negative_samples])

samples = np.random.permutation(samples)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [268]:
class Word2VecNgram(nn.Module):
    def __init__(self, embedding_size: int, vocab_size: int, padding_idx: int):
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        
        self.target_embedding = nn.Embedding(num_embeddings=self.vocab_size, 
                                      embedding_dim=self.embedding_size, 
                                      padding_idx=self.padding_idx, max_norm=1)
        self.context_embedding = nn.Embedding(num_embeddings=self.vocab_size, 
                                      embedding_dim=self.embedding_size, 
                                      padding_idx=self.padding_idx, max_norm=1)
        
    
    def forward(self, target, context):
        target_embedding = self.target_embedding(target)
        context_embedding = self.context_embedding(context)
        
        output = torch.sum(target_embedding * context_embedding, -1).unsqueeze(1)
        
        return torch.sigmoid(output)

In [310]:
model = Word2VecNgram(embedding_size=256, vocab_size=len(vocab), padding_idx=vocab.get(PAD_TOKEN)).to(device)


In [311]:
dataset = torch.tensor(samples)
dataset = TensorDataset(dataset[:, 0], dataset[:, 1], dataset[:, 2])
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

In [312]:
critertion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

epochs = 5

In [313]:
for epoch in range(epochs):
    epoch_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        target, context, label = batch

        target, context, label = target.to(device), context.to(device), label.to(device)

        prediction = model(target, context)

        loss = critertion(prediction, label.unsqueeze(1).float())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(epoch_loss / len(dataloader))



0.6496591276021713
0.565964875004238
0.5492774272701857
0.5453381000473527
0.543660771795809


In [314]:
vocab

{'اومن': 2,
 'بان': 3,
 'الانسان': 4,
 'ينطفي': 5,
 'جماله': 6,
 'ابتعاد': 7,
 'يحب': 8,
 'بريق': 9,
 'العيون': 10,
 'يختفي': 11,
 'فيصبح': 12,
 'ذابلا': 13,
 'منطفيا': 14,
 'يتحول': 15,
 'ربيعه': 16,
 'خريف': 17,
 'الذاكره': 18,
 'عندما': 19,
 'اعتقد': 20,
 'كريستيانو': 21,
 'انه': 22,
 'افضل': 23,
 'لاعب': 24,
 'العالم': 25,
 'كاكا': 26,
 'ميسي': 27,
 'ثالثا': 28,
 'حدث': 29,
 'العكس': 30,
 '😂': 31,
 'نخلو': 32,
 'ضغوطات': 33,
 'الحياه': 34,
 'فنحن': 35,
 'نعيش': 36,
 'ارض': 37,
 'اعدت': 38,
 'للبلاء': 39,
 'ولم': 40,
 'يسلم': 41,
 'الانبياء': 42,
 'توكل': 43,
 'دايما': 44,
 'وكن': 45,
 'مطمينا': 46,
 'وواثقا': 47,
 'بالله': 48,
 'بتوصل': 49,
 'عالبيت': 50,
 'بنط': 51,
 'بقلك': 52,
 'جيت': 53,
 'بتقعد': 54,
 'لتتحدث': 55,
 'معو': 56,
 'شو': 57,
 'بتقوم': 58,
 'لتمشي': 59,
 'ناسي': 60,
 'شي': 61,
 '🤔': 62,
 'نصمت': 63,
 'لتسير': 64,
 'حياتنا': 65,
 'يرام': 66,
 'فالناس': 67,
 'تعد': 68,
 'كانت': 69,
 'نقيه': 70,
 'صاحب': 71,
 'السمو': 72,
 'الملكي': 73,
 'الامير': 74,
 'الدكتور': 75,


In [315]:
embed = nn.Embedding.from_pretrained(embeddings=model.target_embedding.weight).cpu()

In [316]:
torch.argmax(torch.matmul(embed.weight , embed.weight[168].unsqueeze(1))) 

tensor(2527)

In [317]:
itos = {value: key for key, value in vocab.items()}

In [366]:
[itos[index.item()] for index in  torch.topk(torch.nn.functional.cosine_similarity(embed.weight, embed.weight[6487]), k=10).indices] 

['النصر',
 'نري',
 'غدا',
 'بالموعد',
 'منذو',
 'زمان',
 'والمستوي',
 'بهذا',
 'الفوز',
 'مستوياته']

In [340]:
all_words = [word for tweet in tweets for word in tweet.split(" ") if word in vocab]

In [342]:
from collections import Counter

In [343]:
counter = Counter(all_words)

In [365]:
vocab["النصر"]

6487

In [345]:
counter.most_common()

[('😂', 267),
 ('ال', 145),
 ('اني', 113),
 ('️', 109),
 ('يوم', 108),
 ('انو', 106),
 ('قال', 105),
 ('شي', 104),
 ('اذا', 101),
 ('عندما', 100),
 ('❤', 100),
 ('الا', 100),
 ('الناس', 98),
 ('انه', 97),
 ('يكون', 96),
 ('كنت', 95),
 ('الحياه', 94),
 ('محمد', 94),
 ('حزب', 91),
 ('العالم', 90),
 ('سلمان', 90),
 ('شو', 89),
 ('يعني', 85),
 ('اي', 81),
 ('سعد', 81),
 ('الرييس', 81),
 ('وانا', 79),
 ('شيء', 78),
 ('رييس', 76),
 ('انك', 74),
 ('تكون', 74),
 ('خير', 74),
 ('السلام', 71),
 ('وان', 70),
 ('العراق', 69),
 ('يارب', 66),
 ('حدا', 65),
 ('رح', 65),
 ('يلي', 62),
 ('دوله', 62),
 ('لان', 61),
 ('وكل', 61),
 ('قلبي', 61),
 ('رب', 60),
 ('شخص', 59),
 ('وانت', 59),
 ('مره', 58),
 ('افضل', 57),
 ('جدا', 57),
 ('مو', 55),
 ('سوريا', 55),
 ('كلام', 54),
 ('ضد', 54),
 ('يقول', 54),
 ('علي', 54),
 ('ايران', 54),
 ('فريق', 53),
 ('حكومه', 53),
 ('اكبر', 52),
 ('عدم', 52),
 ('خلال', 51),
 ('فقط', 51),
 ('سنه', 51),
 ('بكل', 50),
 ('الشعب', 49),
 ('بالله', 48),
 ('الكل', 48),
 ('الخير', 47),
