### Pytorch SkipGram

In [25]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
!pip install -U torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
!pip install portalocker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
import torch
import torch.nn as nn
from functools import partial
from torch.utils.data import DataLoader
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2, WikiText103
import numpy as np

import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import math


import pandas as pd
from torchtext.vocab import vocab

In [29]:
WINDOW_SIZE = 5
BATCH_SIZE = 200
EMB_DIM = 200
EPOCHS = 10

In [30]:
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [31]:
!wget https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz -O wikitext-2.tar.gz
!tar -xvzf wikitext-2.tar.gz

--2023-05-16 12:05:07--  https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.225.80, 54.231.228.72, 52.217.234.32, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.225.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4070055 (3.9M) [application/x-tar]
Saving to: ‘wikitext-2.tar.gz’


2023-05-16 12:05:10 (2.49 MB/s) - ‘wikitext-2.tar.gz’ saved [4070055/4070055]

wikitext-2/
wikitext-2/train.csv
wikitext-2/test.csv


In [32]:
def load_data(filepath):
    with open(filepath) as f:
      return f.readlines()

In [33]:
train = load_data("wikitext-2/train.csv")
test = load_data("wikitext-2/test.csv")
data = train + test

In [34]:
tokenizer = get_tokenizer("basic_english", language="en")

In [35]:
def yield_tokens(data_obj):
    for text in data_obj:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(data), specials=["<unk>"], min_freq=20)
vocab.set_default_index(vocab["<unk>"])

In [36]:
len(vocab)

8627

In [37]:
vocab['asdasdasd']

0

In [38]:
text_pipeline = lambda x: vocab(tokenizer(x))

In [39]:
def build_contexts(row, window_size=3):
    contexts = []
    for i in range(len(row)):
      central_word = row[i]
      context = [row[i + delta] for delta in range(-window_size, window_size + 1) 
                       if delta != 0 and i + delta >= 0 and i + delta < len(row)]

      for c_w in context:
        contexts.append((central_word, c_w)) 
    return contexts

In [40]:
class Word2VecDataset(Dataset):
    def __init__(self, data, vocab, wsize=3):
        self.vocab_size = len(vocab)
        self.data = [text_pipeline(item) for item in data]
        self.data = [item for text in self.data for item in text]
        self.data = build_contexts(self.data, window_size=wsize)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [41]:
dataset = Word2VecDataset(data, vocab)
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [42]:
central_word, context = dataset[0]
central_word, context 

(9, 435)

In [43]:
class SkipGram_Model(nn.Module):
    def __init__(self, vocab_size: int):
        super(SkipGram_Model, self).__init__()
        self.embeddings = nn.Embedding(
           num_embeddings = vocab_size,
           embedding_dim = EMB_DIM
        )
        self.linear = nn.Linear(
            out_features = vocab_size,
            in_features = EMB_DIM
        )

    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = self.linear(x)
        return x

In [44]:
vocab_size = len(vocab)
model = SkipGram_Model(vocab_size)
if torch.cuda.is_available():
  model = model.cuda()

In [45]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [46]:
from tqdm import tqdm

In [47]:
def train_model(dataloader, model, optimizer, criterion):
  epoch_loss=[]      
  for word, context in tqdm(train_dataloader):
    optimizer.zero_grad() 
    prediction = model(word.to(device))
    lossValue = criterion(prediction, context.to(device))  
    lossValue.backward()      
    epoch_loss.append(lossValue.detach().cpu().numpy())
    optimizer.step()
  return np.mean(epoch_loss) 

In [48]:
for epoch in range(EPOCHS):
    loss = train_model(train_dataloader, model, optimizer, criterion)
    if epoch % 2 == 0:
      print(f'Epoch {epoch}: train loss {loss}')


100%|██████████| 53072/53072 [02:57<00:00, 298.24it/s]


Epoch 0: train loss 6.113578796386719


100%|██████████| 53072/53072 [02:56<00:00, 300.29it/s]
100%|██████████| 53072/53072 [02:59<00:00, 296.17it/s]


Epoch 2: train loss 5.913295269012451


100%|██████████| 53072/53072 [02:56<00:00, 300.13it/s]
100%|██████████| 53072/53072 [02:58<00:00, 297.52it/s]


Epoch 4: train loss 5.856536865234375


100%|██████████| 53072/53072 [02:57<00:00, 298.88it/s]
100%|██████████| 53072/53072 [02:57<00:00, 298.24it/s]


Epoch 6: train loss 5.825161457061768


100%|██████████| 53072/53072 [02:58<00:00, 297.64it/s]
100%|██████████| 53072/53072 [02:58<00:00, 297.39it/s]


Epoch 8: train loss 5.8068528175354


100%|██████████| 53072/53072 [02:59<00:00, 296.48it/s]


In [49]:
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(8627, 256)

In [50]:
def get_top_similar(word, n=10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    top_ids = np.argsort(-dists)[1 : n + 1]

    top_dict = {}
    for sim_word_id in top_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        top_dict[sim_word] = dists[sim_word_id]
    return top_dict

In [51]:
get_top_similar('hero')

{'guitar': 0.6829786,
 'downloadable': 0.39533365,
 'drums': 0.37854874,
 'playstation': 0.37289682,
 'dj': 0.35175693,
 'barbarian': 0.34761235,
 'marketed': 0.34507954,
 'activision': 0.3440398,
 'gameplay': 0.3438322,
 'arcade': 0.34281075}