### Pytorch SkipGram

In [3]:
!pip install torchdata



In [1]:
!pip install -U torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=2.3.0 (from torchtext)
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.3.0->to

In [2]:
!pip install portalocker

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [4]:
import torch
import torch.nn as nn
from functools import partial
from torch.utils.data import DataLoader
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2, WikiText103
import numpy as np

import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import math


import pandas as pd
from torchtext.vocab import vocab



In [5]:
WINDOW_SIZE = 5
BATCH_SIZE = 64
EMB_DIM = 100
EPOCHS = 3

In [6]:
if torch.cuda.is_available():
 device = torch.device('cuda')
else:
 device = torch.device('cpu')

In [7]:
!wget https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz -O wikitext-2.tar.gz
!tar -xvzf wikitext-2.tar.gz

--2024-05-12 08:28:29--  https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.18.150, 52.217.136.224, 16.182.39.192, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.18.150|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4070055 (3.9M) [application/x-tar]
Saving to: ‘wikitext-2.tar.gz’


2024-05-12 08:28:29 (20.9 MB/s) - ‘wikitext-2.tar.gz’ saved [4070055/4070055]

wikitext-2/
wikitext-2/train.csv
wikitext-2/test.csv


In [8]:
def load_data(filepath):
    with open(filepath) as f:
      return f.readlines()

In [9]:
train = load_data("wikitext-2/train.csv")
test = load_data("wikitext-2/test.csv")
data = train + test

In [10]:
tokenizer = get_tokenizer("basic_english", language="en")

In [11]:
def yield_tokens(data_obj):
    for text in data_obj:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(data), specials=["<unk>"], min_freq=20)
vocab.set_default_index(vocab["<unk>"])

In [12]:
len(vocab)

8627

In [13]:
vocab['hero']

587

In [14]:
text_pipeline = lambda x: vocab(tokenizer(x))

In [15]:
def build_contexts(row, window_size=3):
    contexts = []
    for i in range(len(row)):
      central_word = row[i]
      context = [row[i + delta] for delta in range(-window_size, window_size + 1)
                       if delta != 0 and i + delta >= 0 and i + delta < len(row)]

      for c_w in context:
        contexts.append((central_word, c_w))
    return contexts

In [16]:
class Word2VecDataset(Dataset):
    def __init__(self, data, vocab, wsize=3):
        self.vocab_size = len(vocab)
        self.data = [text_pipeline(item) for item in data]
        self.data = [item for text in self.data for item in text]
        self.data = build_contexts(self.data, window_size=wsize)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [17]:
dataset = Word2VecDataset(data, vocab)
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [18]:
central_word, context = dataset[0]
central_word, context

(9, 435)

In [19]:
class SkipGram_Model(nn.Module):
    def __init__(self, vocab_size: int):
        super(SkipGram_Model, self).__init__()
        self.embeddings = nn.Embedding(
           num_embeddings=vocab_size,
           embedding_dim=EMB_DIM
        )
        self.linear = nn.Linear(
            in_features=EMB_DIM,
            out_features=vocab_size
        )

    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = self.linear(x)
        return x

In [20]:
vocab_size = len(vocab)
model =  SkipGram_Model(vocab_size)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [22]:
def train_model(dataloader, model, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(dataloader):
       optimizer.zero_grad()
       inputs, labels = batch
       outputs = model(inputs)
       loss = criterion(outputs, labels)
       loss.backward()
       optimizer.step()

       epoch_loss += loss.item()

    return epoch_loss

In [23]:

for epoch in range(EPOCHS):
    epoch_loss = train_model(train_dataloader, model, optimizer, criterion)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

Epoch 1, Loss: 1589614.1384687424
Epoch 2, Loss: 1439547.3943657875
Epoch 3, Loss: 1401056.7418708801


In [24]:
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(8627, 100)

In [25]:
def get_top_similar(word, n=10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    top_ids = np.argsort(-dists)[1 : n + 1]

    top_dict = {}
    for sim_word_id in top_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        top_dict[sim_word] = dists[sim_word_id]
    return top_dict

In [26]:
print(get_top_similar('hero'))

{'limitations': 0.38720885, 'evening': 0.3556161, 'published': 0.3412636, 'violence': 0.34070134, 'laid': 0.33727676, 'submerged': 0.32469878, 'remember': 0.3139036, 'mariah': 0.30127, 'shear': 0.2883039, 'harold': 0.28707844}
