In [1]:
#For Colab Env
#!pip install torchtext
#!pip install torchdata

In [2]:
import urllib
import os
import collections
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
import torchtext
from torch.utils.data import Dataset, DataLoader



### Preparing Data

In [3]:

os.makedirs('data', exist_ok=True)
dataset_train = torchtext.datasets.WikiText2(root='./data')
tokenizer = torchtext.data.utils.get_tokenizer('basic_english', language="en")
dataset_train = list(dataset_train[0])

In [4]:
tokenized_data = []
counter = collections.Counter()

print('Start Tokenizing...')
for line in tqdm(dataset_train):
    tokens = tokenizer(line)
    tokenized_data.append(tokens)

print('Making Vocab...')
for line in tqdm(tokenized_data):
    counter.update(line)

vocab = torchtext.vocab.vocab(counter, min_freq=50, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

Start Tokenizing...


100%|██████████| 36718/36718 [00:01<00:00, 25483.61it/s]


Making Vocab...


100%|██████████| 36718/36718 [00:00<00:00, 79792.12it/s] 


In [5]:
class CBOWDataset(Dataset):
    def __init__(self, tokenized_data, window_size = 2, max_seq = 256):
        self.x = []
        self.y = []
        
        for tokens in tqdm(tokenized_data):
            token_ids = [vocab[token] for token in tokens]
            token_ids = token_ids[:max_seq]
            for i, token_id in enumerate(token_ids):
                if i-window_size >= 0 and i+window_size < len(token_ids):
                    self.x.append(token_ids[i-window_size:i] + token_ids[i+1:i+window_size+1])
                    self.y.append(token_id)
        self.x = torch.LongTensor(self.x)
        self.y = torch.LongTensor(self.y)
        
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx] , self.y[idx]

In [6]:
print('Making CBOW Dataset')
cbow_dataset = CBOWDataset(tokenized_data, window_size = 4)

Making CBOW Dataset


100%|██████████| 36718/36718 [00:06<00:00, 5852.89it/s]


In [7]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.linear = nn.Linear(emb_size, vocab_size)
    
    def forward(self, x):
        emb = torch.sum(self.embedding(x), dim=1)
        output = self.linear(emb)
        return output

In [8]:

cbow_model = CBOW(vocab_size=len(vocab), emb_size=300)

In [9]:
batch_size=256
learning_rate = 0.025
num_epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:

cbow_dataloader = DataLoader(cbow_dataset, batch_size=batch_size, shuffle=True)

In [11]:
cbow_model.train()
cbow_model = cbow_model.to(device)
optimizer = torch.optim.SGD(cbow_model.parameters(), lr=learning_rate)
lr_lambda = lambda epoch: (num_epochs - epoch) / num_epochs
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
criterion  = nn.CrossEntropyLoss()

In [12]:

for epoch in range(num_epochs):
    for x,y in tqdm(cbow_dataloader):
        x, y = x.to(device), y.to(device)
        output = cbow_model(x)

        optimizer.zero_grad()
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    lr_scheduler.step()
    if (epoch) % 1 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}")
print("Finish Training!")

100%|██████████| 7148/7148 [00:23<00:00, 303.08it/s]


Epoch: 0, Loss: 6.0054779052734375


100%|██████████| 7148/7148 [00:24<00:00, 295.40it/s]


Epoch: 1, Loss: 5.524990558624268


100%|██████████| 7148/7148 [00:24<00:00, 293.70it/s]


Epoch: 2, Loss: 4.990777492523193


100%|██████████| 7148/7148 [00:24<00:00, 288.61it/s]


Epoch: 3, Loss: 4.591667175292969


100%|██████████| 7148/7148 [00:24<00:00, 296.55it/s]

Epoch: 4, Loss: 5.301662921905518
Finish Training!





In [13]:
embeddings = list(cbow_model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()
print(embeddings.shape)

(4099, 300)


In [14]:
#Ref: Finding Similar Words: https://github.com/OlgaChernytska/word2vec-pytorch/blob/main/notebooks/Inference.ipynb

def get_top_similar(word, top_n = 10):
    word_id = vocab[word]
    word_vec = embeddings[word_id]
    word_vec = word_vec.reshape(len(word_vec), 1)
    dists = np.matmul(embeddings, word_vec).flatten()
    top_n_ids = np.argsort(-dists)[1: top_n + 1]

    top_n_dict = {}
    for top_n_id in top_n_ids:
        top_n_word = vocab.lookup_token(top_n_id)
        top_n_dict[top_n_word] = dists[top_n_id]
    return top_n_dict

    



In [15]:
get_top_similar('france')

{'revolutionary': 70.55626,
 'soon': 70.10284,
 '46': 61.53412,
 'quality': 59.146877,
 'founded': 58.093346,
 'realized': 57.03121,
 'cross': 53.89807,
 'heard': 53.49968,
 '×': 53.399788,
 'existing': 53.20304}

In [16]:
get_top_similar('mother')

{'step': 61.87981,
 'greater': 58.76099,
 'restored': 54.811356,
 'wheeler': 53.57195,
 'executive': 53.45034,
 '1950': 50.931454,
 'craft': 50.80931,
 'sports': 50.61734,
 'shot': 49.490524,
 'spores': 49.317406}

In [17]:
get_top_similar('king')

{'instance': 71.228645,
 '“': 56.584,
 'enterprise': 56.446896,
 'aniston': 56.24984,
 'twin': 53.923916,
 'premiered': 53.84211,
 'producers': 53.588257,
 'distribution': 52.932663,
 'yellow': 52.16742,
 'estate': 51.532257}

In [18]:
#Ref: https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0