In [187]:
#For Colab Env
!pip install torchtext
!pip install torchdata

[0m

In [188]:
#Ref: https://github.com/OlgaChernytska/word2vec-pytorch
import urllib
import os
import collections
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
import torchtext
from torch.utils.data import Dataset, DataLoader



### Preparing Data

In [189]:
os.makedirs('data', exist_ok=True)
dataset_train = torchtext.datasets.WikiText2(root='./data')
tokenizer = torchtext.data.utils.get_tokenizer('basic_english', language="en")
dataset_train = list(dataset_train[0])

In [190]:
tokenized_data = []
counter = collections.Counter()

print('Start Tokenizing...')
for line in tqdm(dataset_train):
    tokens = tokenizer(line)
    tokenized_data.append(tokens)

print('Making Vocab...')
for line in tqdm(tokenized_data):
    counter.update(line)

vocab = torchtext.vocab.vocab(counter, min_freq=50, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


Start Tokenizing...


100%|██████████| 36718/36718 [00:01<00:00, 33441.01it/s]


Making Vocab...


100%|██████████| 36718/36718 [00:00<00:00, 134035.79it/s]


In [191]:
class CBOWDataset(Dataset):
    def __init__(self, tokenized_data, window_size = 2, max_seq = 256):
        self.x = []
        self.y = []
        
        for tokens in tqdm(tokenized_data):
            token_ids = [vocab[token] for token in tokens]
            token_ids = token_ids[:max_seq]
            for i, token_id in enumerate(token_ids):
                if i-window_size >= 0 and i+window_size < len(token_ids):
                    self.x.append(token_ids[i-window_size:i] + token_ids[i+1:i+window_size+1])
                    self.y.append(token_id)
        self.x = torch.LongTensor(self.x)
        self.y = torch.LongTensor(self.y)
        
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx] , self.y[idx]

In [192]:
print('Making CBOW Dataset')
cbow_dataset = CBOWDataset(tokenized_data, window_size = 4)


Making CBOW Dataset


100%|██████████| 36718/36718 [00:05<00:00, 6492.31it/s] 


In [193]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.linear = nn.Linear(emb_size, vocab_size)
    
    def forward(self, x):
        emb = torch.sum(self.embedding(x), dim=1)
        output = self.linear(emb)
        return output


        

In [194]:
cbow_model = CBOW(vocab_size=len(vocab), emb_size=300)

In [195]:
batch_size=256
learning_rate = 0.025
num_epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [196]:
cbow_dataloader = DataLoader(cbow_dataset, batch_size=batch_size, shuffle=True)

In [197]:
cbow_model.train()
cbow_model = cbow_model.to(device)
optimizer = torch.optim.SGD(cbow_model.parameters(), lr=learning_rate)
lr_lambda = lambda epoch: (num_epochs - epoch) / num_epochs
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
criterion  = nn.CrossEntropyLoss()

In [198]:

for epoch in range(num_epochs):
    for x,y in tqdm(cbow_dataloader):
        x, y = x.to(device), y.to(device)
        output = cbow_model(x)

        optimizer.zero_grad()
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    lr_scheduler.step()
    if (epoch) % 1 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}")
print("Finish Training!")

100%|██████████| 7148/7148 [00:16<00:00, 421.99it/s]


Epoch: 0, Loss: 6.1642913818359375


100%|██████████| 7148/7148 [00:17<00:00, 407.63it/s]


Epoch: 1, Loss: 5.088748931884766


100%|██████████| 7148/7148 [00:17<00:00, 400.06it/s]


Epoch: 2, Loss: 6.064475059509277


100%|██████████| 7148/7148 [00:17<00:00, 407.52it/s]


Epoch: 3, Loss: 4.991163730621338


100%|██████████| 7148/7148 [00:18<00:00, 393.03it/s]

Epoch: 4, Loss: 5.016488075256348
Finish Training!





In [199]:
embeddings = list(cbow_model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()
print(embeddings.shape)

(4099, 300)


In [200]:
#Ref: Finding Similar Words: https://github.com/OlgaChernytska/word2vec-pytorch/blob/main/notebooks/Inference.ipynb

def get_top_similar(word, top_n = 10):
    word_id = vocab[word]
    word_vec = embeddings[word_id]
    word_vec = word_vec.reshape(len(word_vec), 1)
    dists = np.matmul(embeddings, word_vec).flatten()
    top_n_ids = np.argsort(-dists)[1: top_n + 1]

    top_n_dict = {}
    for top_n_id in top_n_ids:
        top_n_word = vocab.lookup_token(top_n_id)
        top_n_dict[top_n_word] = dists[top_n_id]
    return top_n_dict

    



In [205]:
get_top_similar('england')

{'way': 61.01765,
 'principal': 58.88688,
 'record': 51.147896,
 'flight': 50.603806,
 'transportation': 49.83083,
 'are': 49.626415,
 'nominated': 49.205944,
 'rebuilt': 47.76376,
 'elements': 46.621918,
 'hall': 46.60324}

In [204]:
get_top_similar('father')

{'personality': 59.5606,
 'however': 58.291,
 'expressed': 57.54368,
 'government': 56.87165,
 'volume': 55.61387,
 'contemporary': 55.53237,
 'numerous': 54.504128,
 'pitch': 54.057922,
 '80': 52.146004,
 'railroad': 51.321465}

In [203]:
#Ref: https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0