In [0]:
# Mount Google Drive inside the Colab VM so files stored there can be read.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pickle
import numpy as np

In [0]:
# Read the corpus and split every line into whitespace-separated tokens
# (one token list per line of the file).
# The path is absolute and rooted at '/content/gdrive', the directory Drive is
# mounted on; the relative path 'drive/My Drive/...' used before points at a
# directory that the mount call never creates.
# The `with` block closes the file handle as soon as the lines are read.
with open("/content/gdrive/My Drive/Data.txt", "r") as corpus_file:
    corpus = corpus_file.readlines()
tokenized_corpus = [x.split() for x in corpus]

In [0]:
# Corpus-wide settings.
min_count = 20    # words seen fewer times than this receive no index
window_size = 2   # number of context words taken on each side of a centre word

# Raw frequency of every token in the corpus.
vocabulary = {}
for sentence in tokenized_corpus:
    for token in sentence:
        vocabulary[token] = vocabulary.get(token, 0) + 1

# Consecutive integer ids for the sufficiently frequent words, assigned in the
# order in which the words were first inserted into `vocabulary`.
frequent_words = [w for w, c in vocabulary.items() if c >= min_count]
word2idx = {w: i for i, w in enumerate(frequent_words)}

vocabulary_size = len(word2idx)

In [0]:
contexts = []
targets = []

# Build (context, target) training pairs sentence by sentence.
for sentence in tokenized_corpus:
    # Words rarer than min_count have no entry in word2idx; drop them here
    # instead of raising KeyError on the first rare word.
    indices = [word2idx[word] for word in sentence if word in word2idx]
    # Every position with a full window on both sides is treated as a centre word.
    for center_word_pos in range(window_size, len(indices) - window_size):
        targets.append([indices[center_word_pos]])
        window = indices[center_word_pos - window_size:center_word_pos + window_size + 1]
        # The centre word sits in the middle of the window; remove it to get the context.
        contexts.append(window[:window_size] + window[window_size + 1:])

# An explicit integer dtype keeps the tensor usable as class indices even when
# no training pair was produced (an empty list would otherwise become float).
targets = torch.tensor(targets, dtype=torch.long)
if torch.cuda.is_available(): targets = targets.cuda()

In [0]:
def input(word_idx, size=None):
    """Build a multi-hot vector marking the given vocabulary indices.

    NOTE: the name shadows Python's built-in ``input``; it is kept because the
    training loop calls the function under this name.

    Args:
        word_idx: an index, or a list of indices, to set to 1.0. Repeated
            indices still yield 1.0 at that position.
        size: length of the returned vector. Defaults to the module-level
            ``vocabulary_size``.

    Returns:
        A 1-D float tensor of length ``size`` holding 1.0 at ``word_idx`` and
        0.0 elsewhere, placed on the GPU when CUDA is available.
    """
    if size is None:
        size = vocabulary_size
    # Allocate directly on the target device instead of building the vector on
    # the CPU and copying it over on every call.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    x = torch.zeros(size, device=device)
    x[word_idx] = 1.0
    return x

In [0]:
def cosine(a, b, eps=1e-8):
    """Cosine similarity between two 1-D tensors of equal length.

    Args:
        a, b: 1-D tensors (``torch.dot`` rejects anything else).
        eps: lower bound applied to the product of the two norms, so that an
            all-zero vector yields a similarity of 0 instead of NaN (0 / 0).

    Returns:
        A 0-dim tensor holding ``a . b / max(|a| * |b|, eps)``.
    """
    denominator = torch.clamp(torch.norm(a) * torch.norm(b), min=eps)
    return torch.dot(a, b) / denominator

In [0]:
class Net(nn.Module):
    """Two bias-free linear layers followed by a log-softmax over the vocabulary.

    ``W1`` maps a vocabulary-sized input vector to ``embedding_dims`` values and
    ``W2`` maps those back to one score per vocabulary entry.
    """

    def __init__(self, embedding_dims, vocabulary_size):
        """Create the two projection layers.

        Args:
            embedding_dims: size of the hidden (embedding) layer.
            vocabulary_size: size of both the input vector and the output scores.
        """
        super().__init__()
        self.W1 = nn.Linear(vocabulary_size, embedding_dims, bias=False)
        self.W2 = nn.Linear(embedding_dims, vocabulary_size, bias=False)

    def forward(self, x):
        """Return log-probabilities over the vocabulary.

        Args:
            x: a single input vector of shape ``(vocabulary_size,)`` or a batch
                of shape ``(batch, vocabulary_size)``.

        Returns:
            A 2-D tensor of shape ``(batch, vocabulary_size)`` (``batch`` is 1
            for a single vector), as expected by ``F.nll_loss``.
        """
        z = self.W2(self.W1(x))
        # Normalise over the vocabulary axis (the last one). With dim=0 a batched
        # input would be normalised across samples instead of across words.
        log_probs = F.log_softmax(z, dim=-1)
        return log_probs.view(-1, z.shape[-1])

In [0]:
num_epochs = 20
learning_rate = 0.1
embedding_dims = 30
batch_size = 100  # number of samples whose gradients are averaged per optimiser step

model = Net(embedding_dims, vocabulary_size)
if torch.cuda.is_available(): model = model.cuda()

parameters = [p for p in model.parameters() if p.requires_grad]
optimiser = optim.SGD(parameters, lr=learning_rate)

# Mini-batch SGD implemented by gradient accumulation.
# Previously backward()/step() ran only on every 100th sample while the
# gradients were zeroed on every iteration, so 99% of the training pairs never
# influenced the weights. Now every sample contributes its gradient and the
# weights are updated once per `batch_size` samples.
for epo in range(num_epochs):
    loss_val = 0
    iterations = 0  # samples accumulated since the last optimiser step
    optimiser.zero_grad()

    for context, target in zip(contexts, targets):
        prediction = model(input(context))
        loss = F.nll_loss(prediction, target)
        loss_val += loss.item()

        # Scale so the accumulated gradient is the mean over the batch, which
        # keeps the step size comparable to a single-sample update.
        (loss / batch_size).backward()
        iterations += 1

        if iterations == batch_size:
            optimiser.step()
            optimiser.zero_grad()
            iterations = 0

    # Apply whatever is left over from an incomplete final batch.
    if iterations > 0:
        optimiser.step()
        optimiser.zero_grad()
        iterations = 0

    print("Loss at epoch %d = %.2f" % (epo + 1, loss_val))

In [0]:
# Persist the learned word vectors and the word -> index mapping.
# `vectors` was never defined before, so this cell raised NameError. The rows of
# W2.weight (shape: vocabulary_size x embedding_dims) are exported, i.e. the same
# matrix the notebook uses as `embedding` for the similarity queries.
vectors = model.W2.weight.detach().cpu().numpy()
np.save("vectors", vectors)  # np.save appends the ".npy" extension
with open("word2idx.pkl", "wb") as word2idx_file:
    pickle.dump(word2idx, word2idx_file)

In [0]:
# Rows of W2.weight (one per vocabulary index) serve as the word vectors.
# detach() returns a view that shares storage with the parameter but is not
# tracked by autograd, so similarity queries do not build computation graphs.
embedding = model.W2.weight.detach()

In [0]:
from heapq import nlargest

def most_similar(token):
    """Print the 10 indexed words whose vectors are most cosine-similar to ``token``.

    The query word itself is among the candidates, so it is normally listed too.

    Args:
        token: a word present in ``word2idx``.

    Raises:
        KeyError: if ``token`` has no entry in ``word2idx``.
    """
    # The query vector does not change between candidates; look it up once.
    a = embedding[word2idx[token]]

    score = {}
    # Iterate over word2idx rather than vocabulary: vocabulary also contains
    # words below min_count, which have no index and would raise KeyError.
    for word, idx in word2idx.items():
        score[word] = cosine(a, embedding[idx]).item()

    print(nlargest(10, score, key=score.get))

def closest(a):
    """Print the 10 indexed words whose vectors are most cosine-similar to ``a``.

    Args:
        a: a 1-D tensor with the same length as the rows of ``embedding``.
    """
    score = {}
    # Iterate over word2idx rather than vocabulary: vocabulary also contains
    # words below min_count, which have no index and would raise KeyError.
    for word, idx in word2idx.items():
        score[word] = cosine(a, embedding[idx]).item()

    print(nlargest(10, score, key=score.get))

In [0]:
# Nearest neighbours of "doctor" in the learned embedding space.
most_similar(token="doctor")

In [0]:
# Analogy query: doctor - man + woman, then list the words nearest to the result.
woman_vec = embedding[word2idx["woman"]]
man_vec = embedding[word2idx["man"]]
doctor_vec = embedding[word2idx["doctor"]]
closest(doctor_vec - man_vec + woman_vec)

In [0]:
# Cosine similarity between the vectors of "person" and "doctor".
person_vec = embedding[word2idx["person"]]
doctor_vec = embedding[word2idx["doctor"]]

cosine(person_vec, doctor_vec).item()