### Assignment 6 of Deep Learning CS6073
    By Akhil Kanna Devarashetti

#### Question:

    Write a Pytorch version of the Word2vec/skip-gram displayed in Chapter 14 of d2l.  
    In particular, make DL24.py error free 
    Implement get_similar_tokens as an application of the word embedding model 
    (14.4.3 of d2l and also the last slide in lecture)

In [1]:
# DL24.py CS5173/6073 2020 cheng
# making centers, contexts, and negatives for PennTreebank data
# building vocabulary, performing subsampling and negative sampling
# Skip-gram word embedding as a translation from MXNet to Pytorch of d2l chapter 14
# Usage: python DL24.py

import zipfile
import collections
import random
import math
import torch
import numpy as np

In [3]:
f = zipfile.ZipFile('data/ptb.zip', 'r')
raw_text = f.read('ptb/ptb.train.txt').decode("utf-8")
sentences = [line.split() for line in raw_text.split('\n')]
tokens = [tk for line in sentences for tk in line]
counter = collections.Counter(tokens)
uniq_tokens = [token for token, freq in list(counter.items()) if counter[token] >= 10]
idx_to_token, token_to_idx = [], dict()
for token in uniq_tokens:
    idx_to_token.append(token)
    token_to_idx[token] = len(idx_to_token) - 1


In [4]:
# Replacing the tokens that are < 10 in frequency with token_to_idx[0] token.
s = [[idx_to_token[token_to_idx.get(tk, 0)] for tk in line] for line in sentences]
tokens = [tk for line in s for tk in line]
counter = collections.Counter(tokens)
num_tokens = sum(counter.values())

In [5]:
# Sample the tokens which are rare.
order_of_magnitude = round(math.log10(num_tokens))
inverse_frequency = num_tokens / (10 ** order_of_magnitude)

subsampled = [[tk for tk in line if random.uniform(0, 1) < math.sqrt(inverse_frequency / counter[tk] * num_tokens)] for line in s]

In [6]:
corpus = [[token_to_idx.get(tk) for tk in line] for line in subsampled]

# corpus ~= [[1, 3, 4, 5], [43, 21, 44, 45]] <- indices of words

In [7]:
tokens = [tk for line in corpus for tk in line]
counter = collections.Counter(tokens)
sampling_weights = [counter[i]**0.75 for i in range(len(counter))]
population = list(range(len(sampling_weights)))
candidates = random.choices(population, sampling_weights, k=(10 ** order_of_magnitude))
# candidates = sampled tokens which occur rarely.

# The error lies in this loop!

In [None]:
window_range = 2
max_window_size = 5
K = 5
j = 0
data = {}
maxLen = 0
for line in corpus:
    if len(line) < 2:
        continue
    for i in range(len(line)):  # i is the center word
        # Find a list of context 
        context, old_negs = data.get(line[i], (set(), set()))
        for w in range(-window_range, window_range + 1):
            context_word_pos = i + w
            if context_word_pos < 0 or context_word_pos >= len(line) or i == context_word_pos or line[i] == line[context_word_pos]:
                continue
            context_word_idx = line[context_word_pos]
            context.add(context_word_idx)
            
        if len(context) == 0:
            continue

In [None]:
for line in corpus:
    if len(line) < 2:
        continue
    for i in range(len(line)):  # i is the center word
        
        context, neg = data.get(line[i], (set(), set()))

        while len(neg) < len(context) * K:
            ne = candidates[j]
            j += 1
            if j >= (10 ** order_of_magnitude):
                j = 0
            if ne not in context:
                neg.add(ne)
        

In [11]:
max_len = max(len(c) + len(n) for _, c, n in data)
centers, contexts_negatives, labels = [], [], []
for center, context, negative in data:
    cur_len = len(context) + len(negative)
    centers += [center]
    contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
    labels += [[1] * len(context) + [0] * (max_len - len(context))]

In [30]:
class PTBdataset(torch.utils.data.Dataset):
    def __init__(self):
        super(PTBdataset).__init__()
        self.centers = np.array(centers).reshape(-1, 1)
        self.contexts_negatives = np.array(contexts_negatives)
        self.labels = np.array(labels)

    def __len__(self):
        return len(self.centers)

    def __getitem__(self, idx):
        return self.centers[idx], self.contexts_negatives[idx], self.labels[idx]

In [31]:
pdata = PTBdataset()
data_iter = torch.utils.data.DataLoader(pdata, batch_size=512, shuffle=True)

vocab_size = len(idx_to_token)
embed_size = 100

In [32]:
import torch.nn as nn
import torch.optim as optim

net = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Embedding(vocab_size, embed_size))
loss = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), 0.01)
m = nn.Sigmoid()

In [43]:
for epoch in range(2):
    print(f"\nEpoch: {epoch}")
    for i, batch in enumerate(data_iter):
        center, context_negative, label = batch
        v = net[0](center.to(torch.int64))
        u = net[1](context_negative.to(torch.int64))
        pred = torch.tensordot(v, torch.transpose(u, 1, 2))
        l = loss(m(pred), label.to(torch.float32))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if (i + 1) % 50 == 0:
            print(epoch, i, float(l))



Epoch: 0


KeyboardInterrupt: 

### Implementation of get_similar_tokens using d2l.ai and PyTorch's CosineSimilarity

In [None]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]  # Access with index == matmul

    # Compute the cosine similarity. Add 1e-9 for numerical stability
    cos_similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
    cos = cos_similarity(W, x)

    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # Remove the input words
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net[0])

In [37]:
words_for_similarity = ['chip', 'hardware', 'semiconductor']

for word in words_for_similarity:
    print(f"\nSimilarity for '{word}':")
    get_similar_tokens(word, 3, net[0])


Similarity for 'chip':
cosine sim=0.261: to
cosine sim=0.201: among
cosine sim=0.180: nov.

Similarity for 'hardware':
cosine sim=0.250: gold
cosine sim=0.243: to
cosine sim=0.188: cancer

Similarity for 'semiconductor':
cosine sim=0.287: although
cosine sim=0.195: ago
cosine sim=0.184: journal


The results for similarity aren't as great as shown in the textbook.
This might be because of limited training or/and lack of richer dataset.
