### Assignment 6 of Deep Learning CS6073
    By Akhil Kanna Devarashetti

#### Question:

    Write a Pytorch version of the Word2vec/skip-gram displayed in Chapter 14 of d2l.  
    In particular, make DL24.py error free 
    Implement get_similar_tokens as an application of the word embedding model 
    (14.4.3 of d2l and also the last slide in lecture)

In [1]:
# DL24.py CS5173/6073 2020 cheng
# making centers, contexts, and negatives for PennTreebank data
# building vocabulary, performing subsampling and negative sampling
# Skip-gram word embedding as a translation from MXNet to Pytorch of d2l chapter 14
# Usage: python DL24.py

import zipfile
import collections
import random
import math
import torch
import numpy as np

f = zipfile.ZipFile('data/ptb.zip', 'r')
raw_text = f.read('ptb/ptb.train.txt').decode("utf-8")
sentences = [line.split() for line in raw_text.split('\n')]
tokens = [tk for line in sentences for tk in line]
counter = collections.Counter(tokens)
uniq_tokens = [token for token, freq in list(counter.items()) if counter[token] >= 10]
idx_to_token, token_to_idx = [], dict()
for token in uniq_tokens:
    idx_to_token.append(token)
    token_to_idx[token] = len(idx_to_token) - 1
s = [[idx_to_token[token_to_idx.get(tk, 0)] for tk in line] for line in sentences]
tokens = [tk for line in s for tk in line]
counter = collections.Counter(tokens)
num_tokens = sum(counter.values())
subsampled = [[tk for tk in line if random.uniform(0, 1) < math.sqrt(1e-4 / counter[tk] * num_tokens)] for line in s]
corpus = [[token_to_idx.get(tk) for tk in line] for line in subsampled]
tokens = [tk for line in corpus for tk in line]
counter = collections.Counter(tokens)
sampling_weights = [counter[i]**0.75 for i in range(len(counter))]
population = list(range(len(sampling_weights)))
candidates = random.choices(population, sampling_weights, k=10000)
max_window_size = 5
K = 5
j = 0
data = []
maxLen = 0
for line in corpus:
    if len(line) < 2:
        continue
    for i in range(len(line)):
        window_size = random.randint(1, max_window_size)
        indices = list(range(max(0, i - window_size), min(len(line), i + 1 + window_size)))
        indices.remove(i)
        for idx in indices:
            context = [line[idx] for idx in indices]
        neg = []
        while len(neg) < len(context) * K:
            ne = candidates[j]
            j += 1
            if j >= 10000:
                j = 0
            if ne not in context:
                neg.append(ne)
        data.append([line[i], context, neg])

max_len = max(len(c) + len(n) for _, c, n in data)
centers, contexts_negatives, labels = [], [], []
for center, context, negative in data:
    cur_len = len(context) + len(negative)
    centers += [center]
    contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
    labels += [[1] * len(context) + [0] * (max_len - len(context))]

class PTBdataset(torch.utils.data.Dataset):
    def __init__(self):
        super(PTBdataset).__init__()
        self.centers = np.array(centers).reshape(-1, 1)
        self.contexts_negatives = np.array(contexts_negatives)
        self.labels = np.array(labels)

    def __len__(self):
        return len(self.centers)

    def __getitem__(self, idx):
        return self.centers[idx], self.contexts_negatives[idx], self.labels[idx]

pdata = PTBdataset()
data_iter = torch.utils.data.DataLoader(pdata, batch_size=512, shuffle=True)

vocab_size = len(idx_to_token)
embed_size = 100

import torch.nn as nn
import torch.optim as optim

net = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Embedding(vocab_size, embed_size))
loss = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), 0.01)
m = nn.Sigmoid()

for epoch in range(5):
    for i, batch in enumerate(data_iter):
        center, context_negative, label = batch
        v = net[0](center.to(torch.int64))
        u = net[1](context_negative.to(torch.int64))
        pred = torch.tensordot(v, torch.transpose(u, 1, 2))
        l = loss(m(pred), label.to(torch.float32))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if (i + 1) % 50 == 0:
            print(epoch, i, float(l))


0 49 14.030824661254883
0 99 14.664448738098145
0 149 13.80379867553711
0 199 11.802692413330078
0 249 14.349369049072266
0 299 13.037077903747559
0 349 13.409360885620117
0 399 14.96679401397705
0 449 13.498037338256836
0 499 12.379167556762695
0 549 13.271138191223145
0 599 13.458529472351074
0 649 13.332769393920898
1 49 13.048604011535645
1 99 13.343057632446289
1 149 13.318685531616211
1 199 13.19548511505127
1 249 13.479415893554688
1 299 13.454062461853027
1 349 13.143341064453125
1 399 13.37532901763916
1 449 12.925860404968262
1 499 13.412955284118652
1 549 11.702332496643066
1 599 13.012636184692383
1 649 12.360490798950195
2 49 14.150681495666504
2 99 11.728529930114746
2 149 12.387529373168945
2 199 12.636744499206543
2 249 14.112762451171875
2 299 11.70865535736084
2 349 12.183820724487305
2 399 12.128985404968262
2 449 13.24563217163086
2 499 13.121143341064453
2 549 11.918919563293457
2 599 12.812511444091797
2 649 12.174520492553711
3 49 13.352450370788574
3 99 12.58363

### Implementation of get_similar_tokens using d2l.ai and PyTorch's CosineSimilarity

In [2]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    x = x.reshape([1] + list(x.shape))

    # Compute the cosine similarity. Add 1e-9 for numerical stability
    cos_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    cos = cos_similarity(W, x)

    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # Remove the input words
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net[0])

cosine sim=0.344: anniversary
cosine sim=0.343: public
cosine sim=0.329: scores


In [3]:
words_for_similarity = ['five-cent', 'hardware', 'semiconductor']

for word in words_for_similarity:
    print(f"\nSimilarity for '{word}':")
    get_similar_tokens(word, 3, net[0])


Similarity for 'five-cent':
cosine sim=0.376: hard
cosine sim=0.335: trump
cosine sim=0.325: antar

Similarity for 'hardware':
cosine sim=0.341: mix
cosine sim=0.338: choice
cosine sim=0.317: fallen

Similarity for 'semiconductor':
cosine sim=0.438: senior
cosine sim=0.369: television
cosine sim=0.356: everyone


The results for similarity aren't as great as shown in the textbook.
This might be because of limited training or/and lack of richer dataset.
