### Assignment 6 of Deep Learning CS6073
    By Akhil Kanna Devarashetti

#### Question:

    Write a Pytorch version of the Word2vec/skip-gram displayed in Chapter 14 of d2l.  
    In particular, make DL24.py error free 
    Implement get_similar_tokens as an application of the word embedding model 
    (14.4.3 of d2l and also the last slide in lecture)

In [1]:
# DL24.py CS5173/6073 2020 cheng
# making centers, contexts, and negatives for PennTreebank data
# building vocabulary, performing subsampling and negative sampling
# Skip-gram word embedding as a translation from MXNet to Pytorch of d2l chapter 14
# Usage: python DL24.py

import zipfile
import collections
import random
import math
import torch
import numpy as np

In [2]:
f = zipfile.ZipFile('data/ptb.zip', 'r')
raw_text = f.read('ptb/ptb.train.txt').decode("utf-8")
sentences = [line.split() for line in raw_text.split('\n')]
tokens = [tk for line in sentences for tk in line]
counter = collections.Counter(tokens)
uniq_tokens = [token for token, freq in list(counter.items()) if counter[token] >= 10]
idx_to_token, token_to_idx = [], dict()
for token in uniq_tokens:
    idx_to_token.append(token)
    token_to_idx[token] = len(idx_to_token) - 1


In [3]:
# Replacing the tokens that are < 10 in frequency with token_to_idx[0] token.
s = [[idx_to_token[token_to_idx.get(tk, 0)] for tk in line] for line in sentences]
tokens = [tk for line in s for tk in line]
counter = collections.Counter(tokens)
num_tokens = sum(counter.values())

In [4]:
# Sample the tokens which are rare.
order_of_magnitude = round(math.log10(num_tokens))
inverse_frequency = num_tokens / (10 ** order_of_magnitude)

subsampled = [[tk for tk in line if random.uniform(0, 1) < math.sqrt(inverse_frequency / counter[tk] * num_tokens)] for line in s]

In [5]:
corpus = [[token_to_idx.get(tk) for tk in line] for line in subsampled]

# corpus ~= [[1, 3, 4, 5], [43, 21, 44, 45]] <- indices of words

In [6]:
tokens = [tk for line in corpus for tk in line]
counter = collections.Counter(tokens)
sampling_weights = [counter[i]**0.75 for i in range(len(counter))]
population = list(range(len(sampling_weights)))
candidates = random.choices(population, sampling_weights, k=(10 ** order_of_magnitude))
# candidates = sampled tokens which occur rarely.

# The error was in this loop!

In [7]:
window_range = 2
max_window_size = 5
K = 5
j = 0
data = []
maxLen = 0
for line in corpus:
    if len(line) < 2:
        continue
    for i in range(len(line)):  # i is the center word
        # Find a list of context 
        context = []
        for w in range(-window_range, window_range + 1):
            context_word_pos = i + w
            if context_word_pos < 0 or context_word_pos >= len(line) or i == context_word_pos or line[i] == line[context_word_pos]:
                continue
            context_word_idx = line[context_word_pos]
            context.append(context_word_idx)
            
        if len(context) == 0:
            continue

        neg = []
        while len(neg) < len(context) * K:
            ne = candidates[j]
            j += 1
            if j >= (10 ** order_of_magnitude):
                j = 0
            if ne not in context:
                neg.append(ne)
       
        data.append([line[i], context, neg])
        

In [8]:
max_len = max(len(c) + len(n) for _, c, n in data)
centers, contexts_negatives, labels = [], [], []
for center, context, negative in data:
    cur_len = len(context) + len(negative)
    centers += [center]
    contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
    labels += [[1] * len(context) + [0] * (max_len - len(context))]

In [9]:
class PTBdataset(torch.utils.data.Dataset):
    def __init__(self):
        super(PTBdataset).__init__()
        self.centers = np.array(centers).reshape(-1, 1)
        self.contexts_negatives = np.array(contexts_negatives)
        self.labels = np.array(labels)

    def __len__(self):
        return len(self.centers)

    def __getitem__(self, idx):
        return self.centers[idx], self.contexts_negatives[idx], self.labels[idx]

In [10]:
pdata = PTBdataset()
data_iter = torch.utils.data.DataLoader(pdata, batch_size=512, shuffle=True)

vocab_size = len(idx_to_token)
embed_size = 100

In [11]:
import torch.nn as nn
import torch.optim as optim

net = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Embedding(vocab_size, embed_size))
loss = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), 0.01)
m = nn.Sigmoid()

In [12]:
for epoch in range(2):
    print(f"\nEpoch: {epoch}")
    for i, batch in enumerate(data_iter):
        center, context_negative, label = batch
        v = net[0](center.to(torch.int64))
        u = net[1](context_negative.to(torch.int64))
        pred = torch.tensordot(v, torch.transpose(u, 1, 2))
        l = loss(m(pred), label.to(torch.float32))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if (i + 1) % 50 == 0:
            print(epoch, i, float(l))



Epoch: 0
0 49 13.814745903015137
0 99 13.916586875915527
0 149 14.155638694763184
0 199 13.475334167480469
0 249 13.154608726501465
0 299 12.832062721252441
0 349 13.730950355529785
0 399 12.777384757995605
0 449 12.270665168762207
0 499 12.674140930175781
0 549 12.84369945526123
0 599 12.818958282470703
0 649 12.392891883850098
0 699 11.700922966003418
0 749 11.615740776062012
0 799 12.238015174865723
0 849 13.104876518249512
0 899 12.056645393371582
0 949 10.534405708312988
0 999 11.256355285644531
0 1049 10.76865291595459
0 1099 11.018086433410645
0 1149 11.31623363494873
0 1199 10.913350105285645
0 1249 10.382098197937012
0 1299 11.408995628356934
0 1349 10.547050476074219
0 1399 10.9283447265625
0 1449 11.507621765136719
0 1499 11.409274101257324
0 1549 10.506775856018066
0 1599 10.820523262023926
0 1649 10.904644966125488
0 1699 10.249361991882324

Epoch: 1
1 49 10.368977546691895
1 99 10.40188980102539
1 149 10.878504753112793
1 199 10.106024742126465
1 249 10.026631355285645
1

### Implementation of get_similar_tokens using d2l.ai and PyTorch's CosineSimilarity

In [13]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]  # Access with index == matmul

    # Compute the cosine similarity. Add 1e-9 for numerical stability
    cos_similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
    cos = cos_similarity(W, x)

    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # Remove the input words
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net[0])

cosine sim=0.165: cigarette
cosine sim=0.162: british
cosine sim=0.143: old


In [14]:
words_for_similarity = ['chip', 'hardware', 'semiconductor']

for word in words_for_similarity:
    print(f"\nSimilarity for '{word}':")
    get_similar_tokens(word, 3, net[0])



Similarity for 'chip':
cosine sim=0.165: cigarette
cosine sim=0.162: british
cosine sim=0.143: old

Similarity for 'hardware':
cosine sim=0.177: unit
cosine sim=0.171: were
cosine sim=0.171: <unk>

Similarity for 'semiconductor':
cosine sim=0.340: the
cosine sim=0.312: N
cosine sim=0.163: problem
