# Word2Vec Skipgram impl. in PyTorch

["Word2Vec in Pytorch - Continuous Bag of Words and Skipgrams: Pytorch implementation"](https://srijithr.gitlab.io/post/word2vec/)  
Srijith Rajamohan (2018-09-09)

which was adapted from <https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#an-example-n-gram-language-modeling>

In [24]:
import csv
import os
from os.path import dirname


max context 43
min context 1


In [None]:
MAX_ROW = 250000 # 231638

max_context = 0
min_context = 1000

recipes = []
vocab = set()

with open(dirname(os.getcwd()) + "/dat/all_ind.csv", "r") as f:
    reader = csv.reader(f)
    next(reader, None)

    for i, row in enumerate(reader):
        id = row[0]
        ind_set = set(eval(row[1]))

        recipes.append([id, ind_set])
        vocab.update(ind_set)
        
        max_context = max(max_context, len(ind_set))
        min_context = min(min_context, len(ind_set))

        if i > MAX_ROW:
            break

print("max context", max_context)
print("min context", min_context)

In [25]:
word_to_ix = {word: i for i, word in enumerate(vocab)}

print("vocab size", len(list(vocab)))
print(word_to_ix)

vocab size 14942
{'fennel pollen': 0, 'kombu': 1, 'kraft caramels': 2, 'candy covered plain chocolate candies': 3, 'pina colada concentrate': 4, 'shredded coconut': 5, 'ciabatta rolls': 6, 'crusty bread rolls': 7, 'vanilla gelato': 8, 'dried lemon peel': 9, 'pandan extract': 10, 'roasted garlic triscuits': 11, 'ground turmeric': 12, 'sweet cream': 13, 'bourbon vanilla': 14, 'drumstick leaves': 15, 'cornbread': 16, 'moose rump roast': 17, 'organic turkey': 18, 'boniato': 19, 'nonfat italian dressing': 20, 'marinated artichokes': 21, 'macadamia syrup': 22, 'lamb steaks': 23, 'dark mexican beer': 24, 'sourdough hamburger buns': 25, 'frozen pizza crust': 26, 'cheshire cheese': 27, 'cooked sweet potato': 28, 'recipe pizza dough': 29, 'anejo cheese': 30, 'roasted sweet red peppers': 31, 'four cheese pasta sauce': 32, 'twix candy bar': 33, 'lettuce head': 34, 'prepared sugar-free pudding': 35, 'dried pinto beans': 36, 'chicken drumstick': 37, 'ham shank': 38, 'pinhead oats': 39, 'diet orange 

In [26]:
recipes

[['137739',
  {'butter',
   'honey',
   'mexican seasoning',
   'mixed spice',
   'olive oil',
   'salt',
   'winter squash'}],
 ['31490',
  {'cheese',
   'eggs',
   'milk',
   'prepared pizza crust',
   'salt and pepper',
   'sausage patty'}],
 ['112140',
  {'cheddar cheese',
   'chili powder',
   'diced tomatoes',
   'ground beef',
   'ground cumin',
   'kidney beans',
   'lettuce',
   'rotel tomatoes',
   'salt',
   'tomato paste',
   'tomato soup',
   'water',
   'yellow onions'}],
 ['59389',
  {'new potatoes',
   'olive oil',
   'parsley',
   'pepper',
   'red bell pepper',
   'red wine vinegar',
   'salt',
   'shallots',
   'spreadable cheese with garlic and herbs',
   'tarragon',
   'yellow bell pepper'}],
 ['44061',
  {'apple cider vinegar',
   'cinnamon oil',
   'clove oil',
   'dry mustard',
   'pepper',
   'salt',
   'sugar',
   'tomato juice'}],
 ['5289',
  {'apple', 'frozen apple juice concentrate', 'milk', 'vanilla ice cream'}],
 ['25274',
  {'extra virgin olive oil',
   

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [None]:
class SkipgramModeler (nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(SkipgramModeler, self).__init__()
        self.context_size = context_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, context_size * vocab_size)

        
    def forward (self, inputs):
        # -1 implies size inferred for that index from the size of the data
        embeds = self.embeddings(inputs).view((1, -1))  
        #print(np.mean(np.mean(self.linear2.weight.data.numpy())))
        
        out1 = F.relu(self.linear1(embeds)) # output of first layer
        out2 = self.linear2(out1)           # output of second layer
        #print(embeds)
        
        log_probs = F.log_softmax(out2, dim=1).view(CONTEXT_SIZE, -1)
        return log_probs

    
    def predict (self, input):
        context_idxs = torch.tensor([word_to_ix[input]], dtype=torch.long)
        res = self.forward(context_idxs)
        res_arg = torch.argmax(res)
        res_val, res_ind = res.sort(descending=True)
        indices = [res_ind[i][0] for i in np.arange(0, self.context_size)]
        
        for arg in indices:
            print( [ (key, val) for key,val in word_to_ix.items() if val == arg ])


    def freeze_layer (self, layer):
        for name, child in model.named_children():
            print(name, child)
            
            if name == layer:
                for names, params in child.named_parameters():
                    print(names, params)
                    print(params.size())
                    params.requires_grad = False

                    
    def print_layer_parameters (self):
        for name, child in model.named_children():
                print(name, child)
                
                for names, params in child.named_parameters():
                    print(names, params)
                    print(params.size())

                    
    def write_embedding_to_file (self, filename):
        for i in self.embeddings.parameters():
            weights = i.data.numpy()


np.save(filename, weights)

In [None]:
from itertools import islice

def window (seq, n=2):
    """Returns a sliding window (of width n) over data from the iterable
       s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   """
    it = iter(seq)
    result = tuple(islice(it, n))
    
    if len(result) == n:
        yield result

    for elem in it:
        result = result[1:] + (elem,)
        yield result

list(window(["fee", "fi", "foe", "fum", "foo"], n=3))

In [None]:
CONTEXT_SIZE = 3
EMBEDDING_DIM = 10

ngrams = []

for id, ind_set in recipes:
    for ind in ind_set:
        full_context = ind_set.difference({ ind })
        
        if len(full_context) < CONTEXT_SIZE:
            full_context.add("")
        
        for skip_gram in window(full_context, n=CONTEXT_SIZE):
            ngrams.append([ ind, skip_gram ])

ngrams

In [None]:
MAX_EPOCH = 10

torch.manual_seed(1)
loss_function = nn.NLLLoss()

model = SkipgramModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(MAX_EPOCH):
    total_loss = 0

    for context, target in ngrams:
        # prep inputs: convert words into integer indices, wrap them in tensors
        context_idxs = torch.tensor([word_to_ix[context]], dtype=torch.long)

        # since PyTorch *accumulates* gradients, before passing in a new instance, we must to zero out the gradients from the old instance
        model.zero_grad()

        # run forward pass, obtaining log probabilities over the next words
        log_probs = model(context_idxs)

        # compute loss function (Torch needs the target word wrapped in a tensor)
        target_list = torch.tensor([word_to_ix[w] for w in target], dtype=torch.long)
        loss = loss_function(log_probs, target_list)

        # run backward pass, update the gradient
        loss.backward()
        optimizer.step()

        # coerce 1-element tensor into a numeric value
        total_loss += loss.item()
        
    print(epoch, total_loss)

In [None]:
model.write_embedding_to_file("tmp.npy")

In [None]:
# print the model layer parameters
model.print_layer_parameters()

In [None]:
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

def get_key (word_id):
    for key, val in word_to_ix.items():
        if val == word_id:
            print(key)

def cluster_embeddings (filename, k=3):
    X = np.load(filename)
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)

    center = kmeans.cluster_centers_
    distances = euclidean_distances(X, center)

    for i in np.arange(0, distances.shape[1]):
        print(distances)
        word_id = np.argmin(distances[:, i])
        print(word_id)
        get_key(word_id)

In [None]:
cluster_embeddings("tmp.npy", k=10)

In [None]:
# predict the next word given n context words
model.predict("butter")

In [None]:
input = "butter"
context_idxs = torch.tensor([word_to_ix[input]], dtype=torch.long)
print(context_idxs)

res = model.forward(context_idxs)
res_arg = torch.argmax(res)
res_val, res_ind = res.sort(descending=True)

indices = [res_ind[i][0] for i in np.arange(0, model.context_size)]
print(indices)
        
for arg in indices:
    print( [ (key, val) for key, val in word_to_ix.items() if val == arg ])