In [37]:
import re
import random
import torch.nn as nn
import torch
import numpy as np
from fastcore import *
from nbdev.showdoc import *
from tqdm import tqdm

# Goals of this project
The aim of this project was to act as a simple exploratory project to practice building a fairly fundamental tool in NLP.

I learned the concepts behind the word2vec model, and while it was fairly understandable I wanted to see how it would translate to code.

I also got to practice working more with the pytorch library as a result, which was a big win.

The biggest challenge for me in building this was getting the vector dimensions right for matrix multiplication. Learning to respect that process and approach it slowly was valuable.



In [38]:
def read_file(file_name):
    with open(file_name, 'r') as f:
        data = f.read()
    return data

raw_data = read_file('shakespeare.txt')


In [39]:





def get_scalar_loss(pos_score, neg_score, criterion, concatenated_data):
    """function to get the scalar loss. Unused because the results are generally bad from my current experiments."""
    score = torch.cat([pos_score, neg_score.flatten()], dim=0)
    combined_len = len(pos_score) + len(neg_score)
    pos_u_data = torch.ones(len(pos_score), 1)
    neg_v_data = torch.zeros(len(neg_score.flatten()), 1)
    loss = criterion(score, concatenated_data)
    return loss
    loss = get_scalar_loss( pos_score, neg_score, criterion, concat_data)


In [40]:
show_doc(non_scalar_loss)

---

### non_scalar_loss

>      non_scalar_loss (score, neg_score, lr, weight_decay, model)

In [5]:
def remove_non_alpha_characters(data):
    """Remove whitespaces and non alpha characters from a string"""
    data = data.lower()
    data = re.sub(r'[^a-zA-Z\s]', '', data)
    data = re.sub(r'\s+', ' ', data)
    return data

def remove_stopwords(data):
    stopwords = ['a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out', 'over', 'to', 'into', 'with', ""]
    data = [word for word in data if word not in stopwords]
    return data

data = remove_non_alpha_characters(raw_data)
data = data.split(" ")
data = remove_stopwords(data)
unique_words = list(set(data))

In [6]:
unique_dict = {word: i for i, word in enumerate(unique_words)}


### Create a dictionary of words and the positions they appear in the data
This is to speed up the time it takes to build our database

In [9]:
word_pos = {}
for i, word in enumerate(data):
    if word in word_pos:
        word_pos[word].append(i)
    else:
        word_pos[word] = [i]


In [57]:

def return_list_without_a_value(list, value):
    return [x for x in list if x != value]

window_size = 5
dataset = []
sample_data = data

for i, val in enumerate(sample_data):
    if i > len(sample_data) - window_size:
        break
    sub = sample_data[i:i+window_size]
    included = return_list_without_a_value(sub, val)
    for target in included:
        dataset.append((unique_dict[val],unique_dict[target]))    

In [192]:
def get_context_words(data, word, window_size):
    context_words = []
    for i in word_pos[word]:
        
        context_words.extend(data[i-window_size: i])
        context_words.extend(data[i+1:i+window_size])


    return context_words


window_size = 5
dataset = np.array([0,0]).reshape(-1,2)
sample_data = data

count = 0

for i, val in tqdm(enumerate(unique_words)):
    # if i > len(sample_data) - window_size:
    #     break
    context_words = get_context_words(sample_data, val, window_size)
    
    # using numpy broadcasting we create a numpy array with the unique_dict[val] as the first column and the context words as the second column
    new_data = np.array([unique_dict[val]] * len(context_words)).reshape(-1,1)
    new_data = np.append(new_data, np.array([unique_dict[word] for word in context_words]).reshape(-1,1), axis=1)
    
    dataset = np.append(dataset, new_data, axis=0)
 

30997it [03:17, 156.96it/s] 


In [194]:
batch_size = 15
n_iters = 3000
num_epochs = 100
num_epochs = int(num_epochs)
# create a train_loader that will randomly generate examples forever

train_loader = torch.utils.data.DataLoader(dataset=dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

In [196]:
count = 0
for i, val in enumerate(train_loader):
    print(val)
    count +=1 
    if count > 5:
        break

tensor([[27613, 22237],
        [21709, 10357],
        [21444, 20861],
        [ 9227,  7180],
        [ 7098, 30318],
        [17800, 21128],
        [24258, 24290],
        [29306, 10357],
        [25981,  5158],
        [  879, 14039],
        [29609, 10140],
        [22891,  7996],
        [19007, 10140],
        [20335, 23987],
        [ 3375, 14039]])
tensor([[ 9964, 28049],
        [14573,  2070],
        [12195, 17800],
        [15011, 14303],
        [17266, 17885],
        [ 3484, 13198],
        [  704,  9789],
        [ 3919,   998],
        [ 4521,  6803],
        [29280, 17800],
        [20334,  9046],
        [30318, 21221],
        [24597, 28378],
        [19007, 12011],
        [30278, 28049]])
tensor([[ 7036,  6102],
        [ 7580, 17204],
        [ 2709, 17290],
        [ 7996, 14956],
        [ 3135, 30178],
        [24081,  2004],
        [20335, 11720],
        [ 3919, 10710],
        [23698,   721],
        [ 3919, 10140],
        [14247, 28049],
        [ 6299

In [197]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.init_emb()

    def init_emb(self):
        init_mean = 0
        init_std = 0.01
        self.u_embeddings.weight.data.normal_(init_mean, init_std)
        self.v_embeddings.weight.data.normal_(init_mean, init_std)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.v_embeddings(pos_u).view(-1,1, self.embedding_dim).squeeze()
        emb_v = self.v_embeddings(pos_v).view(-1, self.embedding_dim).squeeze()
        score = torch.bmm(emb_u.unsqueeze(1), emb_v.unsqueeze(2)).squeeze()
        score = torch.sigmoid(score)
        neg_emb_v = self.v_embeddings(neg_v).view(-1, self.embedding_dim, neg_v.shape[1])
        neg_score = torch.bmm(emb_u.unsqueeze(1), neg_emb_v).squeeze()
        neg_score = torch.sigmoid(neg_score)
        return score, neg_score
    
    def forward_without_negatives(self, word1, word2):
        pos_u = torch.tensor([unique_dict[word1]])
        pos_v = torch.tensor([unique_dict[word2]])
        emb_u = self.u_embeddings(pos_u).view(-1, 1, self.embedding_dim).squeeze()
        emb_v = self.v_embeddings(pos_v).view(-1, self.embedding_dim).squeeze()
        score = torch.dot(emb_u, emb_v)
        score = torch.sigmoid(score)
        return score

    def get_dict_embeddings(self):
        return self.u_embeddings.weight.data.cpu().numpy()
    
    def get_embedding_from_word(self, word):
        index = unique_dict[word]
        return self.u_embeddings.weight.data[index]
    
    def get_embedding_from_index(self, index):
        return self.u_embeddings.weight.data[index]

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings
        fout = open(file_name, 'w')
        fout.write('{} {}\n'.format(len(id2word), self.embedding_dim))
        for wid, w in id2word.items():
            e = ' '.join(map(lambda x: str(x), self.get_embedding_from_index(wid)))
            fout.write('{} {}\n'.format(w, e))
        fout.close()
    
    def import_embeddings(self, file_name):
        fin = open(file_name, 'r')
        n, d = map(int, fin.readline().split())
        embedding = np.zeros((n, d))
        word2id = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            word2id[tokens[0]] = len(word2id)
            embedding[word2id[tokens[0]]] = list(map(float, tokens[1:]))
        return embedding, word2id
    
    def non_scalar_loss(self, score, neg_score, lr, weight_decay):
        pos_loss = -torch.mean(torch.log(score))
        neg_loss = -torch.mean(torch.sum(torch.log(1 - neg_score), dim=1))
        loss = pos_loss + neg_loss
        # add L2 regularization term
        l2_loss = 0
        for param in self.parameters():
            l2_loss += torch.sum(param**2)
            loss += weight_decay * l2_loss
        return loss


  
embedding_dim = 100
window_size = 5

dictionary_length = len(unique_words)

model = SkipGramModel(dictionary_length, embedding_dim)



In [206]:

criterion = nn.BCELoss()
learning_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)
# optimizer = torch.optim.SparseAdam(model.parameters(), lr=learning_rate)

loss_sum = 0

negative_sample_length = 2

pos_u_data = torch.ones(batch_size)
neg_v_data = torch.zeros(batch_size*negative_sample_length)
concat_data = torch.cat([pos_u_data, neg_v_data], dim=0)
step_interval = 200
weight_decay = 0.0001
epochs = 15
for epoch in range(epochs):
    for i, val in enumerate(train_loader):

        pos_u = torch.tensor(val[:,0])
        pos_v = torch.tensor(val[:,1])
        neg_v = torch.randint(0, dictionary_length, (batch_size, negative_sample_length))
        optimizer.zero_grad()
        # pass in batch of pos u and pos v
        # print(neg_v.shape)
        # print(val.shape)

        pos_score, neg_score = model(pos_u, pos_v, neg_v)
        
        loss =model.non_scalar_loss(pos_score, neg_score, learning_rate, weight_decay)
        
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        if i % step_interval == 0 and i>1:
            print(' Step [{}/{}], Loss: {:.4f}' 
                    .format(i+1, len(dataset)//batch_size, loss_sum/step_interval))
            loss_sum = 0
        if i > len(train_loader) - batch_size:
            break
        

  pos_u = torch.tensor(val[:,0])
  pos_v = torch.tensor(val[:,1])


 Step [201/479987], Loss: 1.9303
 Step [401/479987], Loss: 1.8715
 Step [601/479987], Loss: 1.8735
 Step [801/479987], Loss: 1.8821
 Step [1001/479987], Loss: 1.8817
 Step [1201/479987], Loss: 1.8808
 Step [1401/479987], Loss: 1.8786
 Step [1601/479987], Loss: 1.8785
 Step [1801/479987], Loss: 1.8912
 Step [2001/479987], Loss: 1.8740
 Step [2201/479987], Loss: 1.8702
 Step [2401/479987], Loss: 1.8824
 Step [2601/479987], Loss: 1.8868
 Step [2801/479987], Loss: 1.8704
 Step [3001/479987], Loss: 1.8730
 Step [3201/479987], Loss: 1.8702
 Step [3401/479987], Loss: 1.8719
 Step [3601/479987], Loss: 1.8735
 Step [3801/479987], Loss: 1.8669
 Step [4001/479987], Loss: 1.8799
 Step [4201/479987], Loss: 1.8618
 Step [4401/479987], Loss: 1.8779
 Step [4601/479987], Loss: 1.8751
 Step [4801/479987], Loss: 1.8755
 Step [5001/479987], Loss: 1.8610
 Step [5201/479987], Loss: 1.8620
 Step [5401/479987], Loss: 1.8706
 Step [5601/479987], Loss: 1.8667
 Step [5801/479987], Loss: 1.8690
 Step [6001/479987

[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [207]:
def subtract_vector(vector1,vector2):
    return get_emb(vector1) - get_emb(vector2)

def add_vector(vector1,vector2):
    return get_emb(vector1) + get_emb(vector2)

def cos_sim(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

def cos_sim_word(word1, word2):
    vector1 = get_emb(word1)
    vector2 = get_emb(word2)
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

def get_emb(word):
    return model.get_embedding_from_word(word)

def invert_dictionary(dictionary):
    return {v: k for k, v in dictionary.items()}

def get_closest_vector(vector):
    max = 0
    target = None
    for key,item in unique_dict.items():
        comparative = get_emb(key)
        comparison = cos_sim(vector, comparative)
        if comparison > max:
            max = comparison
            target = key

        
    return target


In [208]:
vector = subtract_vector("king", "man")
vector = vector +get_emb("woman") 


In [209]:
print(cos_sim_word("flower", "rose"),("flower", "rose"))
print(cos_sim_word("flower", "tree"), ("flower", "tree"))
print(cos_sim_word("flower", "dog"), ("flower", "dog"))
print(cos_sim_word("flower", "cat"), ("flower", "cat"))
print(cos_sim_word("flower", "car"), ("flower", "car"))
print(cos_sim_word("cat", "dog"), ("cat", "dog"))
print(cos_sim_word("king", "queen"), ("king", "queen"))
print(cos_sim_word("king", "royalty"), ("king", "royalty"))
print(cos_sim_word("queen", "royalty"), ("queen", "royalty"))
print(cos_sim_word("man", "king"), ("man", "king"))
print(cos_sim_word("woman", "king"), ("woman", "king"))



-0.1443536 ('flower', 'rose')
-0.0018065075 ('flower', 'tree')
-0.051516347 ('flower', 'dog')
-0.12853625 ('flower', 'cat')
-0.09384106 ('flower', 'car')
0.094177715 ('cat', 'dog')
0.08185257 ('king', 'queen')
-0.081511214 ('king', 'royalty')
0.42667183 ('queen', 'royalty')
-0.045385525 ('man', 'king')
0.0038505516 ('woman', 'king')


In [204]:
reversed_unique_dict = invert_dictionary(unique_dict)
model.forward_without_negatives("king", "man")

tensor(0.4999, grad_fn=<SigmoidBackward0>)

In [205]:
index1 = unique_dict["king"]
index2 = unique_dict["man"]
vector = subtract_vector("king", "man")
vector = vector+ get_emb("woman")
print(get_closest_vector(vector))

king


In [None]:
# save embeddings
path = "embeddings"
model.save_embedding
model.save_embedding(reversed_unique_dict, "embeddings.emb")

In [None]:
print(model.u_embeddings.mean()))