In [1]:
import re
import random
import torch.nn as nn
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_file(file_name):
    with open(file_name, 'r') as f:
        data = f.read()
    return data

In [3]:
raw_data = read_file('shakespeare.txt')

In [4]:
def remove_non_alpha_characters(data):
    data = data.lower()
    # use regex to remove all non-alphanumeric characters
    data = re.sub(r'[^a-zA-Z\s]', '', data)
    # use regex to remove all whitespace characters
    data = re.sub(r'\s+', ' ', data)
    return data

def return_unique(data):
    unique = set(data)
    return list(unique)

def remove_stopwords(data):
    stopwords = ['a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out', 'over', 'to', 'into', 'with', ""]
    data = [word for word in data if word not in stopwords]
    return data

In [5]:
data = remove_non_alpha_characters(raw_data)
data = data.split(" ")
data = remove_stopwords(data)
unique_words = return_unique(data)

In [6]:
unique_dict = {word: i for i, word in enumerate(unique_words)}

# def one_hot_encode(words):
# words = ["boy", "car", "man"]

def one_hot_encode(words):
    length = len(words.keys())
    encoded_words = {}
    for key, value in words.items():
        one_hot = np.zeros(length)
        one_hot[value] = 1
        tensor = torch.from_numpy(one_hot).to(torch.int64)
        encoded_words[key] = tensor

    return encoded_words

encoded_data = one_hot_encode(unique_dict)

In [7]:
def return_list_without_a_value(data, value):
    return [x for x in data if x != value]



window_size = 5
dataset = []
sample_data = data

for i, val in enumerate(sample_data):
    if i > len(sample_data) - window_size:
        break
    sub = sample_data[i:i+window_size]
    included = return_list_without_a_value(sub, val)
    for target in included:
        dataset.append((unique_dict[val],unique_dict[target]))
    


    

In [8]:
class InfiniteDataLoader(torch.utils.data.DataLoader):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize an iterator over the dataset.
        self.dataset_iterator = super().__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            batch = next(self.dataset_iterator)
        except StopIteration:
            # Dataset exhausted, use a new fresh iterator.
            self.dataset_iterator = super().__iter__()
            batch = next(self.dataset_iterator)
        return batch

In [9]:
batch_size = 25
n_iters = 3000
num_epochs = 100
num_epochs = int(num_epochs)
# create a train_loader that will randomly generate examples forever

train_loader = torch.utils.data.DataLoader(dataset=dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

In [10]:
for i in range(1):
    for j, (x, y) in enumerate(train_loader):
        print(x)
        print(y)
        break

tensor([15270, 27243,  4041, 15531, 27917, 11384, 27285,  7566, 11190,  4266,
         6693, 21816,  9224, 28829,  2224,  3651,  7427, 21306, 22303, 15573,
        22528,    58,  7167, 19441,  6330])
tensor([11926, 11092, 13159, 15970,  7017, 26971, 16029, 29338, 21816, 11464,
         9978,  8610, 27867, 15603, 27191,  7711, 27891, 29792,  5489, 18879,
        14842,  2443, 13159, 10388, 27867])


In [38]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.init_emb()

    def init_emb(self):
        initrange = 0.5 / self.embedding_dim
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 1)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        score = torch.mul(emb_u, emb_v)
        print(score)
        score = torch.sum(score, dim=1)
        score = torch.sigmoid(score)
        print(score)
        neg_emb_v = self.v_embeddings(neg_v)
        neg_score = torch.bmm(neg_emb_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.sigmoid(neg_score)
        return score, neg_score

    def get_dict_embeddings(self):
        return self.u_embeddings.weight.data.cpu().numpy()
    
    def get_embedding_from_word(self, word):
        index = unique_dict[word]
        return self.u_embeddings.weight.data[index]
    
    def get_embedding_from_index(self, index):
        return self.u_embeddings.weight.data[index]

    def save_embedding(self, id2word, file_name):
        embedding = self.input_embeddings()
        fout = open(file_name, 'w')
        fout.write('{} {}\n'.format(len(id2word), self.embedding_dim))
        for wid, w in id2word.items():
            e = ' '.join(map(lambda x: str(x), embedding[wid]))
            fout.write('{} {}\n'.format(w, e))
        fout.close()
    
    def import_embeddings(self, file_name):
        fin = open(file_name, 'r')
        n, d = map(int, fin.readline().split())
        embedding = np.zeros((n, d))
        word2id = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            word2id[tokens[0]] = len(word2id)
            embedding[word2id[tokens[0]]] = list(map(float, tokens[1:]))
        return embedding, word2id





In [39]:
embedding_dim = 100
window_size = 5

dictionary_length = len(unique_words)

model = SkipGramModel(dictionary_length, embedding_dim)



criterion = nn.BCELoss()
learning_rate = 0.05
optimizer = torch.optim.SparseAdam(model.parameters(), lr=learning_rate)

In [40]:


# for epoch in range(num_epochs):
loss_sum = 0
#TODO: add random negative sampling.

negative_sample_length = 5

pos_u_data = torch.ones(batch_size)
neg_v_data = torch.zeros(batch_size*negative_sample_length)
concat_data = torch.cat([pos_u_data, neg_v_data], dim=0)
print(concat_data)

epochs = 15
for epoch in range(epochs):
    for i, (x, y) in enumerate(train_loader):
        pos_u = x
        pos_v = y
        neg_v = torch.randint(0, dictionary_length, (batch_size, negative_sample_length))
        optimizer.zero_grad()
        pos_score, neg_score = model(pos_u, pos_v, neg_v)
        score = torch.cat([pos_score, neg_score.flatten()], dim=0)
        combined_len = len(pos_score) + len(neg_score)
        # add a column of ones to pos_u_data
        pos_u_data = torch.ones(len(pos_u), 1)
        neg_v_data = torch.zeros(len(neg_score.flatten()), 1)
        loss = criterion(score, concat_data) 
        
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        if i % 1000 == 0:
            print(' Step [{}/{}], Loss: {:.4f}' 
                    .format(i+1, len(dataset)//batch_size, loss_sum/1000))
            loss_sum = 0

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.])
tensor([[-2.9831e-04, -1.5981e-03,  3.0355e-03,  ..., -5.5823e-04,
         -4.0636e-04, -3.2042e-03],
        [-1.7127e-03,  4.1686e-04, -4.2139e-04,  ...,  1.3538e-03,
          6.8071e-04,  3.8778e-04],
        [-1.4514e-03, -4.1527e-04,  6.2697e-04,  ..., -3.8426e-04,
          1.1767e-03, -2.5559e-03],
        ...,
     

RuntimeError: Tensors must have same number of dimensions: got 2 and 1

In [35]:
score

tensor([5.2269e-01, 4.4462e-07, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 2.4634e-12, 4.0539e-08, 9.9505e-01, 3.5667e-12, 1.0000e+00,
        1.0000e+00, 8.7406e-01, 1.0000e+00, 1.0000e+00, 1.0000e+00, 4.6503e-02,
        4.8850e-10, 4.9791e-01, 4.9963e-01, 5.0157e-01, 4.9422e-01, 4.9940e-01,
        1.5762e-18, 5.2390e-13, 1.3083e-18, 1.7323e-16, 1.4331e-10, 1.0000e+00,
        1.4966e-28, 1.8605e-13, 6.7696e-16, 1.1704e-28, 1.3062e-18, 3.9723e-14,
        4.0595e-29, 5.4408e-04, 3.7371e-23, 3.4942e-32, 1.7813e-34, 2.0635e-22,
        7.3659e-35, 2.0975e-20, 3.8495e-06, 4.4294e-03, 1.7150e-07, 2.1126e-05,
        2.4992e-03, 6.6902e-03, 1.6072e-08, 1.6487e-28, 4.8024e-22, 2.7475e-22,
        6.7192e-10, 1.0290e-04, 9.9847e-01, 8.8213e-10, 3.3383e-09, 1.7608e-10,
        6.7441e-08, 2.9392e-21, 1.0000e+00, 8.5875e-07, 1.8135e-02, 3.3512e-03,
        2.6741e-04, 1.5849e-05, 2.5614e-

In [140]:
# get cosine similarity between two vectors
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [37]:
torch.sigmoid(torch.tensor([0,1]))

tensor([0.5000, 0.7311])

In [145]:
king_vec = model.get_embedding_from_word("king")
queen_vec = model.get_embedding_from_word("queen")

def get_emb(word):
    return model.get_embedding_from_word(word)

print(cos_sim(get_emb("man"), get_emb("queen")))

0.37350982


In [217]:
# save embeddings
path = "embeddings"
model.save_embedding(id_to_word, "outfile")

# To do
Clean up training code by adding epochs

Set up testing

run experiments

https://towardsdatascience.com/creating-word-embeddings-coding-the-word2vec-algorithm-in-python-using-deep-learning-b337d0ba17a8

https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_logistic_regression/#step-3-building-model