In [0]:
from torch.autograd import Variable
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.optim as optim

In [0]:
context_size = 3
embed_size = 2
xmax = 2
alpha = 0.75
batch_size = 20
l_rate = 0.001
num_epochs = 2

In [0]:
corpus = [
    "In literary theory, a text is any object that can be read, whether this object is a work of literature, a street sign, an arrangement of buildings on a city block, or styles of clothing. It is a coherent set of signs that transmits some kind of informative message.[1] This set of signs is considered in terms of the informative message's content, rather than in terms of its physical form or the medium in which it is represented."
    "Within the field of literary criticism, text also refers to the original information content of a particular piece of writing; that is, the text of a work is that primal symbolic arrangement of letters as originally composed, apart from later alterations, deterioration, commentary, translations, paratext, etc. Therefore, when literary criticism is concerned with the determination of a text, it is concerned with the distinguishing of the original information content from whatever has been added to or subtracted from that content as it appears in a given textual document."
]

In [0]:
tokens = []
for i in range(len(corpus)):
    sents = corpus[i].split(".")
    for j in range(len(sents)):
        tokens = tokens +sents[j].lower().split(" ")

In [0]:
# Create word to index mapping
my_dict = list(set(tokens))

In [0]:
word2idx={}
idx2word={}
for ind,token in enumerate(my_dict):
    word2idx[token]=ind
    idx2word[ind]=token

In [0]:
ss=len(my_dict)
zero_matrix=np.zeros((ss,ss))
for idx in range (len(tokens)-1):
    ind1 = word2idx[tokens[idx]]
    ind2 = word2idx[tokens[idx+1]]
    #print(tokens[idx],tokens[idx+1])
    zero_matrix[ind1,ind2]+=1

In [0]:
coocs = np.transpose(np.nonzero(zero_matrix))

In [0]:
# Weight function
def wf(x):
    if x < xmax:
        return (x/xmax)**alpha
    return 1

In [0]:
vocab_size=len(my_dict)
w_list_size=len(tokens)

# Set up word vectors and biases
l_embed, r_embed = [
    [Variable(torch.from_numpy(np.random.normal(0, 0.01, (embed_size, 1))),
        requires_grad = True) for j in range(vocab_size)] for i in range(2)]
l_biases, r_biases = [
    [Variable(torch.from_numpy(np.random.normal(0, 0.01, 1)), 
        requires_grad = True) for j in range(vocab_size)] for i in range(2)]

In [0]:
# Set up optimizer
optimizer = optim.Adam(l_embed + r_embed + l_biases + r_biases, lr = l_rate)

In [0]:
# Batch sampling function
def gen_batch():
    sample = np.random.choice(np.arange(len(coocs)), size=batch_size, replace=False)
    l_vecs, r_vecs, covals, l_v_bias, r_v_bias = [], [], [], [], []
    for chosen in sample:
        ind = tuple(coocs[chosen])
        l_vecs.append(l_embed[ind[0]])
        r_vecs.append(r_embed[ind[1]])
        covals.append(zero_matrix[ind])
        l_v_bias.append(l_biases[ind[0]])
        r_v_bias.append(r_biases[ind[1]])
    return l_vecs, r_vecs, covals, l_v_bias, r_v_bias

In [33]:
# Train model
for epoch in range(num_epochs):
    num_batches = int(w_list_size/batch_size)
    avg_loss = 0.0
    for batch in range(num_batches):
        optimizer.zero_grad()
        l_vecs, r_vecs, covals, l_v_bias, r_v_bias = gen_batch()
        loss = sum([torch.mul((torch.dot(l_vecs[i].view(-1), r_vecs[i].view(-1)) +
                l_v_bias[i] + r_v_bias[i] - np.log(covals[i]))**2,
                wf(covals[i])) for i in range(batch_size)])
        avg_loss += loss.data[0]/num_batches
        loss.backward()
        optimizer.step()
    print("Average loss for epoch "+str(epoch+1)+": ", avg_loss)

Average loss for epoch 1:  tensor(1.4222, dtype=torch.float64)
Average loss for epoch 2:  tensor(1.0693, dtype=torch.float64)


In [0]:
# Visualize embeddings
if embed_size == 2:
    # Pick some random words
    word_inds = np.random.choice(np.arange(len(my_dict)), size=10, replace=False)
    for word_ind in word_inds:
        # Create embedding by summing left and right embeddings
        w_embed = (l_embed[word_ind].data + r_embed[word_ind].data).numpy()
        x, y = w_embed[0][0], w_embed[1][0]
        plt.scatter(x, y)
        plt.annotate(my_dict[word_ind], xy=(x, y), xytext=(5, 2),
            textcoords='offset points', ha='right', va='bottom')
    plt.savefig("glove.png")