# 3. GloVe: Global Vectors for Word Representation

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
* https://nlp.stanford.edu/pubs/glove.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
import matplotlib.pyplot as plt
import itertools
from gensim.models.word2vec import Text8Corpus

%matplotlib inline

Using TensorFlow backend.


In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+1449c2c
3.2.4


In [3]:
USE_CUDA = torch.cuda.is_available()

In [4]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def make_vocab_vector(vocab, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], vocab))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else  Variable(torch.LongTensor(idxs))
    return tensor

def make_input_vector(input,word2index):
    tensor = Variable(torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]]))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

#make_input_vector(train_data[0][1], word2index)

## Load corpus : CoNLL 2000 corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

wget http://mattmahoney.net/dc/text8.zip -P /tmp<br>
unzip text8.zip

In [6]:
nltk.corpus.conll2000.fileids()

['train.txt', 'test.txt']

In [7]:
corpus = list(nltk.corpus.conll2000.sents('train.txt'))[:1000]
corpus = [[word.lower() for word in sent] for sent in corpus]

In [8]:
len(corpus)

1000

In [6]:
corpus = list(itertools.islice(Text8Corpus('/tmp/text8'),None))[:5]

### Prepare train data 

In [9]:
vocab = list(set(flatten(corpus)))

In [10]:
len(vocab)

4558

In [11]:
word2index={}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)
        
index2word={v:k for k,v in word2index.items()}

In [12]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

window_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        window_data.append((window[WINDOW_SIZE],window[i]))


### Build Co-occurence Matrix X

In [13]:
X_i = Counter(flatten(corpus)) # X_i

In [14]:
X_ik_window_5 = Counter(window_data) # Co-occurece in window size 5

In [15]:
X_ik={}

Because of model complexity, It is important to determine whether a tighter bound can be placed on the number of nonzero elements of X.

In [16]:
for vi in vocab:
    X_ik[vi]={}
    for vk in vocab:
        target = (vi,vk)
        if target in X_ik_window_5.keys(): # nonzero elements
            co_occer = X_ik_window_5[target]
            X_ik[vi][vk]=co_occer+1 # log(Xik) -> log(Xik+1) to prevent divergence
        else:
            X_ik[vi][vk]=1
            continue

In [17]:
test = random.choice(window_data)
print(test)
print(X_ik[test[0]][test[1]]==X_ik[test[1]][test[0]])

('demand', '70')
True


### Weighting Function 

In [18]:
def weighting(w_i,w_j):
    x_ij = X_ik[w_i][w_j]
    x_max = 100 #100 # fixed in paper
    alpha = 0.75
    
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha
    else:
        result = 1
    
    return result

In [19]:
weighting_dic={}

In [20]:
for w_i in X_ik.keys():
    weighting_dic[w_i]={}
    for w_j in X_ik[w_i].keys():
        weighting_dic[w_i][w_j] = weighting(w_i,w_j)

In [21]:
len(X_ik) # < O(|V^2|) 

4558

### make train set 

In [None]:
u_p=[] # center vec
v_p=[] # context vec
co_p=[] # log(x_ij)
weight_p=[] # f(x_ij)

for w_i in X_ik.keys():
    for w_j in X_ik.keys():
        temp = make_input_vector(w_i,word2index)
        u_p.append(temp.view(1,-1))
        temp = make_input_vector(w_j,word2index)
        v_p.append(temp.view(1,-1))
        temp = torch.log(Variable(torch.Tensor([X_ik[w_i][w_j]]))).cuda if USE_CUDA else torch.log(Variable(torch.Tensor([X_ik[w_i][w_j]])))
        co_p.append(temp.view(1,-1))
        temp = Variable(torch.Tensor([weighting_dic[w_i][w_j]])).cuda() if USE_CUDA else Variable(torch.Tensor([weighting_dic[w_i][w_j]]))
        weight_p.append(temp.view(1,-1))
    
train_data = list(zip(u_p,v_p,co_p,weight_p))
print(train_data[0])

### Modeling 

In [84]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size,1)
        self.u_bias = nn.Embedding(vocab_size,1)
        
        initrange = (2.0 / (vocab_size+projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-initrange, initrange) # init
        self.v_bias.weight.data.uniform_(-initrange, initrange) # init
        self.u_bias.weight.data.uniform_(-initrange, initrange) # init
        
    def forward(self, inputs,targets,coocs,weights):
        center_embeds = self.embedding_v(inputs) # B x 1 x D
        context_embeds = self.embedding_u(targets) # B x 1 x D
        
        center_bias = self.v_bias(inputs).squeeze(1)
        context_bias = self.u_bias(targets).squeeze(1)
        
        inner_product = center_embeds.bmm(context_embeds.transpose(1,2)).squeeze(1) # Bx1
        
        loss = weights*torch.pow(inner_product +center_bias + context_bias - coocs,2)
        
        return torch.sum(loss)
    
    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs) # B x 1 x D
        u_embeds = self.embedding_u(inputs) # B x 1 x D
                
        return v_embeds+u_embeds # final embed

### Train 

In [85]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 128
STEP_SIZE = 50

In [86]:
losses = []
model = GloVe(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [87]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        model.zero_grad()

        loss = model(inputs,targets,coocs,weights)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 18.76
Step : 10, mean_loss : 2.21
Step : 20, mean_loss : 0.86
Step : 30, mean_loss : 0.33
Step : 40, mean_loss : 0.23


### Test 

In [55]:
from scipy.spatial.distance import euclidean, cosine

In [56]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(make_input_vector(target,word2index)).data.cpu().numpy()
    else:
        target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.cpu().numpy()
        else:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.numpy()
        #similarity.append([vocabs[i],np.dot(target_V,vector.T)[0][0]])
        similarities.append([vocab[i],cosine(target_V,vector)])
     
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [88]:
test = random.choice(list(vocab))
test

'times'

In [89]:
word_similarity(test,vocab)

[['october', 1.5907794111726647],
 ['freedoms', 1.5839901299081265],
 ['text', 1.5307233472804076],
 ['benson', 1.5257806489806165],
 ['300-a-share', 1.5212695405406258],
 ['freefall', 1.5198946796552519],
 ['ownership', 1.493166502678333],
 ['increasing', 1.4883770922655608],
 ['public', 1.4601176949709649],
 ['interpretations', 1.4579647795458324]]

# TODO 

* 이게 제대로 짠건지 검증을 못하겠네... 어케하지?
* model complexity 부분 좀 더 보기... 
* CoNLL-2003으로..

In [27]:
from glove import Corpus, Glove

In [31]:
cp = Corpus()

In [43]:
cp.fit(corpus, window=5)

In [44]:
glove = Glove(no_components=30, learning_rate=0.05)

In [45]:
cp.matrix

<4558x4558 sparse matrix of type '<class 'numpy.float64'>'
	with 69359 stored elements in COOrdinate format>

In [49]:
glove.fit(cp.matrix, epochs=30, no_threads=4, verbose=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [50]:
glove.add_dictionary(cp.dictionary)

In [95]:
test = random.choice(vocab)
test

'consist'

In [96]:
glove.most_similar(test,10)

[('midland', 0.8882721994231817),
 ('awful', 0.88121606196395497),
 ('charges', 0.8703808715710657),
 ('2.3', 0.86826721700041876),
 ('prevalent', 0.86458589178613077),
 ('dogged', 0.86454535645361885),
 ('nonpriority', 0.85623096074598981),
 ('heir', 0.8536517433282913),
 ('sign', 0.85201640079848728)]

In [97]:
word_similarity(test,vocab)

[['ranges', 1.5860707637730798],
 ['observers', 1.552180761069093],
 ['couple', 1.5496846987832402],
 ['friendly', 1.5461075304161658],
 ['putting', 1.5237145149685019],
 ['non-controlling', 1.5160237250838196],
 ['projections', 1.5040663254531168],
 ['story', 1.4871240647420989],
 ['features', 1.484685517512311],
 ['one-time', 1.4832189985658848]]