# 3. GloVe: Global Vectors for Word Representation

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
* https://nlp.stanford.edu/pubs/glove.pdf

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
import matplotlib.pyplot as plt
import itertools
from gensim.models.word2vec import Text8Corpus

%matplotlib inline

In [3]:
USE_CUDA = torch.cuda.is_available()

In [4]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def make_vocab_vector(vocab, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], vocab))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else  Variable(torch.LongTensor(idxs))
    return tensor

def make_input_vector(input,word2index):
    tensor = Variable(torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]]))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

#make_input_vector(train_data[0][1], word2index)

## Load corpus : CoNLL 2000 corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

wget http://mattmahoney.net/dc/text8.zip -P /tmp<br>
unzip text8.zip

In [68]:
corpus = list(itertools.islice(Text8Corpus('../dataset/corpus/text8'),None))[:5]

In [6]:
nltk.corpus.conll2000.fileids()

['train.txt', 'test.txt']

In [7]:
corpus = list(nltk.corpus.conll2000.sents('train.txt'))[:1000]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Prepare train data 

In [69]:
vocab = list(set(flatten(corpus)))

In [70]:
len(vocab)

7792

In [71]:
%%time
word2index={}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)
        
index2word={v:k for k,v in word2index.items()}

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.17 ms


In [72]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

window_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        window_data.append((window[WINDOW_SIZE],window[i]))


### Build Co-occurence Matrix X

In [73]:
def weighting(w_i,w_j):
    try:
        x_ij = X_ik[(w_i,w_j)]
    except:
        x_ij = 1
        
    x_max = 100 #100 # fixed in paper
    alpha = 0.75
    
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha
    else:
        result = 1
    
    return result

In [74]:
X_i = Counter(flatten(corpus)) # X_i

In [75]:
X_ik_window_5 = Counter(window_data) # Co-occurece in window size 5

In [76]:
X_ik={}
weighting_dic={}

Because of model complexity, It is important to determine whether a tighter bound can be placed on the number of nonzero elements of X.

In [77]:
from itertools import combinations_with_replacement

In [78]:
for bigram in combinations_with_replacement(vocab, 2):
    if bigram in X_ik_window_5.keys(): # nonzero elements
        co_occer = X_ik_window_5[bigram]
        X_ik[bigram]=co_occer+1 # log(Xik) -> log(Xik+1) to prevent divergence
        X_ik[(bigram[1],bigram[0])]=co_occer+1
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0],bigram[1])
    weighting_dic[(bigram[1],bigram[0])] = weighting(bigram[1],bigram[0])

In [79]:
test = random.choice(window_data)
print(test)
try:
    print(X_ik[(test[0],test[1])]==X_ik[(test[1],test[0])])
except:
    1

('shall', 'the')
True


### Weighting Function 

### make train set 

In [81]:
u_p=[] # center vec
v_p=[] # context vec
co_p=[] # log(x_ij)
weight_p=[] # f(x_ij)

for pair in window_data: # 실은 여기서 vocab x vocab이어야 하지만..
    temp = make_input_vector(pair[0],word2index)
    u_p.append(temp.view(1,-1))
    temp = make_input_vector(pair[1],word2index)
    v_p.append(temp.view(1,-1))
    
    try:
        cooc = X_ik[pair]
    except:
        cooc = 1
    
    temp = torch.log(Variable(torch.Tensor([cooc]))).cuda() if USE_CUDA else torch.log(Variable(torch.Tensor([cooc])))
    co_p.append(temp.view(1,-1))
    temp = Variable(torch.Tensor([weighting_dic[pair]])).cuda() if USE_CUDA else Variable(torch.Tensor([weighting_dic[pair]]))
    weight_p.append(temp.view(1,-1))
                                  
train_data = list(zip(u_p,v_p,co_p,weight_p))
del u_p
del v_p
del co_p
del weight_p
print(train_data[0])

(Variable containing:
 3835
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
, Variable containing:
 4729
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
, Variable containing:
 0.6931
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
, Variable containing:
1.00000e-02 *
  5.3183
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
)


### Modeling 

In [82]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size,1)
        self.u_bias = nn.Embedding(vocab_size,1)
        
        initrange = (2.0 / (vocab_size+projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-initrange, initrange) # init
        self.v_bias.weight.data.uniform_(-initrange, initrange) # init
        self.u_bias.weight.data.uniform_(-initrange, initrange) # init
        
    def forward(self, inputs,targets,coocs,weights):
        center_embeds = self.embedding_v(inputs) # B x 1 x D
        context_embeds = self.embedding_u(targets) # B x 1 x D
        
        center_bias = self.v_bias(inputs).squeeze(1)
        context_bias = self.u_bias(targets).squeeze(1)
        
        inner_product = center_embeds.bmm(context_embeds.transpose(1,2)).squeeze(1) # Bx1
        
        loss = weights*torch.pow(inner_product +center_bias + context_bias - coocs,2)
        
        return torch.sum(loss)
    
    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs) # B x 1 x D
        u_embeds = self.embedding_u(inputs) # B x 1 x D
                
        return v_embeds+u_embeds # final embed

### Train 

In [83]:
PROJECTION = 50 # Embedding size
BATCH_SIZE = 256
STEP_SIZE = 50

In [84]:
losses = []
model = GloVe(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [85]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        model.zero_grad()

        loss = model(inputs,targets,coocs,weights)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 99.30
Step : 10, mean_loss : 1.57
Step : 20, mean_loss : 0.19
Step : 30, mean_loss : 0.14
Step : 40, mean_loss : 0.12


### Test 

In [86]:
from scipy.spatial.distance import euclidean, cosine

In [87]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(make_input_vector(target,word2index)).data.cpu().numpy()
    else:
        target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.cpu().numpy()
        else:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.numpy()
        #similarity.append([vocabs[i],np.dot(target_V,vector.T)[0][0]])
        similarities.append([vocab[i],cosine(target_V,vector)-1])
     
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [107]:
test = random.choice(list(vocab))
test

'original'

In [108]:
word_similarity(test,vocab)

[['historic', 0.52985090571123172],
 ['institutions', 0.51528080054999559],
 ['oman', 0.51192182815654541],
 ['qatar', 0.49771550991169367],
 ['athletics', 0.49280027896029033],
 ['mountainous', 0.48496822698777575],
 ['univ', 0.47659709787058979],
 ['behaved', 0.47069340673768933],
 ['statistical', 0.46401518878823866],
 ['heretics', 0.46206135905113332]]

# TODO 

* 이게 제대로 짠건지 검증을 못하겠네... 어케하지?
* model complexity 부분 좀 더 보기... 
* CoNLL-2003으로..

In [39]:
from glove import Corpus, Glove

In [40]:
cp = Corpus()

In [41]:
cp.fit(corpus, window=5)

In [43]:
len(cp.matrix.data)

69359

In [44]:
glove = Glove(no_components=50, learning_rate=0.05)

In [45]:
glove.fit(cp.matrix, epochs=30, no_threads=4, verbose=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [46]:
glove.add_dictionary(cp.dictionary)

In [66]:
glove.most_similar(test,10)

[('1.8470', 0.71143872269798525),
 ('break-up', 0.70466115535936069),
 ('mature', 0.69045816991044795),
 ('11th', 0.6800949534611328),
 ('players', 0.67941097775441706),
 ('perspective', 0.67904147363693679),
 ('naturally', 0.67851819839217864),
 ('quotes', 0.67562121808072884),
 ('third-largest', 0.67527938644901375)]