In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

In [3]:
flatten = lambda l:[item for sublist in l for item in sublist]
random.seed(119)

In [4]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [5]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data) #랜덤
    sindex = 0 
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size #추가추가
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

### Data load and Preprocessing

In [6]:
corpus = list(nltk. corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]
print(corpus[0:5])  #소문자로 전환

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'], ['etymology', '.'], ['(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', ')'], ['the', 'pale', 'usher', '--', 'threadbare', 'in', 'coat', ',', 'heart', ',', 'body', ',', 'and', 'brain', ';', 'i', 'see', 'him', 'now', '.'], ['he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', ',', 'with', 'a', 'queer', 'handkerchief', ',', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', '.']]


### Build vocab

In [7]:
vocab = list(set(flatten(corpus)))
print(vocab[0:5])

['attacks', 'pedestrians', 'hermit', 'perdition', 'vessels']


In [8]:
word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k,v in word2index.items()} 

In [9]:
WINDOW_SIZE=5
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + 
                                  ['<DUMMY>']*WINDOW_SIZE, WINDOW_SIZE*2+1))
                 for c in corpus])
print(windows[0])

('<DUMMY>', '<DUMMY>', '<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville')


In [10]:
window_data=[]
for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        window_data.append((window[WINDOW_SIZE], window[i]))

In [11]:
def weighting(w_i, w_j):
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1
        
    x_max = 100 #100: fixed in paper
    alpha = 0.75
    
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha
    else:
        result = 1
        
    return result

### Build Co-occurence Matrix X

In [12]:
X_i = Counter(flatten(corpus)) #각 단어의 count 수 

In [13]:
X_ik_window_5 = Counter(window_data) #Co-occurence in window size 5
## 각 window_data 갯수 >> window_data가 여러개 일수록 발생확률 up

In [14]:
X_ik = {}
weighting_dic={}

In [15]:
from itertools import combinations_with_replacement

In [23]:
for bigram in combinations_with_replacement(vocab,2):  #vocab: 소문자화 시킨 단어들
    if X_ik_window_5.get(bigram) is not None: #nonzero elements
        co_occer = X_ik_window_5[bigram]
        X_ik[bigram] = co_occer + 1 #log(X_ik) -> log(X_ik+1) to prevent divergence
        X_ik[(bigram[1], bigram[0])] = co_occer+1
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1])
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0])

In [27]:
test = random.choice(window_data)
print(test)
try:
    print(X_ik[(test[0], test[1])] == X_ik[(test[1], test[0])]) # 2
except:
    1

('blanket', '?')
True


### Prepare train data

In [35]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] 
                    if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) 
                    if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

In [38]:
u_p = [] # 중심벡터
v_p = [] # 주변벡터
co_p = [] # log(X_ij)
weight_p = [] # f(X_ij)

In [42]:
for pair in window_data:
    u_p.append(prepare_word(pair[0], word2index).view(1,-1))
    v_p.append(prepare_word(pair[1], word2index).view(1,-1))
    
    try:
        cooc = X_ik[pair] #동시발생확률?
    except:
        cooc = 1
        
    co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1,-1))
    weight_p.append(Variable(FloatTensor([weighting_dic[pair]])).view(1,-1))
    
train_data = list(zip(u_p, v_p, co_p, weight_p))
#del u_p
#del v_p
#del co_p
#del weight_p
print(train_data[0])  #tuple (중심벡터 i , 주변벡터 j log(X_ij), weight f(w_ij)

(Variable containing:
 652
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
, Variable containing:
 1412
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
, Variable containing:
 0.6931
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
, Variable containing:
1.00000e-02 *
  5.3183
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]
)


### GloVe Modeling

In [46]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(GloVe, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) #center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) #out embedding
        
        self.v_bias = nn.Embedding(vocab_size,1) #center bias
        self.u_bias = nn.Embedding(vocab_size,1) #context bias
        
        initrange = (6.0/ (vocab_size + projection_dim))**0.5 #Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) 
        self.embedding_u.weight.data.uniform_(-initrange, initrange) 
        self.v_bias.weight.data.uniform_(-initrange, initrange) 
        self.u_bias.weight.data.uniform_(-initrange, initrange) 
        
    def forward(self, center_words, target_words, coocs, weights):
        center_embeds = self.embedding_v(center_words) # B X 1 X D
        target_embeds = self.embedding_u(target_words) # B X 1 X D
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # B X 2
        
        loss = weights*torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        #torch.pow -> input값에 exp를 취해 tensor로 결과값 반환
        ##torch.pow(input, exponent, out=None) =>> 2 = exponent value
        
        return torch.sum(loss)
    
    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs) # B X 1 X D
        u_embeds = self.embedding_u(inputs) # B X 1 X D
        
        return v_embeds + u_embeds #final embed

### Train

In [47]:
EMBEDDING_SIZE = 50
BATCH_SIZE = 256
EPOCH = 50

In [48]:
losses = []
model = GloVe(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [50]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs = torch.cat(inputs) # B X 1
        targets = torch.cat(targets) # B X 1
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        model.zero_grad()
        
        loss = model(inputs, targets, coocs, weights)
        
        loss.backward()
        optimizer.step()
        
        losses.append(loss.data.tolist()[0])
        
    if epoch % 10 == 0:
        print('Epoch : %d, mean_loss : %.02f'%(epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 119.20
Epoch : 10, mean_loss : 1.18
Epoch : 20, mean_loss : 0.06
Epoch : 30, mean_loss : 0.03
Epoch : 40, mean_loss : 0.02


### Test

In [52]:
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    
    for i in range(len(vocab)):
        if vocab[i] == target:
            continue
            
        vector = model.prediction(prepare_word(list(vocab)[i], word2index)) 
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key = lambda x:x[1], reverse=True)[:10]

In [53]:
test = random.choice(list(vocab))
test

'against'

In [54]:
word_similarity(test, vocab)

[['forty', 0.7925418615341187],
 ['set', 0.7201284170150757],
 ['call', 0.7059496641159058],
 ['one', 0.7039071321487427],
 ['sing', 0.7036358118057251],
 ['ruin', 0.697799563407898],
 ['limber', 0.6831038594245911],
 ['high', 0.6786397099494934],
 ['raphael', 0.6782793402671814],
 ['emigrant', 0.6740805506706238]]