In [268]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

In [269]:
flatten = lambda l:[item for sublist in l for item in sublist]
#1차원으로 펴주기

In [270]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

In [271]:
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [272]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex=0
    eindex = batch_size
    
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex <= len(train_data):
        batch = train_data[sindex:]
        yield batch

## Data load and Preprocessing
### Gutenberg corpus

In [273]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [274]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:200]
#sampling sentences for test

In [275]:
corpus = [[word.lower() for word in sent] for sent in corpus] 
corpus
#전부 소문자로 변형

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
 ['etymology', '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')'],
 ['the',
  'pale',
  'usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'i',
  'see',
  'him',
  'now',
  '.'],
 ['he',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.'],
 ['he',
  'loved',
  'to',
  'dust',
  'his',
  'old',
  'grammars',
  ';',
  'it',
  'somehow',
  'mildly',
  'reminded',
  'him',
  'of',
  'his',
  'mortality',
  '.'],
 ['"',
  'while',
  'you',
  'take',
  'in',
  'hand',
  'to',
  'school',
  'others',
  ',',
  'and',
  'to',
  'te

### Extract Stopwords

In [276]:
word_count = Counter(flatten(corpus))
print(len(word_count))

1087


In [277]:
border = int(len(word_count)*0.01) #1%
stopwords = word_count.most_common()[:border] + list(reversed(word_count.most_common()))[:border]

word_count.most_common() -> 빈도 높은 순서대로 정렬

list(reversed(word_count.most_common())) -> 빈도 낮은 순서대로 정렬 (리스트 형식)

In [278]:
stopwords
#빈도 가장 높은 상위 1% 단어들 + 빈도 가장 낮은 하위 1% 단어들 리스트

[(',', 197),
 ('the', 150),
 ('.', 132),
 ('of', 84),
 ('and', 75),
 ('in', 72),
 ('a', 69),
 ('--', 66),
 ('to', 63),
 ('"', 62),
 ('stage', 1),
 ('introduced', 1),
 ('gazette', 1),
 ('berlin', 1),
 ('papers', 1),
 ('pilot', 1),
 ('cooper', 1),
 ('!"', 1),
 ('fellow', 1),
 ('butt', 1)]

In [279]:
stopwords = [s[0] for s in stopwords] # 단어만 확인
stopwords

[',',
 'the',
 '.',
 'of',
 'and',
 'in',
 'a',
 '--',
 'to',
 '"',
 'stage',
 'introduced',
 'gazette',
 'berlin',
 'papers',
 'pilot',
 'cooper',
 '!"',
 'fellow',
 'butt']

### Build vocab

In [280]:
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')
print(vocab)
#중복 처리를 위해 set 설정 이후 다시 리스트화
#전체 코퍼스 중에서 빈도 상,하위 1% 단어들(stopwords) 제거

['hval', 'chance', 'spouted', 'commentator', 'thing', 'baleen', 'besides', 'important', 'voice', 'jefferson', 'arch', 'monument', 'danish', 'london', 'frequently', 'setting', 'piercing', 'compare', 'velocity', 'fierce', 'lins', 'bespeak', 'consumptive', 'spencer', 'stretched', 'things', 'insect', 'spiral', 'fangs', 'proceeded', 'mariner', 'size', 'harpooneers', 'unwieldy', 'come', 'asiatics', 'cetology', 'however', 'empty', 'could', 'john', 'sir', 'phil', 'appeared', 'pacific', 'together', 'anglo', 'huge', 'erromangoan', 'magnitude', 'creation', 'be', 'so', 'leaving', 'richardson', 'green', 'almost', 'belly', 'hoops', 'nations', 'these', 'including', 'an', 'saith', 'eyes', 'worm', 'raphael', 'what', 'into', 'plainly', 'dung', 'towards', 'sub', 'flies', 'gondibert', 'opening', 'goldsmith', 'whale', 'gay', 'veritable', 'raising', 'voracious', 'narrative', 'herrings', 'noble', 'seethe', 'raimond', 'icelandic', 'street', 'wantonness', 'swiftness', 'handkerchief', 'speak', 'hand', 'ibid', '

In [281]:
print(len(set(flatten(corpus))))
print(len(vocab))

1087
1068


In [282]:
word2index = {'<UNK>':0} #vocab 생성 시작

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
#만약 단어가 vocab 안에 없으면 vocab에 새로운 index 단어로 추가

In [283]:
index2word = {v:k for k,v in word2index.items()}
index2word

{0: '<UNK>',
 1: 'hval',
 2: 'chance',
 3: 'spouted',
 4: 'commentator',
 5: 'thing',
 6: 'baleen',
 7: 'besides',
 8: 'important',
 9: 'voice',
 10: 'jefferson',
 11: 'arch',
 12: 'monument',
 13: 'danish',
 14: 'london',
 15: 'frequently',
 16: 'setting',
 17: 'piercing',
 18: 'compare',
 19: 'velocity',
 20: 'fierce',
 21: 'lins',
 22: 'bespeak',
 23: 'consumptive',
 24: 'spencer',
 25: 'stretched',
 26: 'things',
 27: 'insect',
 28: 'spiral',
 29: 'fangs',
 30: 'proceeded',
 31: 'mariner',
 32: 'size',
 33: 'harpooneers',
 34: 'unwieldy',
 35: 'come',
 36: 'asiatics',
 37: 'cetology',
 38: 'however',
 39: 'empty',
 40: 'could',
 41: 'john',
 42: 'sir',
 43: 'phil',
 44: 'appeared',
 45: 'pacific',
 46: 'together',
 47: 'anglo',
 48: 'huge',
 49: 'erromangoan',
 50: 'magnitude',
 51: 'creation',
 52: 'be',
 53: 'so',
 54: 'leaving',
 55: 'richardson',
 56: 'green',
 57: 'almost',
 58: 'belly',
 59: 'hoops',
 60: 'nations',
 61: 'these',
 62: 'including',
 63: 'an',
 64: 'saith',
 65

### Prepare train data

In [284]:
WINDOW_SIZE = 3
windows = flatten(list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE,
                                  WINDOW_SIZE*2+1)) for c in corpus)
windows

[('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
 ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman'),
 ('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville'),
 ('[', 'moby', 'dick', 'by', 'herman', 'melville', '1851'),
 ('moby', 'dick', 'by', 'herman', 'melville', '1851', ']'),
 ('dick', 'by', 'herman', 'melville', '1851', ']', '<DUMMY>'),
 ('by', 'herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>'),
 ('herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', '<DUMMY>', '(', 'supplied', 'by', 'a'),
 ('<DUMMY>', '<DUMMY>', '(', 'supplied', 'by', 'a', 'late'),
 ('<DUMMY>', '(', 'supplied', 'by', 'a', 'late', 'consumptive'),
 ('(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher'),
 ('supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to'),
 ('by', 'a', 'late', 'c

nltk.ngrams(word_list, n)

In [285]:
train_data=[]
for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': #중심 단어이거나 dummy 값일 경우
            continue
        train_data.append((window[WINDOW_SIZE], window[i])) #append(중심단어, 주변단어)
        
print(train_data[:WINDOW_SIZE*2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [286]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w:word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

In [287]:
X_p=[]
y_p=[]

In [288]:
for tr in train_data:  
    X_p.append(prepare_word(tr[0], word2index).view(1,-1)) #중심단어 
    y_p.append(prepare_word(tr[1], word2index).view(1,-1)) #주변단어

a = Variable(torch.LongTensor(word2index[ '[' ]))

print(len(a))

In [289]:
train_data[0]

('[', 'moby')

In [290]:
train_data = list(zip(X_p, y_p))

print(train_data[0])
print('='*40)
print(len(train_data))

(Variable containing:
 400
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
, Variable containing:
 614
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
)
17062


### Modeling

In [291]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # v : 중심단어 벡터
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # u : 주변단어 벡터
        
        self.embedding_v.weight.data.uniform_(-1,1) #init
        self.embedding_u.weight.data.uniform_(0,0) #init
        #self.out = nn.Linear(projection_dim, vocab_size)
        
    def forward(self, center_words, target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B X 1 X D  #중심단어
        target_embeds = self.embedding_u(target_words) # B X 1 X D  #주변단어 중 하나 
        outer_embeds = self.embedding_u(outer_words) # B X V X D  #주변단어 
        
        scores = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # BX1XD * BXDX1  => BX1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) #BXVXD * BXDX1  => BXV
        
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores),1).unsqueeze(1))) # log-softmax
        
        return nll # negative log likeligood
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

#### torch.uniform(a,b) -> (a,b) 균일분포에 따라 임의의 실수를 구한다?

#### torch.squeeze() -> 차원의 크기가 1인 차원을 제거
#### torch.unsqueeze() -> 차원의 크기가 1인 차원을 원하는 차원에 생성

### Train

In [292]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

In [293]:
losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)

if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01) #lr = learning rate

In [294]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B X 1
        targets = torch.cat(targets) #B X 1
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab)) #B X V
        model.zero_grad()
        
        loss = model(inputs, targets, vocabs)
        
        loss.backward()
        optimizer.step()
        losses.append(loss.data.tolist()[0])
        
    if epoch % 10 == 0:
        print('Epoch: %d, mean_loss: %.02f' % (epoch,np.mean(losses)))
        losses=[]

Epoch: 0, mean_loss: 6.29
Epoch: 10, mean_loss: 4.38
Epoch: 20, mean_loss: 3.70
Epoch: 30, mean_loss: 3.58
Epoch: 40, mean_loss: 3.54
Epoch: 50, mean_loss: 3.52
Epoch: 60, mean_loss: 3.51
Epoch: 70, mean_loss: 3.50
Epoch: 80, mean_loss: 3.49
Epoch: 90, mean_loss: 3.49


#### torch.cat() -> 두개의 텐서 붙이기 (차원의 크기가 같아야 한다.)
dim=0 : 아래로    /    dim=1 : 오른쪽으로

### Test

In [295]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        
        if vocab[i] == target: continue
            
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector - model.prediction(prepare_word(list(vocab)[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x:x[1], reverse=True)[:10] # sort by similarity

In [296]:
test = random.choice(list(vocab))
test

'sprat'

In [297]:
word_similarity(test,vocab)

[['alfred', 0.6732354164123535],
 ['property', 0.6466714143753052],
 ['isaiah', 0.6361171007156372],
 ['asiatics', 0.6240059733390808],
 ['mariner', 0.6085211038589478],
 ['above', 0.6060542464256287],
 ['revenue', 0.6056627035140991],
 ['grounded', 0.6027454137802124],
 ['swam', 0.5989528298377991],
 ['boats', 0.5958307385444641]]