In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import nltk
import torchtext
from konlpy.tag import Kkma

# Kkma morphological analyzer instance; its .morphs() method is used later to tokenize the corpus.
tagger = Kkma()
import gensim

Using TensorFlow backend.


http://anie.me/On-Torchtext/

In [9]:
# Display the mapping of pretrained-embedding alias names exposed by torchtext.vocab.
torchtext.vocab.pretrained_aliases

{'charngram.100d': <function torchtext.vocab.<lambda>>,
 'fasttext.en.300d': <function torchtext.vocab.<lambda>>,
 'fasttext.simple.300d': <function torchtext.vocab.<lambda>>,
 'glove.42B.300d': <function torchtext.vocab.<lambda>>,
 'glove.6B.100d': <function torchtext.vocab.<lambda>>,
 'glove.6B.200d': <function torchtext.vocab.<lambda>>,
 'glove.6B.300d': <function torchtext.vocab.<lambda>>,
 'glove.6B.50d': <function torchtext.vocab.<lambda>>,
 'glove.840B.300d': <function torchtext.vocab.<lambda>>,
 'glove.twitter.27B.100d': <function torchtext.vocab.<lambda>>,
 'glove.twitter.27B.200d': <function torchtext.vocab.<lambda>>,
 'glove.twitter.27B.25d': <function torchtext.vocab.<lambda>>,
 'glove.twitter.27B.50d': <function torchtext.vocab.<lambda>>}

In [None]:
# glove = torchtext.vocab.GloVe('42B',300)

## Gensim으로 빠르게 Word vector 훈련시키기 

In [2]:
# Read the training corpus: one entry per line of data/corpus.txt.
# The context manager closes the file handle as soon as reading is done.
with open('data/corpus.txt', 'r', encoding="utf-8") as corpus_file:
    # Strip only the line terminator. Slicing with [:-1] would also cut the
    # last real character of a final line that has no trailing newline.
    corpus = [line.rstrip('\n') for line in corpus_file]

In [4]:
# Run the analyzer's morphs() over every corpus line; each line becomes one token list.
tokenized = list(map(tagger.morphs, corpus))

In [5]:
# Train a small Word2Vec model on the tokenized corpus.
model = gensim.models.Word2Vec(
    tokenized,
    size=30,      # dimensionality of each word vector
    window=5,     # context window size
    min_count=2,  # drop tokens seen fewer than 2 times
    workers=2,    # number of training threads
)

In [6]:
# Tokens most similar to "토치" according to the trained vectors.
# Query through `model.wv` (the KeyedVectors): calling most_similar() directly
# on the Word2Vec model is deprecated, and the following cells already use `model.wv`.
model.wv.most_similar("토치")

[('하', 0.972221851348877),
 ('는', 0.9705352187156677),
 ('이', 0.968646764755249),
 ('ㄴ', 0.9657939076423645),
 ('가', 0.9652295112609863),
 ('의', 0.9650665521621704),
 ('파이', 0.9618684649467468),
 ('쓰', 0.9588683843612671),
 ('어', 0.9586747884750366),
 ('은', 0.95864337682724)]

In [8]:
# Number of tokens in the trained model's vocabulary.
len(model.wv.index2word)

344

In [9]:
model.wv.save_word2vec_format("data/word_vector_sample.bin",binary=True) # save the word vectors to disk (binary word2vec format)

## KeyedVectors -> Numpy 

In [10]:
# Load the vectors saved above back in as a KeyedVectors object.
pretrained_vectors_model = gensim.models.KeyedVectors.load_word2vec_format(
    "data/word_vector_sample.bin",
    binary=True,
)

In [11]:
# Look up the stored vector for a single token.
pretrained_vectors_model['토치']

array([ 0.08037015, -0.01953446,  0.00954626, -0.06019327,  0.09531091,
        0.02431943,  0.01465431,  0.0041258 ,  0.02636751,  0.04721881,
       -0.03696634,  0.10945647, -0.06328395,  0.05889855, -0.0422646 ,
        0.01652068,  0.04054054, -0.09195574,  0.03961183,  0.03660725,
        0.03861563, -0.05273944, -0.03175397, -0.02610846, -0.02252244,
        0.00111329,  0.02500392, -0.012768  , -0.06229461, -0.04578288],
      dtype=float32)

In [14]:
# Tokens known to the loaded vectors, i.e. the vocabulary Word2Vec was trained with.
# The order of this list is the row order used when the vectors are stacked below.
vocab = list(pretrained_vectors_model.vocab)

In [17]:
# Collect each token's vector in `vocab` order and stack them row-wise
# into a single 2-D NumPy array (one row per token).
pretrained_vectors = np.vstack([pretrained_vectors_model[token] for token in vocab])

In [18]:
pretrained_vectors.shape # 344 vectors (the vocab size), each 30-dimensional

(344, 30)

In [19]:
# Confirm the stacked vectors are a NumPy array before handing them to PyTorch.
type(pretrained_vectors)

numpy.ndarray

## Init Embedding matrix 

In [22]:
class MyModel(nn.Module):
    """Minimal model consisting of a single embedding lookup table.

    Args:
        vocab_size: number of rows (tokens) in the embedding table.
        embed_size: dimensionality of each embedding vector.
    """
    def __init__(self,vocab_size,embed_size):
        super(MyModel,self).__init__()

        self.embed = nn.Embedding(vocab_size,embed_size)

    def init_embed(self,pretrained_vectors):
        """Load pretrained vectors into the embedding weights.

        The values are copied into the existing parameter storage, so the
        embedding never shares memory with the caller's array.

        Args:
            pretrained_vectors: array-like of shape (vocab_size, embed_size);
                values are converted to float32.

        Raises:
            ValueError: if the shape does not match the embedding table.
        """
        # np.array(...) always produces an independent float32 copy. Without it,
        # torch.from_numpy(...).float() on a float32 array aliases the caller's
        # buffer, so later weight updates would silently modify that array.
        vectors = np.array(pretrained_vectors, dtype=np.float32)

        # Check the shape explicitly: copy_() would otherwise broadcast a
        # compatible-but-wrong shape instead of failing.
        expected_shape = tuple(self.embed.weight.size())
        if vectors.shape != expected_shape:
            raise ValueError(
                "pretrained_vectors has shape {}, but the embedding expects {}".format(
                    vectors.shape, expected_shape))

        self.embed.weight.data.copy_(torch.from_numpy(vectors))

    def forward(self,inputs):
        """Return the embedding vectors for the given token indices."""
        return self.embed(inputs)

In [23]:
# Size the embedding table from the pretrained matrix itself rather than
# repeating the Word2Vec dimension (30) by hand, so the two cannot drift apart.
# NOTE: this rebinds `model`, which previously referred to the gensim Word2Vec model.
vocab_size, embed_size = pretrained_vectors.shape
model = MyModel(vocab_size, embed_size)

In [24]:
# Embedding weights right after construction, before the pretrained vectors are loaded.
model.embed.weight

Parameter containing:
 0.2121  0.5539  0.2633  ...  -0.5771 -0.2661  0.6630
 0.1623 -0.8270 -1.6250  ...   0.5148  1.9730  0.1250
-1.3807  0.8325  0.7771  ...  -0.4856  0.2845 -0.1415
          ...             ⋱             ...          
-0.1952  0.1424  0.9637  ...   0.2973 -0.6650  0.0787
 1.5263  1.4560  0.6995  ...   0.5904  0.9722  0.3209
 1.0373  1.5450 -2.1754  ...   1.6473  0.2741  0.9955
[torch.FloatTensor of size 344x30]

In [25]:
# Replace the embedding weights with the Word2Vec vectors.
model.init_embed(pretrained_vectors)

In [26]:
# Embedding weights after init_embed(): they should now show the pretrained values.
model.embed.weight

Parameter containing:
 2.0381e-02  7.0048e-03  1.3772e-02  ...  -1.4926e-02 -1.1164e-03  1.1190e-02
 4.1492e-02 -2.0101e-02 -1.0913e-02  ...   1.5439e-03 -2.0536e-02 -1.3507e-02
 7.2253e-03 -5.7071e-03 -1.1858e-02  ...  -1.0191e-02 -6.9318e-03  3.8166e-03
                ...                   ⋱                   ...                
 2.9367e-02 -1.2671e-02  6.2651e-03  ...  -3.0774e-03 -2.8645e-02 -2.8135e-02
-7.5866e-04  3.4414e-04  2.6011e-03  ...   6.4408e-03 -8.0483e-03 -1.6192e-03
 2.0476e-02  5.2759e-03  5.7881e-03  ...  -1.5944e-02 -5.3600e-03  8.7057e-03
[torch.FloatTensor of size 344x30]

## TODO 

- 다음 코퍼스로 Gensim을 이용해 Word Vector를 학습시킨 후(벡터 차원 수와 min_count 값은 자율)
- 하나의 임베딩 매트릭스를 가진 파이토치 모델의 파라미터로 로드하라
- (텐서보드에 임베딩된 단어들을 시각화하라)

In [2]:
# Take the first 500 sentences of Moby Dick from NLTK's Gutenberg corpus.
moby_dick_sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
corpus = moby_dick_sents[:500]

In [3]:
corpus[:3] # preview the first three sentences; each one is already a list of tokens

[['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']'],
 ['ETYMOLOGY', '.'],
 ['(',
  'Supplied',
  'by',
  'a',
  'Late',
  'Consumptive',
  'Usher',
  'to',
  'a',
  'Grammar',
  'School',
  ')']]