In [13]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

import nltk
from konlpy.tag import Kkma
# Shared KoNLPy Kkma tagger; its .morphs() method is used below to split Korean text into tokens.
kor_tagger = Kkma()

## Tokenize 

In [14]:
# Tokenize an English sentence with NLTK; punctuation and the clitic "'s" come out as separate tokens.
token = nltk.word_tokenize("Hi, my name is sungdong. What's your name?")
print(token)

['Hi', ',', 'my', 'name', 'is', 'sungdong', '.', 'What', "'s", 'your', 'name', '?']


In [16]:
# Tokenize a Korean sentence with the Kkma tagger.
# `token` is rebound here and is the list the vocabulary-building cell below iterates over.
token = kor_tagger.morphs("안녕하세요! 저는 파이토치를 공부하는 중입니다.")
print(token)

['안녕', '하', '세요', '!', '저', '는', '파이', '토치', '를', '공부', '하', '는', '중', '이', 'ㅂ니다', '.']


## Build Vocab 

In [26]:
# Build the vocabulary: every distinct token gets the next free integer index,
# in order of first appearance.
word2index = {}  # dictionary for indexing
for vo in token:
    # Membership test instead of comparing .get() against None.
    if vo not in word2index:
        word2index[vo] = len(word2index)
print(word2index)

{'하': 1, '.': 13, '를': 8, '중': 10, '안녕': 0, '파이': 6, '세요': 2, '토치': 7, '저': 4, '!': 3, '공부': 9, 'ㅂ니다': 12, '이': 11, '는': 5}


## One-hot Encoding 

In [27]:
def one_hot_encoding(word,word2index):
    """Return the one-hot vector of ``word`` under the vocabulary ``word2index``.

    Args:
        word: Token to encode; it must be a key of ``word2index``.
        word2index: Mapping from token to its integer position.

    Returns:
        A 1-D float tensor of length ``len(word2index)`` that is 1.0 at the
        position of ``word`` and 0.0 everywhere else.

    Raises:
        KeyError: If ``word`` is not in ``word2index``.
    """
    position = word2index[word]
    vector = torch.zeros(len(word2index))
    vector[position] = 1.
    return vector

In [30]:
# Encode "토치"; the printed vector has its single 1 at that word's index in word2index.
torch_vector = one_hot_encoding("토치",word2index)
print(torch_vector)


 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 14]



In [32]:
# Two different words have their 1s at different positions, so the dot product
# of their one-hot vectors is 0: one-hot encoding carries no notion of similarity.
py_vector = one_hot_encoding("파이",word2index)
py_vector.dot(torch_vector)

0.0

## Bag-of-Words 

In [1]:
# Training set: [sentence, label] pairs, seven labelled "FOOD" and seven labelled "MEDIA".
train_data = [["배고프다 밥줘","FOOD"],
                    ["뭐 먹을만한거 없냐","FOOD"],
                    ["맛집 추천","FOOD"],
                    ["이 근처 맛있는 음식점 좀","FOOD"],
                    ["밥줘","FOOD"],
                    ["뭐 먹지?","FOOD"],
                    ["삼겹살 먹고싶어","FOOD"],
                    ["영화 보고싶다","MEDIA"],
                    ["요즘 볼만한거 있어?","MEDIA"],
                    ["영화나 예능 추천","MEDIA"],
                    ["재밌는 드라마 보여줘","MEDIA"],
                    ["신과 함께 줄거리 좀 알려줘","MEDIA"],
                    ["고등랩퍼 다시보기 좀","MEDIA"],
                    ["재밌는 영상 하이라이트만 보여줘","MEDIA"]]

# Held-out sentences in the same [sentence, label] format, used only in the test section.
test_data = [["쭈꾸미 맛집 좀 찾아줘","FOOD"],
                   ["매콤한 떡볶이 먹고싶다","FOOD"],
                   ["강남 씨지비 조조 영화 스케줄표 좀","MEDIA"],
                   ["효리네 민박 보고싶엉","MEDIA"]]

### Preprocessing 

In [73]:
# Unzip the [sentence, label] pairs into a tuple of sentences and a tuple of labels.
train_X,train_y = list(zip(*train_data))

### 1. Tokenize 

In [74]:
# Replace each raw sentence with the list of tokens returned by the Kkma tagger.
# NOTE(review): this rebinds train_X from strings to token lists, so the cell is
# presumably not safe to run twice without re-running the unzip cell first — confirm.
train_X = [kor_tagger.morphs(x) for x in train_X] # Tokenize

In [75]:
# Display the tokenized training sentences (one token list per sentence).
train_X

[['배고프', '다', '밥', '주', '어'],
 ['뭐', '먹', '을', '만하', 'ㄴ', '거', '없', '냐'],
 ['맛', '집', '추천'],
 ['이', '근처', '맛있', '는', '음식', '점', '좀'],
 ['밥', '주', '어'],
 ['뭐', '먹', '지', '?'],
 ['삼겹살', '먹', '고', '싶', '어'],
 ['영화', '보', '고', '싶', '다'],
 ['요즘', '볼만', '하', 'ㄴ', '거', '있', '어', '?'],
 ['영화', '나', '예능', '추천'],
 ['재밌', '는', '드라마', '보여주', '어'],
 ['신', '과', '함께', '줄거리', '좀', '알려주', '어'],
 ['고등', '랩', '푸', '어', '다시', '보', '기', '좀'],
 ['재밌', '는', '영상', '하이라이트', '만', '보여주', '어']]

### 2. Build Vocab 

In [76]:
# Vocabulary over the training tokens. Index 0 is reserved for '<unk>' so that
# tokens never seen during training still have a slot in the BoW vector.
word2index = {'<unk>': 0}
# The loop variables are deliberately not called `token`: that name holds the
# token list used by the earlier vocabulary cell and must not be overwritten.
for sentence_tokens in train_X:
    for morph in sentence_tokens:
        if morph not in word2index:
            word2index[morph] = len(word2index)

# Label name -> class index used as the training target.
class2index = {'FOOD': 0, 'MEDIA': 1}
print(word2index)
print(class2index)

{'만': 52, '줄거리': 43, '밥': 3, '푸': 47, '과': 41, '맛': 14, '뭐': 6, '알려주': 44, '다시': 48, '기': 49, '있': 34, '함께': 42, '요즘': 31, '고등': 45, '음식': 21, '점': 22, '랩': 46, '근처': 18, '?': 25, '영화': 29, '싶': 28, '지': 24, '추천': 16, '영상': 50, '좀': 23, '재밌': 37, '맛있': 19, '하': 33, '<unk>': 0, '냐': 13, '집': 15, '어': 5, '배고프': 1, '만하': 9, '주': 4, '고': 27, '예능': 36, '없': 12, '보여주': 39, '하이라이트': 51, 'ㄴ': 10, '먹': 7, '드라마': 38, '삼겹살': 26, '나': 35, '볼만': 32, '보': 30, '거': 11, '다': 2, '신': 40, '이': 17, '을': 8, '는': 20}
{'FOOD': 0, 'MEDIA': 1}


### 3. Prepare tensor 

In [77]:
def make_BoW(seq,word2index):
    """Build a bag-of-words count vector for a token sequence.

    Args:
        seq: Iterable of tokens.
        word2index: Mapping from token to integer position. It must contain
            the key ``'<unk>'`` if ``seq`` can hold tokens missing from it.

    Returns:
        A 1-D float tensor of length ``len(word2index)`` in which entry ``i``
        is the number of tokens in ``seq`` mapped to index ``i``. Tokens that
        are not in ``word2index`` are counted at the ``'<unk>'`` position.

    Raises:
        KeyError: If an unknown token is met and ``'<unk>'`` is not a key of
            ``word2index``.
    """
    tensor = torch.zeros(len(word2index))
    for w in seq:
        index = word2index.get(w)
        if index is None:
            # Looked up only when needed, so a vocabulary without '<unk>'
            # still works as long as every token is known.
            index = word2index['<unk>']
        tensor[index] += 1.

    return tensor

In [78]:
# Stack one (1, vocab_size) bag-of-words row per sentence into the input matrix.
train_X = torch.cat(
    [Variable(make_BoW(tokens, word2index)).view(1, -1) for tokens in train_X]
)
# Concatenate the one-element class-index tensors into a single label vector.
train_y = torch.cat(
    [Variable(torch.LongTensor([class2index[label]])) for label in train_y]
)

In [79]:
# One row per training sentence, one column per vocabulary entry (including '<unk>').
print(train_X.size())

torch.Size([14, 53])


### 4. Modeling

In [80]:
class BoWClassifier(nn.Module):
    """Linear classifier over bag-of-words vectors.

    A single ``nn.Linear`` layer maps a vector of length ``vocab_size`` to
    ``output_size`` raw class scores. No softmax is applied here.
    """

    def __init__(self,vocab_size,output_size):
        """Create the linear layer.

        Args:
            vocab_size: Length of each input bag-of-words vector.
            output_size: Number of scores produced per input.
        """
        super().__init__()
        self.linear = nn.Linear(vocab_size,output_size)

    def forward(self,inputs):
        """Return the linear layer's output for ``inputs``."""
        scores = self.linear(inputs)
        return scores

### 5. Train 

In [102]:
STEP = 100  # number of gradient updates; each one uses the whole training set
LR = 0.1    # SGD learning rate
SEED = 1    # seed for the random weight initialisation

# Seed before building the model so every run starts from the same weights.
torch.manual_seed(SEED)
# Output size follows the label map instead of a hard-coded 2.
model = BoWClassifier(len(word2index),len(class2index))
# CrossEntropyLoss takes the model's raw scores and integer class indices.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=LR)

In [103]:
# Full-batch training: every step runs the whole training matrix through the model.
for step in range(STEP):
    # Clear gradients left over from the previous step before the new backward pass.
    model.zero_grad()
    preds = model(train_X)
    loss = loss_function(preds,train_y)
    if step % 10 == 0:
        # The loss is a 0-dim tensor; .item() returns it as a Python float.
        # (Indexing it with [0] raises an error on current PyTorch.)
        print(loss.item())
    loss.backward()
    optimizer.step()

0.7569531202316284
0.5769873857498169
0.45888298749923706
0.37734800577163696
0.3184089958667755
0.2741793394088745
0.23996441066265106
0.21282777190208435
0.19085267186164856
0.17274217307567596


### 6. Test 

In [111]:
# Reverse lookup (class index -> class name), used to print predictions as labels.
index2class = {v:k for k,v in class2index.items()}

In [115]:
# Classify each held-out sentence and print the prediction next to the true label.
for sentence, truth in test_data:
    # Same preprocessing as for training: tokenize, then count tokens into a
    # (1, vocab_size) row; tokens unseen in training are counted under '<unk>'.
    tokens = kor_tagger.morphs(sentence)
    bow = Variable(make_BoW(tokens,word2index)).view(1,-1)

    scores = model(bow)
    # max(1)[1] is the index of the highest score in the row; .item() turns the
    # one-element tensor into a plain int so it can be used as a dict key.
    pred = scores.max(1)[1].item()
    print("Input : %s" % sentence)
    print("Prediction : %s" % index2class[pred])
    print("Truth : %s" % truth)
    print("\n")

Input : 쭈꾸미 맛집 좀 찾아줘
Prediction : FOOD
Truth : FOOD


Input : 매콤한 떡볶이 먹고싶다
Prediction : FOOD
Truth : FOOD


Input : 강남 씨지비 조조 영화 스케줄표 좀
Prediction : MEDIA
Truth : MEDIA


Input : 효리네 민박 보고싶엉
Prediction : MEDIA
Truth : MEDIA


