In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

import nltk
from konlpy.tag import Kkma
kor_tagger = Kkma()

## Tokenize 

In [2]:
# Split an English sentence into word and punctuation tokens with NLTK.
token = nltk.word_tokenize("Hi, my name is sungdong. What's your name?")
print(token)

['Hi', ',', 'my', 'name', 'is', 'sungdong', '.', 'What', "'s", 'your', 'name', '?']


In [3]:
# Split a Korean sentence into morphemes with the Kkma tagger.
# `token` is rebound here and is the list the first vocabulary-building cell iterates over.
token = kor_tagger.morphs("안녕하세요! 저는 파이토치를 공부하는 중입니다.")
print(token)

['안녕', '하', '세요', '!', '저', '는', '파이', '토치', '를', '공부', '하', '는', '중', '이', 'ㅂ니다', '.']


## Build Vocab 

In [4]:
token

['안녕',
 '하',
 '세요',
 '!',
 '저',
 '는',
 '파이',
 '토치',
 '를',
 '공부',
 '하',
 '는',
 '중',
 '이',
 'ㅂ니다',
 '.']

In [5]:
# Toy vocabulary: map every distinct morpheme in `token` to an integer id,
# numbered in order of first appearance.
word2index = {}
for morph in token:
    # Membership test instead of `.get(...) == None`: only unseen morphemes
    # receive a new id, repeated ones keep the id they already have.
    if morph not in word2index:
        word2index[morph] = len(word2index)
print(word2index)

{'공부': 9, '파이': 6, '세요': 2, '하': 1, 'ㅂ니다': 12, '는': 5, '안녕': 0, '이': 11, '를': 8, '!': 3, '.': 13, '중': 10, '저': 4, '토치': 7}


## One-hot Encoding 

In [8]:
def one_hot_encoding(word, word2index):
    """Return the one-hot vector of ``word`` under the vocabulary ``word2index``.

    The result is a 1-D float tensor with ``len(word2index)`` entries: 1.0 at
    the position ``word2index[word]`` and 0.0 everywhere else.

    Raises:
        KeyError: if ``word`` is not a key of ``word2index``.
    """
    position = word2index[word]
    vector = torch.zeros(len(word2index))
    vector[position] = 1.
    return vector

In [9]:
# One-hot vector for "토치": a single 1 at that word's vocabulary index, 0 elsewhere.
torch_vector = one_hot_encoding("토치",word2index)
print(torch_vector)


 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 14]



In [10]:
# One-hot vectors of two different words have their 1 at different positions,
# so their dot product is 0: one-hot encoding carries no notion of word similarity.
py_vector = one_hot_encoding("파이",word2index)
py_vector.dot(torch_vector)

0.0

## Bag-of-Words 

In [11]:
# Labelled utterances for intent classification; each row is [sentence, label].
# The label is either "FOOD" or "MEDIA".
train_data = [
    ["배고프다 밥줘", "FOOD"],
    ["뭐 먹을만한거 없냐", "FOOD"],
    ["맛집 추천", "FOOD"],
    ["이 근처 맛있는 음식점 좀", "FOOD"],
    ["밥줘", "FOOD"],
    ["뭐 먹지?", "FOOD"],
    ["삼겹살 먹고싶어", "FOOD"],
    ["영화 보고싶다", "MEDIA"],
    ["요즘 볼만한거 있어?", "MEDIA"],
    ["영화나 예능 추천", "MEDIA"],
    ["재밌는 드라마 보여줘", "MEDIA"],
    ["신과 함께 줄거리 좀 알려줘", "MEDIA"],
    ["고등랩퍼 다시보기 좀", "MEDIA"],
    ["재밌는 영상 하이라이트만 보여줘", "MEDIA"],
]

# Held-out utterances in the same [sentence, label] layout, used only for evaluation.
test_data = [
    ["쭈꾸미 맛집 좀 찾아줘", "FOOD"],
    ["매콤한 떡볶이 먹고싶다", "FOOD"],
    ["강남 씨지비 조조 영화 스케줄표 좀", "MEDIA"],
    ["효리네 민박 보고싶엉", "MEDIA"],
]

### Preprocessing 

In [12]:
# Transpose the [sentence, label] rows into one tuple of sentences and one tuple of labels.
train_X, train_y = zip(*train_data)

### 1. Tokenize 

In [15]:
# Tokenise straight from the raw sentences in train_data instead of from train_X.
# The original read and rebound train_X in the same statement, so running the cell
# a second time fed already-tokenised lists back into the tagger.
train_X = [kor_tagger.morphs(sentence) for sentence, _label in train_data] # Tokenize

In [16]:
train_X

[['배고프', '다', '밥', '주', '어'],
 ['뭐', '먹', '을', '만하', 'ㄴ', '거', '없', '냐'],
 ['맛', '집', '추천'],
 ['이', '근처', '맛있', '는', '음식', '점', '좀'],
 ['밥', '주', '어'],
 ['뭐', '먹', '지', '?'],
 ['삼겹살', '먹', '고', '싶', '어'],
 ['영화', '보', '고', '싶', '다'],
 ['요즘', '볼만', '하', 'ㄴ', '거', '있', '어', '?'],
 ['영화', '나', '예능', '추천'],
 ['재밌', '는', '드라마', '보여주', '어'],
 ['신', '과', '함께', '줄거리', '좀', '알려주', '어'],
 ['고등', '랩', '푸', '어', '다시', '보', '기', '좀'],
 ['재밌', '는', '영상', '하이라이트', '만', '보여주', '어']]

### 2. Build Vocab 

In [19]:
# Vocabulary for the classifier. This rebinds `word2index`, replacing the toy
# vocabulary built in the one-hot section. Id 0 is reserved for '<unk>', the
# bucket make_BoW uses for words that never appeared in the training sentences.
word2index = {'<unk>': 0}
for sentence_tokens in train_X:
    # The loop variable is deliberately not called `token`: that name holds the
    # morpheme list used by the earlier cells and must not be overwritten here.
    for word in sentence_tokens:
        if word not in word2index:
            word2index[word] = len(word2index)

# Integer id for each intent label.
class2index = {'FOOD' : 0, 'MEDIA' : 1}
print(word2index)
print(class2index)

{'<unk>': 0, '밥': 3, '맛있': 19, '다시': 48, '하이라이트': 51, '뭐': 6, '추천': 16, '만하': 9, '냐': 13, '요즘': 31, '푸': 47, '예능': 36, '는': 20, '좀': 23, '신': 40, '먹': 7, '과': 41, '보': 30, '나': 35, '다': 2, '알려주': 44, '지': 24, '싶': 28, '랩': 46, '점': 22, '기': 49, '거': 11, '줄거리': 43, '있': 34, '함께': 42, '집': 15, '드라마': 38, 'ㄴ': 10, '어': 5, '고등': 45, '맛': 14, '고': 27, '만': 52, '하': 33, '을': 8, '음식': 21, '이': 17, '재밌': 37, '삼겹살': 26, '?': 25, '없': 12, '볼만': 32, '배고프': 1, '영화': 29, '보여주': 39, '근처': 18, '주': 4, '영상': 50}
{'MEDIA': 1, 'FOOD': 0}


In [39]:
len(word2index)

53

### 3. Prepare tensor 

In [23]:
# NOTE(review): `result` is not assigned in any cell visible in this notebook, so this
# cell presumably raises NameError on a fresh "Restart & Run All" — confirm, then define or remove it.
result

In [25]:
# Out-of-vocabulary lookup: dict.get returns None for a key that is not in the
# vocabulary, which is the case make_BoW maps onto the '<unk>' index.
word2index.get("패스트")

In [26]:
def make_BoW(seq,word2index):
    """Turn a token sequence into a bag-of-words count vector.

    Args:
        seq: iterable of tokens (e.g. the morphemes of one sentence).
        word2index: dict mapping each known token to its integer position.
            It must contain the key '<unk>' if ``seq`` can hold tokens that
            are missing from the vocabulary.

    Returns:
        A 1-D float tensor of length ``len(word2index)`` in which entry ``i``
        is the number of tokens in ``seq`` that map to index ``i``. Tokens
        missing from ``word2index`` are all counted at ``word2index['<unk>']``.

    Raises:
        KeyError: if an unknown token is met and '<unk>' is not in ``word2index``.
    """
    tensor = torch.zeros(len(word2index))
    for w in seq:
        index = word2index.get(w)
        # `is None` rather than `!= None`: identity is the correct test for the
        # singleton and cannot be confused by index 0 or a custom __eq__.
        if index is None:
            # Looked up lazily so a vocabulary without '<unk>' still works
            # as long as every token is known.
            index = word2index['<unk>']
        tensor[index] += 1.

    return tensor

In [27]:
# Vectorise each tokenised sentence into a (1, vocab_size) row and concatenate the
# rows into one matrix; the labels become a 1-D LongTensor of class ids.
# NOTE: both names are rebound from Python sequences to tensors here, so this cell
# must not be re-run on its own — re-run the cells that create train_X/train_y first.
train_X = torch.cat([Variable(make_BoW(x,word2index)).view(1,-1) for x in train_X])
train_y = torch.cat([Variable(torch.LongTensor([class2index[y]])) for y in train_y])

In [28]:
print(train_X.size())

torch.Size([14, 53])


### 4. Modeling

In [30]:
train_X.size()

torch.Size([14, 53])

In [31]:
class BoWClassifier(nn.Module):
    """Linear classifier over bag-of-words vectors.

    A single fully connected layer maps an input of width ``vocab_size`` to
    ``output_size`` unnormalised class scores.
    """

    def __init__(self, vocab_size, output_size):
        super().__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=output_size)

    def forward(self, inputs):
        """Return the raw class scores for ``inputs``; no softmax is applied here."""
        logits = self.linear(inputs)
        return logits

### 5. Train 

In [36]:
STEP = 100  # number of full-batch gradient steps
LR = 0.1    # SGD learning rate
SEED = 0    # seed for PyTorch's random number generator

# nn.Linear draws its initial weights at random; seeding first makes the printed
# losses and the predictions repeatable across kernel restarts.
torch.manual_seed(SEED)

# The number of outputs follows the label set instead of a hard-coded 2.
model = BoWClassifier(len(word2index),len(class2index))
loss_function = nn.CrossEntropyLoss()  # takes raw scores and integer class ids
optimizer = optim.SGD(model.parameters(),lr=LR)

In [37]:
# Full-batch gradient descent: every step uses the whole training matrix.
for step in range(STEP):
    model.zero_grad()  # clear gradients accumulated by the previous step
    preds = model(train_X)
    loss = loss_function(preds,train_y)
    if step % 10 == 0:
        # On current PyTorch the loss is a 0-dim tensor and `loss.data[0]` raises
        # IndexError; `.item()` is the supported way to read the scalar. The
        # fallback keeps the cell working on the old releases that lack `.item()`.
        print(loss.item() if hasattr(loss, "item") else loss.data[0])
    loss.backward()
    optimizer.step()

0.7061382532119751
0.5515249967575073
0.44623735547065735
0.37121906876564026
0.31569892168045044
0.2733078598976135
0.24009300768375397
0.21349629759788513
0.1918020397424698
0.17382386326789856


### 6. Test 

In [34]:
# Invert class2index so a predicted class id can be mapped back to its label name.
index2class = {class_id: label for label, class_id in class2index.items()}

In [38]:
# Classify each held-out sentence and print the prediction next to the true label.
for sentence, truth in test_data:
    tokens = kor_tagger.morphs(sentence)
    # Shape the count vector as a single-row batch, matching the training matrix layout.
    features = Variable(make_BoW(tokens,word2index)).view(1,-1)

    scores = model(features)
    # max(1)[1] is the arg-max class per row. On current PyTorch `.data[0]` is a
    # 0-dim tensor, which is not a valid key for index2class, so convert it to a
    # plain int (a no-op on old releases where it is already a Python int).
    pred = int(scores.max(1)[1].data[0])
    print("Input : %s" % sentence)
    print("Prediction : %s" % index2class[pred])
    print("Truth : %s" % truth)
    print("\n")

Input : 쭈꾸미 맛집 좀 찾아줘
Prediction : FOOD
Truth : FOOD


Input : 매콤한 떡볶이 먹고싶다
Prediction : FOOD
Truth : FOOD


Input : 강남 씨지비 조조 영화 스케줄표 좀
Prediction : MEDIA
Truth : MEDIA


Input : 효리네 민박 보고싶엉
Prediction : MEDIA
Truth : MEDIA


