In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
from konlpy.tag import Kkma
from torchtext.data import Field,Iterator,Example, TabularDataset

참고 자료 (torchtext 튜토리얼): http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

In [2]:
# Show the installed torchtext version; the API used below is version-specific.
torchtext.__version__

'0.2.1'

<img src="image/torchtext.png">

## 1. Field 선언 

Field는 데이터의 전처리 파이프라인을 정의하는 클래스

http://torchtext.readthedocs.io/en/latest/data.html#field

### 문장 - 클래스 

In [39]:
# Kkma analyzer from konlpy; its `morphs` method serves as the tokenizer.
tagger = Kkma()
tokenize = tagger.morphs


def preprocessing(x):
    """Map a raw label string to a class id: "FOOD" -> 0, anything else -> 1."""
    if x == "FOOD":
        return 0
    return 1


# Sentence field: tokenized with Kkma, lower-cased, mapped through a vocabulary.
# include_lengths=True makes each batch carry an (indices, lengths) pair and
# batch_first=True puts the batch dimension first.
TEXT = Field(
    tokenize=tokenize,
    use_vocab=True,
    lower=True,
    include_lengths=True,
    batch_first=True,
)

# Label field: a single non-sequential value, already numeric after
# `preprocessing`, so no vocabulary is built for it.
LABEL = Field(
    sequential=False,
    use_vocab=False,
    preprocessing=preprocessing,
)

## 2. 데이터셋 로드

In [40]:
# Load the train and test splits from tab-separated files under data/.
# The held-out file is passed as `test=` (it was previously routed through the
# `validation=` slot even though it is bound to `test_data`); `splits` returns
# the non-None datasets in (train, validation, test) order, so the unpacking
# below is unchanged.
train_data, test_data = TabularDataset.splits(
    path="data/",        # root directory that holds the data files
    train='train.txt',
    test="test.txt",
    format='tsv',        # columns are separated by \t
    # skip_header=True,  # enable if the files start with a header row
    fields=[('TEXT', TEXT), ('LABEL', LABEL)],
)

In [41]:
# Take the first training example to inspect what the fields produced.
one_example = train_data.examples[0]

In [42]:
# TEXT attribute of the example: the token list produced by the TEXT field.
one_example.TEXT

['배고프', '다', '밥', '주', '어']

In [43]:
# LABEL attribute of the example: the class id produced by the LABEL field's preprocessing.
one_example.LABEL

0

## 3. Vocabulary 구축

In [44]:
# Build the TEXT vocabulary from the training split only.
TEXT.build_vocab(train_data)

In [45]:
# Vocabulary size; the listing of TEXT.vocab.itos shows it includes <unk> and <pad>.
len(TEXT.vocab)

54

In [46]:
# Index-to-token table: position i holds the token whose vocabulary index is i.
TEXT.vocab.itos

['<unk>',
 '<pad>',
 '어',
 '는',
 '먹',
 '좀',
 '?',
 'ㄴ',
 '거',
 '고',
 '다',
 '뭐',
 '밥',
 '보',
 '보여주',
 '싶',
 '영화',
 '재밌',
 '주',
 '추천',
 '고등',
 '과',
 '근처',
 '기',
 '나',
 '냐',
 '다시',
 '드라마',
 '랩',
 '만',
 '만하',
 '맛',
 '맛있',
 '배고프',
 '볼만',
 '삼겹살',
 '신',
 '알려주',
 '없',
 '영상',
 '예능',
 '요즘',
 '을',
 '음식',
 '이',
 '있',
 '점',
 '줄거리',
 '지',
 '집',
 '푸',
 '하',
 '하이라이트',
 '함께']

## 4. Iterator 선언 

In [78]:
# make iterator for splits
def _text_length(example):
    """Sort key: number of tokens in the example's TEXT field."""
    return len(example.TEXT)


# Create one iterator per split.
train_iter, test_iter = Iterator.splits(
    (train_data, test_data),
    batch_size=3,
    device=-1,               # author's note: -1 = CPU, 0 = an available GPU
    sort_key=_text_length,   # order examples by TEXT length
    sort_within_batch=True,
    repeat=False,
)

In [102]:
# Peek at the first batch only, then stop iterating.
for batch in train_iter:
    # batch.TEXT is an (indices, lengths) pair because TEXT uses include_lengths=True.
    text_part, label_part = batch.TEXT, batch.LABEL
    print(text_part)
    print(label_part)
    break

(Variable containing:
 35   4   9  15   2
 17   3  27  14   2
 31  49  19   1   1
[torch.LongTensor of size 3x5]
, 
 5
 5
 3
[torch.LongTensor of size 3]
)
Variable containing:
 0
 1
 0
[torch.LongTensor of size 3]



## 5. Modeling and Training 

In [60]:
class EmbedClassifier(nn.Module):
    """Bag-of-embeddings sentence classifier.

    Each sentence is reduced to a single vector by an ``nn.EmbeddingBag``
    (the original author's note: the word embeddings are averaged into a
    sentence-level embedding), and a linear layer maps that vector to
    ``output_size`` scores.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each embedding / sentence vector.
        output_size: number of output scores per sentence.
    """

    def __init__(self, vocab_size, embedding_size, output_size):
        super().__init__()

        # Pools the embeddings of every index in a row into one vector.
        self.sentence_embed = nn.EmbeddingBag(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, inputs):
        """Return the raw (un-normalised) class scores for a batch of index rows.

        NOTE(review): ``inputs`` is passed to the EmbeddingBag as-is, so any
        padding indices in a row appear to be pooled together with the real
        tokens -- confirm whether that is intended.
        """
        sentence_vectors = self.sentence_embed(inputs)
        return self.linear(sentence_vectors)

In [124]:
# Training configuration: every tunable value is named here rather than being
# passed as a bare literal.
SEED = 42            # seeds torch's RNG so parameter initialisation is repeatable
STEP = 50            # number of passes over train_iter
LR = 0.1             # SGD learning rate
EMBEDDING_SIZE = 20  # dimensionality of the sentence embedding
NUM_CLASSES = 2      # the LABEL preprocessing yields class ids 0 and 1

# Only torch's generator is seeded here; NOTE(review): batch shuffling inside
# the torchtext iterator may use a different RNG -- confirm if exact
# run-to-run reproducibility is required.
torch.manual_seed(SEED)

model = EmbedClassifier(len(TEXT.vocab), EMBEDDING_SIZE, NUM_CLASSES)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)

In [125]:
# Train for STEP passes over train_iter and report the mean batch loss of
# every 10th pass.
for step in range(STEP):
    losses = []  # per-batch loss values of the current pass
    for batch in train_iter:
        # TEXT uses include_lengths=True, so batch.TEXT is an
        # (indices, lengths) pair; the lengths are not needed by the model.
        inputs, _ = batch.TEXT
        targets = batch.LABEL

        model.zero_grad()
        preds = model(inputs)
        loss = loss_function(preds, targets)

        # `loss.data[0]` only works on old PyTorch, where the loss is a
        # 1-element tensor; newer releases return a 0-dim tensor and need
        # `.item()`. Use whichever this installation provides.
        losses.append(loss.item() if hasattr(loss, "item") else loss.data[0])

        loss.backward()
        optimizer.step()

    if step % 10 == 0:
        print(np.mean(losses))

0.782355523109436
0.4344502389431
0.40328202545642855
0.27725195586681367
0.2750774651765823


## 6. Test

파라미터를 학습하기에는 데이터 수가 너무 적음: 아래 결과처럼 테스트 문장 4개가 모두 클래스 1로 예측되어 절반만 맞음.

In [126]:
# Predict a class for every test example and print "<prediction> <gold label>".
for test in test_data.examples:
    # Numericalize one tokenized sentence as a batch of size 1. The result is
    # an (indices, lengths) pair; only the indices are fed to the model.
    # The name `input` is avoided so the builtin is not overwritten in the
    # kernel namespace.
    test_inputs, _ = TEXT.numericalize(
        ([test.TEXT], [len(test.TEXT)]), train=False, device=-1)
    scores = model(test_inputs)
    # max(1) returns (values, indices); the index of the best score is the class.
    pred = scores.max(1)[1]
    print(pred.data[0], test.LABEL)

1 0
1 0
1 1
1 1


### numericalize 

토큰화된 문장을 vocabulary 인덱스로 이루어진 Variable(LongTensor)로 변환. vocabulary에 없는 토큰은 `<unk>`의 인덱스 0으로 바뀜.

In [127]:
# Numericalize the last test example. It is selected explicitly instead of
# relying on the `test` loop variable left over from the evaluation cell.
last_example = test_data.examples[-1]
TEXT.numericalize(
    ([last_example.TEXT], [len(last_example.TEXT)]), train=False, device=-1)

(Variable containing:
     0     0     0     0    13     9     0
 [torch.LongTensor of size 1x7], 
  7
 [torch.LongTensor of size 1])