In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
from konlpy.tag import Kkma
from torchtext.data import Field,Iterator,Example, TabularDataset

http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

In [4]:
torchtext.__version__

'0.2.1'

<img src="image/torchtext.png">

## 1. Field 선언 

Field는 데이터의 전처리 파이프라인을 정의하는 클래스

http://torchtext.readthedocs.io/en/latest/data.html#field

### 문장 - 클래스 

In [20]:
tagger = Kkma()
tokenize = tagger.morphs

# tokenize => 함수를 넘겨줘야함
TEXT = Field(tokenize=tokenize,use_vocab=True,lower=True, include_lengths=False, batch_first=True) 
LABEL = Field(sequential=False,unk_token=None, use_vocab=True)

## 2. 데이터셋 로드

In [21]:
train_data, test_data = TabularDataset.splits(
                                   path="data/", # 데이터가 있는 root 경로
                                   train='train.txt', validation="test.txt",
                                   format='tsv', # \t로 구분
                                   #skip_header=True, # 헤더가 있다면 스킵
                                   fields=[('inputs',TEXT),('targets',LABEL)])

In [22]:
one_example = train_data.examples[0]

## 3. Vocabulary 구축

In [23]:
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [24]:
len(TEXT.vocab)

119

In [25]:
LABEL.vocab.itos

['FOOD', 'MEDIA']

## 4. Iterator 선언 

In [49]:
train_data

<torchtext.data.dataset.TabularDataset at 0x7f33cc420f28>

In [50]:
test_data

<torchtext.data.dataset.TabularDataset at 0x7f33cc57e748>

In [31]:
# make iterator for splits
train_iter, test_iter = Iterator.splits(
    (train_data, test_data), batch_size=3, device=-1, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.inputs),sort_within_batch=True,repeat=False) # x.TEXT 길이 기준으로 정렬

In [32]:
TEXT.vocab.itos[1]

'<pad>'

In [33]:
for batch in train_iter:
    print(batch.inputs)
    print(batch.targets)
    break

Variable containing:
   59    75   113     2    67    20    63    10
   95    73     9    12    17    16     1     1
   15    31    41     4     1     1     1     1
[torch.LongTensor of size 3x8]

Variable containing:
 1
 0
 1
[torch.LongTensor of size 3]



## 5. Modeling and Training 

In [54]:
class EmbedClassifier(nn.Module):
    def __init__(self,vocab_size,embedding_size,output_size):
        super(EmbedClassifier,self).__init__()
        
        # 각 단어의 임베딩을 평균해서 문장 단위의 임베딩 표현
        self.sentence_embed = nn.EmbeddingBag(vocab_size,embedding_size)
        self.linear = nn.Linear(embedding_size,output_size)
    
    def forward(self,inputs):
        outputs = self.sentence_embed(inputs)
        outputs = self.linear(outputs)
        return outputs

In [59]:
STEP=50
LR = 0.1

model = EmbedClassifier(len(TEXT.vocab),30,2)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=LR)

In [60]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(train_iter):
        inputs,lengths = batch.TEXT
        targets = batch.LABEL
        model.zero_grad()
        preds = model(inputs)
        loss = loss_function(preds,targets)
        losses.append(loss.data[0])
        loss.backward()
        optimizer.step()
    if step % 10==0:
        print(np.mean(losses))
        losses=[]

0.7632251190287727
0.4217357113957405
0.2671015997018133
0.16162016548748528
0.10611527439739023


## 6. Test

파라미터를 학습하기에 데이터수가 너무 적음..!!

In [61]:
for test in test_data.examples:
    input, length = TEXT.numericalize(([test.TEXT],[len(test.TEXT)]),train=False,device=-1)
    pred = model(input)
    pred = pred.max(1)[1]
    print(pred.data[0],test.LABEL)

0 FOOD
0 FOOD
0 MEDIA
0 MEDIA


### numericalize 

문장 ==> 인덱스에 맞는 Variable(LongTensor)로 변환

In [62]:
TEXT.numericalize(([test.TEXT],[len(test.TEXT)]),train=False,device=-1)

(Variable containing:
     0     0     0     0    20    12     0
 [torch.LongTensor of size 1x7], 
  7
 [torch.LongTensor of size 1])