In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torchtext
import nltk
from konlpy.tag import Mecab
from torchtext.data import Field,BucketIterator, TabularDataset, Dataset
import os
# Root directory that holds the datasets, read from the DATA_PATH environment variable
# (a KeyError is raised here if the variable is not set).
DATA_PATH = os.environ['DATA_PATH']
# Mecab tagger from konlpy; its `morphs` method is used as the tokenizer for the TEXT field.
tagger = Mecab()

## 데이터 로드 

Naver sentiment movie corpus v1.0 (https://github.com/e9t/nsmc)

In [2]:
# True when PyTorch reports a usable CUDA device.
USE_CUDA = torch.cuda.is_available()
# Device index handed to the torchtext iterators: 0 with CUDA, otherwise -1.
# NOTE(review): -1 presumably means "CPU" in the torchtext version this notebook targets — confirm.
DEVICE = 0 if USE_CUDA else -1

In [106]:
def pad_under_five(toknized):
    """
    Right-pad a token list with "<pad>" so that it holds at least five tokens.

    The model applies filters up to 5-grams wide, so a sentence shorter than
    five tokens must be filled up before it can be convolved.

    The list is padded in place and the same list object is returned;
    lists that already hold five or more tokens are returned untouched.
    """
    missing = 5 - len(toknized)
    if missing > 0:
        toknized.extend("<pad>" for _ in range(missing))
    return toknized

In [68]:
# Review text: tokenized into morphemes by Mecab, lower-cased, and padded to at
# least five tokens by pad_under_five; batches are requested batch-first.
TEXT = Field(
    tokenize=tagger.morphs,
    lower=True,
    include_lengths=False,
    batch_first=True,
    preprocessing=pad_under_five,
)
# Sentiment label: a single non-sequential value mapped through its own
# vocabulary, with no unknown-token entry.
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

In [69]:
# Load the NSMC train/test TSV files. The header row is skipped, the `id`
# column is dropped, and the remaining columns go through TEXT and LABEL.
train_data, test_data = TabularDataset.splits(
    path=DATA_PATH+'/NSMC/',
    train='ratings_train.txt',
    test='ratings_test.txt',
    format='tsv',
    skip_header=True,
    fields=[('id', None), ('text', TEXT), ('label', LABEL)],
    # Keep only examples whose token list is longer than one token.
    # NOTE(review): pad_under_five presumably runs before this predicate, which
    # would make every text at least five tokens long — confirm whether the
    # filter still removes anything.
    filter_pred=lambda x: len(x.text) > 1,
)

In [70]:
# Build the token vocabulary from the training split only, with a minimum token frequency of 2.
TEXT.build_vocab(train_data,min_freq=2)
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [72]:
# Show the sizes of the text vocabulary and the label vocabulary.
print(len(TEXT.vocab),len(LABEL.vocab))

29974 2


In [73]:
# Batch both splits with bucketing by token count, so each batch groups texts
# of similar length; batches are additionally sorted by length internally.
train_loader, test_loader = BucketIterator.splits(
    (train_data, test_data),
    batch_size=32,
    device=DEVICE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    repeat=False,
    shuffle=True,
)

In [74]:
# Pull a single batch from the training iterator and leave it bound to `batch`
# so its contents can be inspected while building the model.
for batch in train_loader:
    break

## TODO : 모델링 

In [76]:
class CNNClassifier(nn.Module):
    """
    Convolutional sentence classifier over token embeddings.

    Each token id is embedded and the embedded sentence is flattened into a
    single channel. One Conv1d per kernel size then slides over it one token
    at a time. Every feature map is max-pooled over time; the pooled features
    are concatenated, passed through dropout and projected by a linear layer.
    The result is squashed with a sigmoid so it can be fed to nn.BCELoss.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size D of each token embedding.
        output_size: number of output units (1 for binary sentiment).
        kernel_dim: number of feature maps per kernel size.
        kernel_sizes: n-gram widths K of the convolution filters.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Each filter spans K whole embeddings (kernel size K * D) and moves one
        # embedding per step (stride D), i.e. it acts as a (K, D) filter sliding
        # over tokens of the flattened sentence.
        self.convs = nn.ModuleList([nn.Conv1d(1, kernel_dim, embedding_dim * K, stride=embedding_dim) for K in kernel_sizes])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)

    def forward(self, inputs):
        """
        Compute class probabilities for a batch of token-id sequences.

        Args:
            inputs: integer tensor of shape (batch, seq_len); seq_len must be at
                least max(kernel_sizes), otherwise the widest filter does not fit.

        Returns:
            Tensor of shape (batch, output_size) with values in [0, 1].
        """
        embedded = self.embedding(inputs)                       # (batch, seq_len, D)
        # Lay the embeddings end to end in one channel so the strided Conv1d
        # filters see whole tokens: (batch, 1, seq_len * D).
        flattened = embedded.view(embedded.size(0), 1, -1)
        # Each conv yields (batch, kernel_dim, seq_len - K + 1); max over time
        # keeps the strongest response of every feature map.
        pooled = [F.relu(conv(flattened)).max(dim=2)[0] for conv in self.convs]
        features = self.dropout(torch.cat(pooled, dim=1))       # (batch, len(kernel_sizes) * kernel_dim)
        return torch.sigmoid(self.fc(features))

## 트레이닝 

In [77]:
# Number of passes over the training iterator.
EPOCH = 5
# NOTE(review): not referenced by the visible cells; the iterators are built with a literal batch_size=32.
BATCH_SIZE = 32
# Embedding dimension passed to CNNClassifier.
EMBED = 300
# n-gram widths of the convolution filters.
KERNEL_SIZES = [3,4,5]
# Number of feature maps per filter width.
KERNEL_DIM = 100
# Initial learning rate for Adam.
LR = 0.001

In [78]:
# Binary classifier: a single output unit over the TEXT vocabulary.
model = CNNClassifier(len(TEXT.vocab), EMBED, 1, KERNEL_DIM, KERNEL_SIZES)

if USE_CUDA:
    model = model.cuda()
    
# Binary cross-entropy on probabilities, so the model output must already lie in [0, 1].
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
# Multiply the learning rate by 0.1 once the scheduler's epoch counter reaches 3.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3], gamma=0.1)

In [79]:
# Train for EPOCH passes over train_loader, reporting the running mean loss
# and the current learning rate every 1000 batches.
model.train()  # enable dropout
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(train_loader):
        inputs, targets = batch.text, batch.label.float()
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
        model.zero_grad()
        preds = model(inputs)
        # preds is (batch, 1); drop the trailing unit dimension to match targets.
        loss = loss_function(preds.squeeze(1), targets)
        # .item() extracts the Python float from the scalar loss; indexing a
        # 0-dim tensor with [0] raises on current PyTorch.
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if i % 1000 == 0:
            # Read the learning rate from the optimizer itself; this is the value
            # actually applied in the update above. Note that the report at
            # i == 0 averages a single batch, since `losses` was just reset.
            current_lr = optimizer.param_groups[0]['lr']
            print("epoch : %d mean_loss : %.3f , lr : %.5f" % (epoch, np.mean(losses), current_lr))
            losses = []

    # Advance the schedule once per epoch, after the optimizer updates.
    # PyTorch >= 1.1 skips a schedule value if step() is called before the
    # first optimizer.step().
    scheduler.step()

epoch : 0 mean_loss : 0.754 , lr : 0.00100
epoch : 0 mean_loss : 0.524 , lr : 0.00100
epoch : 0 mean_loss : 0.434 , lr : 0.00100
epoch : 0 mean_loss : 0.403 , lr : 0.00100
epoch : 0 mean_loss : 0.391 , lr : 0.00100
epoch : 1 mean_loss : 0.574 , lr : 0.00100
epoch : 1 mean_loss : 0.329 , lr : 0.00100
epoch : 1 mean_loss : 0.338 , lr : 0.00100
epoch : 1 mean_loss : 0.337 , lr : 0.00100
epoch : 1 mean_loss : 0.325 , lr : 0.00100
epoch : 2 mean_loss : 0.314 , lr : 0.00100
epoch : 2 mean_loss : 0.278 , lr : 0.00100
epoch : 2 mean_loss : 0.286 , lr : 0.00100
epoch : 2 mean_loss : 0.295 , lr : 0.00100
epoch : 2 mean_loss : 0.295 , lr : 0.00100
epoch : 3 mean_loss : 0.264 , lr : 0.00010
epoch : 3 mean_loss : 0.226 , lr : 0.00010
epoch : 3 mean_loss : 0.218 , lr : 0.00010
epoch : 3 mean_loss : 0.218 , lr : 0.00010
epoch : 3 mean_loss : 0.211 , lr : 0.00010
epoch : 4 mean_loss : 0.031 , lr : 0.00010
epoch : 4 mean_loss : 0.196 , lr : 0.00010
epoch : 4 mean_loss : 0.199 , lr : 0.00010
epoch : 4 m

## 테스트 

### 정량적 테스트 : Accuracy 

In [82]:
# Measure accuracy on the test split: round each predicted probability to
# 0/1 and count how many match the labels.
model.eval()  # disable dropout
num_hit = 0
# No gradients are needed for evaluation; skipping them saves memory and time.
with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch.text, batch.label.float()
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
        preds = model(inputs).round()
        # squeeze(1) removes only the unit output dimension, so a final batch
        # of size 1 still compares (1,) against (1,).
        num_hit += torch.eq(preds.squeeze(1), targets).sum().item()

# Accuracy in percent over all test examples.
print(num_hit/len(test_data)*100)

86.45


### 정성적 테스트 

In [111]:
# Hand-written review snippets used to eyeball the classifier's predictions.
test_inputs = ["헐 진짜 개별로다..", "진짜 너무 재밌는 영화다 오랜만에","오..이건 진짜 봐야함", "진짜 쓰레기 같은 영화","노잼","존잼","꾸울잼","핵노잼"]

for test_input in test_inputs:
    # Tokenize and pad the same way as the training text.
    tokenized = pad_under_five(tagger.morphs(test_input))
    input_ = TEXT.numericalize([tokenized],device=-1)
    if USE_CUDA:
        input_ = input_.cuda()

    # A rounded probability of 1 is reported as positive, anything else as negative.
    is_positive = model(input_).round().data[0][0] == 1
    if is_positive:
        prediction, color = "긍정", "\033[1;01;36m"
    else:
        prediction, color = "부정", "\033[1;01;31m"
    # Print the input followed by the colored label, then reset the terminal color.
    print(test_input, color + prediction + "\033[0m")

헐 진짜 개별로다.. [1;01;31m부정[0m
진짜 너무 재밌는 영화다 오랜만에 [1;01;36m긍정[0m
오..이건 진짜 봐야함 [1;01;36m긍정[0m
진짜 쓰레기 같은 영화 [1;01;31m부정[0m
노잼 [1;01;31m부정[0m
존잼 [1;01;36m긍정[0m
꾸울잼 [1;01;36m긍정[0m
핵노잼 [1;01;31m부정[0m
