In [8]:
!pip install torchtext==0.9.0 --user



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from torchtext.legacy import data
import random
import numpy as np
import pandas as pd
from konlpy.tag import Kkma
kkma = Kkma()

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1e8ff5026d0>

In [8]:
KERNEL_SIZE = 5

def tokenizer(text):
    token = kkma.morphs(text)
    if len(token) < KERNEL_SIZE:
            for i in range(0, KERNEL_SIZE-len(token)):
                token.append('<PAD>')
    return token

In [12]:
REVIEW = data.Field(tokenize = tokenizer, batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

fields = {'review': ('review', REVIEW), 'label': ('label',LABEL)}

train_data, test_data = data.TabularDataset.splits(path = './', train = 'train_data.csv', test = 'test_data.csv',format = 'csv', fields = fields)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))
print(vars(train_data[3]))

훈련 샘플의 개수 : 279
테스트 샘플의 개수 : 99
{'review': ['페이스', '허', '거', '같', '음', 'ㅋㅋㅋㅋㅋ'], 'label': '0'}


In [16]:
MAX_VOCAB_SIZE = 1000

REVIEW.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = 'fasttext.simple.300d', unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

print('단어 집합의 크기 : {}'.format(len(REVIEW.vocab)))
print(REVIEW.vocab.stoi)

단어 집합의 크기 : 1002
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001E8A6650550>>, {'<unk>': 0, '<pad>': 1, '이': 2, '하': 3, 'ㄴ': 4, '는': 5, '.': 6, '고': 7, '영화': 8, '다': 9, '보': 10, '도': 11, '었': 12, '어': 13, '아': 14, '의': 15, '은': 16, '에': 17, '을': 18, '..': 19, '...': 20, 'ㄹ': 21, '가': 22, '<PAD>': 23, '았': 24, '들': 25, '있': 26, '것': 27, '지': 28, ',': 29, '없': 30, '나': 31, '게': 32, '음': 33, '더': 34, '를': 35, '정말': 36, '는데': 37, '!': 38, '네요': 39, '안': 40, '기': 41, '만': 42, '?': 43, 'ㅁ': 44, '거': 45, 'ㄴ다': 46, '같': 47, '면': 48, '않': 49, '어요': 50, '에서': 51, '연기': 52, '너무': 53, '되': 54, '로': 55, '....': 56, '싶': 57, 'ㅂ니다': 58, '진짜': 59, '수': 60, '어서': 61, '재밌': 62, '나오': 63, '라': 64, '만들': 65, '아니': 66, '알': 67, '잘': 68, '점': 69, '좋': 70, '~': 71, '감동': 72, '내용': 73, '네': 74, '습니다': 75, '왜': 76, '주': 77, '남': 78, '완전': 79, '좀': 80, '지만': 81, 'ㄴ데': 82, '감독': 83, '겠': 84, '때': 85, '말': 86, '모르': 87, '못': 88, '사람': 89, '아서': 90, '오': 91, '요': 92, 

In [18]:
BATCH_SIZE = 16

train_loader, valid_loader, test_loader = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size = BATCH_SIZE, sort_key = lambda x: len(x.review), sort_within_batch = True)

In [27]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, embedding_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=0)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        embed = self.dropout(self.embedding(x))
        output, _ = self.rnn(embed)
        output = self.linear(output[:, -1, :])
        return output
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [28]:
INPUT_DIM = len(REVIEW.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1
DROPOUT = 0.5

In [29]:
model = LSTM(INPUT_DIM, 32, OUTPUT_DIM, EMBEDDING_DIM, DROPOUT)

In [30]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [31]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds==y).float()
    acc = correct.sum() / len(correct)
    return acc

In [40]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.review).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [44]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [45]:
def train(model, iterator, optimizer, critertion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.review).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [46]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, train_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.687 | Train Acc: 54.22%
	 Val. Loss: 0.673 |  Val. Acc: 61.26%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.680 | Train Acc: 62.30%
	 Val. Loss: 0.663 |  Val. Acc: 65.77%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.678 | Train Acc: 61.36%
	 Val. Loss: 0.656 |  Val. Acc: 64.38%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.681 | Train Acc: 56.50%
	 Val. Loss: 0.646 |  Val. Acc: 66.47%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.671 | Train Acc: 59.77%
	 Val. Loss: 0.635 |  Val. Acc: 71.23%


In [49]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_kernels, kernel_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1, out_channels = n_kernels, kernel_size = (ksize, embedding_dim)) for ksize in kernel_sizes])
        self.fc = nn.Linear(len(kernel_sizes)*n_kernels, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, review):
        embedded = self.embedding(review)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        res = self.fc(cat)
        return self.fc(cat)

In [50]:
N_KERNELS = 10
KERNEL_SIZES = [3,4,5]
PAD_IDX = REVIEW.vocab.stoi[REVIEW.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_KERNELS, KERNEL_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

print('모델 파라미터 수 : ', sum(param.numel() for param in model.parameters() if param.requires_grad))

모델 파라미터 수 :  336661


In [51]:
UNK_IDX = REVIEW.vocab.stoi[REVIEW.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [56]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, train_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.727 | Train Acc: 53.42%
	 Val. Loss: 0.692 |  Val. Acc: 52.13%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.744 | Train Acc: 46.97%
	 Val. Loss: 0.693 |  Val. Acc: 51.09%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.729 | Train Acc: 54.66%
	 Val. Loss: 0.693 |  Val. Acc: 51.44%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.702 | Train Acc: 52.13%
	 Val. Loss: 0.692 |  Val. Acc: 52.13%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.715 | Train Acc: 53.17%
	 Val. Loss: 0.693 |  Val. Acc: 50.40%
