In [99]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

import torchdata

from konlpy.tag import Okt 
import pandas as pd
from tqdm.notebook import tqdm
import time

In [100]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [101]:
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')

In [102]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [103]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [104]:
train_df = train_df.drop(['id'],axis=1)
test_df = test_df.drop(['id'],axis=1)

In [105]:
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","",regex=True)
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","",regex=True)
train_df.drop(train_df[train_df['document']==''].index, inplace=True)
test_df.drop(test_df[test_df['document']==''].index, inplace=True)

In [106]:
print('train 결측치 정보')
print(train_df.isna().sum())
train_df.dropna(inplace=True)
print('\n train결측치 정보')
print(train_df.isna().sum())

train 결측치 정보
document    5
label       0
dtype: int64

 train결측치 정보
document    0
label       0
dtype: int64


In [107]:
print('test 결측치 정보')
print(test_df.isna().sum())
test_df.dropna(inplace=True)
print('\ntest 결측치 정보')
print(test_df.isna().sum())

test 결측치 정보
document    3
label       0
dtype: int64

test 결측치 정보
document    0
label       0
dtype: int64


In [108]:
train_array,val_array = train_df.iloc[:int(len(train_df)*0.8),:].values, train_df.iloc[int(len(train_df)*0.8):,:].values
test_array = test_df.values

---
#### 불용어
---

In [109]:
train_array

array([['아 더빙 진짜 짜증나네요 목소리', 0],
       ['흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나', 1],
       ['너무재밓었다그래서보는것을추천한다', 0],
       ...,
       ['와 진짜 내가 왠만해서 진짜 씹노잼이다와 이거 점주는인간들 머지 도대체 영화를 볼줄아는건가 ㅋㅋ', 0],
       ['다빈씨 하늘에선 행복 하시길', 1],
       ['편 본사람은 완전실망비디오용도 고려 울아거들도 무표정', 0]], dtype=object)

In [110]:
tokenizer = Okt()

stop_words_file = open('stopwords_kor.txt','r')
stop_words_list = stop_words_file.read().split('\n')

def yield_tokenizer(array):
    for sentence,_ in tqdm(array):
        sentence = tokenizer.morphs(sentence)
        sentence = [word for word in sentence if not word in stop_words_list]
        yield sentence

vocab = build_vocab_from_iterator(iterator=yield_tokenizer(train_array), 
                                  specials=['<pad>', '<unk>'], 
                                  special_first=True, 
                                  max_tokens=5000)


  0%|          | 0/119348 [00:00<?, ?it/s]

In [111]:
def text_pipeline(sentence:str):
    output = list()
    sentence = tokenizer.morphs(sentence)
    for word in sentence:
        if word in vocab:
            output.append(word)
        else:
            output.append('<unk>')
    return vocab(output)

In [112]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for text,label in batch:
        label_list.append(label)
        processed_text = text_pipeline(text)
        processed_text = torch.tensor(processed_text, dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [152]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [153]:
vocab_size = len(vocab)
model = GRU(1, 256, vocab_size, 128, 2, 0.5).to(device)

In [154]:
EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_BATCHS = len(train_array)/BATCH_SIZE

In [155]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=0.1)

train_loader = DataLoader(train_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

In [156]:
for epoch in tqdm(range(EPOCHS)):
    model.train()
    epoch_correct = 0
    epoch_loss = 0.0

    for labels, texts, offsets in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(texts, offsets)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_correct += (outputs.argmax(-1)==labels).sum().item()
        epoch_loss += loss/NUM_BATCHS
    epoch_accuracy = epoch_correct/len(train_array)*100

    model.eval()
    val_correct = 0
    val_avg_loss = 0.0
    with torch.no_grad():
        for val_labels, val_texts, val_offsets in tqdm(val_loader):
            val_outputs = model(val_texts, val_offsets)
            val_loss = loss_function(val_outputs, val_labels)
            val_correct += (val_outputs.argmax(dim=-1) == val_labels).sum().item()
            val_avg_loss += val_loss/int(len(val_array)/BATCH_SIZE)
        val_accuracy = val_correct/len(val_array)*100
    print(f'[EPOCH {epoch+1}/{EPOCHS}] [ACCURACY {round(epoch_accuracy,2)}%] [LOSS {round(epoch_loss.item(),5)}] [VAL ACCURACY {round(val_accuracy,5)}] [VAL LOSS {round(val_avg_loss.item(),5)}]') 

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1865 [00:00<?, ?it/s]

TypeError: forward() takes 2 positional arguments but 3 were given