In [256]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

import torchtext
from torchtext.vocab import build_vocab_from_iterator

import torchdata

from konlpy.tag import Okt 
import pandas as pd
from tqdm.notebook import tqdm

In [257]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [258]:
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')

In [259]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [260]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [261]:
train_df = train_df.drop(['id'],axis=1)
test_df = test_df.drop(['id'],axis=1)

In [262]:
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","",regex=True)
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","",regex=True)
train_df.drop(train_df[train_df['document']==''].index, inplace=True)
test_df.drop(test_df[test_df['document']==''].index, inplace=True)

In [263]:
print('train 결측치 정보')
print(train_df.isna().sum())
train_df.dropna(inplace=True)
print('\n train결측치 정보')
print(train_df.isna().sum())

train 결측치 정보
document    5
label       0
dtype: int64

 train결측치 정보
document    0
label       0
dtype: int64


In [264]:
print('test 결측치 정보')
print(test_df.isna().sum())
test_df.dropna(inplace=True)
print('\ntest 결측치 정보')
print(test_df.isna().sum())

test 결측치 정보
document    3
label       0
dtype: int64

test 결측치 정보
document    0
label       0
dtype: int64


In [265]:
train_array,val_array = train_df.iloc[:int(len(train_df)*0.8),:].values, train_df.iloc[int(len(train_df)*0.8):,:].values
test_array = test_df.values

---
#### 불용어
---

In [266]:
train_array

array([['아 더빙 진짜 짜증나네요 목소리', 0],
       ['흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나', 1],
       ['너무재밓었다그래서보는것을추천한다', 0],
       ...,
       ['와 진짜 내가 왠만해서 진짜 씹노잼이다와 이거 점주는인간들 머지 도대체 영화를 볼줄아는건가 ㅋㅋ', 0],
       ['다빈씨 하늘에선 행복 하시길', 1],
       ['편 본사람은 완전실망비디오용도 고려 울아거들도 무표정', 0]], dtype=object)

In [267]:
tokenizer = Okt()

stop_words_file = open('stopwords_kor.txt','r')
stop_words_list = stop_words_file.read().split('\n')

def yield_tokenizer(array):
    for sentence,_ in tqdm(array):
        sentence = tokenizer.morphs(sentence)
        sentence = [word for word in sentence if not word in stop_words_list]
        yield sentence

vocab = build_vocab_from_iterator(iterator=yield_tokenizer(train_array), 
                                  specials=['<unk>', '<pad>'], 
                                  special_first=True, 
                                  max_tokens=5000)


  0%|          | 0/119348 [00:00<?, ?it/s]

In [268]:
def text_pipeline(sentence:str):
    output = list()
    sentence = tokenizer.morphs(sentence)
    for word in sentence:
        if word in vocab:
            output.append(word)
        else:
            output.append('<unk>')
    return vocab(output)

In [269]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for text,label in batch:
        label_list.append(label)
        processed_text = text_pipeline(text)
        processed_text = torch.tensor(processed_text, dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [270]:
vocab_length = len(vocab)
BATCH_SIZE = 64
train_loader = DataLoader(train_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_array, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

In [271]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embed = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.rnn = nn.RNN(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = n_layers,
                          dropout=dropout)
        
        self.fc_layer1 = nn.Sequential(
            nn.Linear(256,256),
            nn.ReLU()
            )
        
        self.drop = nn.Dropout(dropout)
        
        self.output_layer = nn.Sequential(
            nn.Linear(256, output_dim),
            nn.Sigmoid()
            )

    def forward(self,text,offsests):
        embedded = self.embed(text,offsests)
        output, hidden = self.rnn(embedded)
        output = output.view(output.shape[0], -1)
        output = self.fc_layer1(output)
        output = self.output_layer(output)
        return output

In [274]:
model = RNNClassifier(vocab_size=vocab_length,
                      embed_dim=64,
                      hidden_dim=256,
                      output_dim=2,
                      n_layers=2,
                      dropout=0.5).to(device)

In [277]:
EPOCHS = 5
LEARNING_RATE = 1e-5

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [278]:
for epoch in tqdm(range(20,20+EPOCHS)):
    model.train()
    epoch_avg_loss = 0.0
    epoch_correct = 0

    for labels, texts, offsets in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(texts, offsets)
        epoch_correct += (outputs.argmax(-1)==labels).sum().item()
        labels = nn.functional.one_hot(labels)
        labels = labels.type(torch.float32)
        batch_loss = loss_func(outputs,labels)
        epoch_avg_loss += batch_loss.item()/len(train_loader)
        batch_loss.backward()
        optimizer.step()
    epoch_accuracy = epoch_correct/len(train_array)*100
    
    # eval
    model.eval()
    val_avg_loss = 0.0
    val_correct = 0
    with torch.no_grad():
        for val_labels, val_texts, val_offsets in tqdm(val_loader):
            val_outputs = model(val_texts,val_offsets)
            val_correct += (val_outputs.argmax(-1)==val_labels).sum().item()
            val_labels = nn.functional.one_hot(val_labels)
            val_labels = val_labels.type(torch.float32)
            batch_loss = loss_func(val_outputs,val_labels)
            val_batch_loss = loss_func(val_outputs,val_labels)
            val_avg_loss += val_batch_loss.item()/len(val_loader)
        val_accuracy = val_correct/len(val_array)*100

    print(f'[EPOCH: {epoch+1:2}/{EPOCHS}] [TRAIN LOSS:{epoch_avg_loss:.5f}] [TRAIN ACCURACY:{epoch_accuracy:.2f}%] [VAL LOSS:{val_avg_loss:.5f}] [VAL ACCURACY: {val_accuracy:.2f}%]')

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH: 21/5] [TRAIN LOSS:0.37378] [TRAIN ACCURACY:83.22%] [VAL LOSS:0.40284] [VAL ACCURACY: 81.93%]


  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH: 22/5] [TRAIN LOSS:0.37284] [TRAIN ACCURACY:83.35%] [VAL LOSS:0.40311] [VAL ACCURACY: 81.89%]


  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH: 23/5] [TRAIN LOSS:0.37226] [TRAIN ACCURACY:83.38%] [VAL LOSS:0.40309] [VAL ACCURACY: 81.91%]


  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH: 24/5] [TRAIN LOSS:0.37162] [TRAIN ACCURACY:83.43%] [VAL LOSS:0.40276] [VAL ACCURACY: 81.91%]


  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH: 25/5] [TRAIN LOSS:0.37097] [TRAIN ACCURACY:83.42%] [VAL LOSS:0.40273] [VAL ACCURACY: 81.94%]


In [282]:
model.eval()
test_avg_loss = 0.0
test_correct = 0
with torch.no_grad():
    for test_labels, test_texts, test_offsets in tqdm(test_loader):
        test_outputs = model(test_texts, test_offsets)
        test_correct += (test_outputs.argmax(-1)==test_labels).sum().item()
        test_labels = nn.functional.one_hot(test_labels)
        test_labels = test_labels.type(torch.float32)
        batch_loss = loss_func(test_outputs, test_labels)
        test_batch_loss = loss_func(test_outputs, test_labels)
        test_avg_loss += test_batch_loss.item()/len(test_loader)
    test_accuracy = test_correct/len(test_array)*100
    print(f'[TEST LOSS:{test_avg_loss:.5f}] [TEST ACCURACY: {test_accuracy:.2f}%]')

  0%|          | 0/777 [00:00<?, ?it/s]

[TEST LOSS:0.40482] [TEST ACCURACY: 81.68%]


In [279]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embed = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.rnn = nn.GRU(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = n_layers,
                          dropout=dropout)
        
        self.fc_layer1 = nn.Sequential(
            nn.Linear(256,256),
            nn.ReLU()
            )
        
        self.drop = nn.Dropout(dropout)
        
        self.output_layer = nn.Sequential(
            nn.Linear(256, output_dim),
            nn.Sigmoid()
            )

    def forward(self,text,offsests):
        embedded = self.embed(text,offsests)
        output, hidden = self.rnn(embedded)
        output = output.view(output.shape[0], -1)
        output = self.fc_layer1(output)
        output = self.output_layer(output)
        return output

In [289]:
model2 = GRUClassifier(vocab_size=vocab_length,
                      embed_dim=64,
                      hidden_dim=256,
                      output_dim=2,
                      n_layers=2,
                      dropout=0.5).to(device)

In [290]:
EPOCHS = 5
LEARNING_RATE = 1e-4

loss_func1 = nn.BCELoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=LEARNING_RATE)

In [291]:
for epoch in tqdm(range(30)):
    model2.train()
    epoch_avg_loss = 0.0
    epoch_correct = 0

    for labels, texts, offsets in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model2(texts, offsets)
        epoch_correct += (outputs.argmax(-1)==labels).sum().item()
        labels = nn.functional.one_hot(labels)
        labels = labels.type(torch.float32)
        batch_loss = loss_func1(outputs,labels)
        epoch_avg_loss += batch_loss.item()/len(train_loader)
        batch_loss.backward()
        optimizer.step()
    epoch_accuracy = epoch_correct/len(train_array)*100
    
    # eval
    model2.eval()
    val_avg_loss = 0.0
    val_correct = 0
    with torch.no_grad():
        for val_labels, val_texts, val_offsets in tqdm(val_loader):
            val_outputs = model2(val_texts,val_offsets)
            val_correct += (val_outputs.argmax(-1)==val_labels).sum().item()
            val_labels = nn.functional.one_hot(val_labels)
            val_labels = val_labels.type(torch.float32)
            batch_loss = loss_func1(val_outputs,val_labels)
            val_batch_loss = loss_func(val_outputs,val_labels)
            val_avg_loss += val_batch_loss.item()/len(val_loader)
        val_accuracy = val_correct/len(val_array)*100

    print(f'[EPOCH: {epoch+1:2}/{30}] [TRAIN LOSS:{epoch_avg_loss:.5f}] [TRAIN ACCURACY:{epoch_accuracy:.2f}%] [VAL LOSS:{val_avg_loss:.5f}] [VAL ACCURACY: {val_accuracy:.2f}%]')

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

[EPOCH:  1/30] [TRAIN LOSS:0.65994] [TRAIN ACCURACY:58.79%] [VAL LOSS:0.60957] [VAL ACCURACY: 66.68%]


  0%|          | 0/1865 [00:00<?, ?it/s]

  0%|          | 0/467 [00:00<?, ?it/s]

KeyboardInterrupt: 