### 다음 참고자료를 바탕으로 작성되었습니다.
### https://github.com/lih0905/korean-pytorch-sentiment-analysis

In [None]:
# Install the Korean NLP package KoNLPy and the MeCab morphological analyzer
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/SOMJANG/Mecab-ko-for-Google-Colab/refs/heads/master/install_mecab-ko_on_colab_light_220429.sh", filename="mecab.sh")
!bash mecab.sh

# Downgrade the PyTorch and torchtext libraries (the latest versions cause conflicts)
!pip install torch==1.12.0 torchtext==0.6.0

# After running the install cell above, click <Runtime -> Restart session> to restart the runtime


In [None]:
# 필요 라이브러리들 import

import torch
import torchtext
import torch.nn as nn

import pandas as pd
import urllib.request
import random

# Make runs repeatable: ask cuDNN for deterministic kernels and fix
# PyTorch's RNG seed.
SEED = 2024
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# nn.RNN 및 nn.LSTM 이해, 파라미터 개수 확인

In [None]:
# Unidirectional nn.RNN: add up the element counts of all its parameters.
a = nn.RNN(input_size=20, hidden_size=25)
rnn_param_total = 0
for weight in a.parameters():
    rnn_param_total += weight.numel()
print(rnn_param_total)

In [None]:
# Same RNN sizes, but bidirectional: report the total parameter count.
b = nn.RNN(input_size=20, hidden_size=25, bidirectional=True)
birnn_param_sizes = [weight.numel() for weight in b.parameters()]
print(sum(birnn_param_sizes))

In [None]:
# LSTM with the same input/hidden sizes: report the total parameter count.
c = nn.LSTM(input_size=20, hidden_size=25)
lstm_param_total = 0
for weight in c.parameters():
    lstm_param_total += weight.numel()
print(lstm_param_total)

# 네이버 영화리뷰 데이터셋 다운로드 및 전처리

In [None]:
# Fetch the NSMC train/test rating files into the working directory.
nsmc_train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
nsmc_test_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"
urllib.request.urlretrieve(nsmc_train_url, filename="ratings_train.txt")
urllib.request.urlretrieve(nsmc_test_url, filename="ratings_test.txt")

In [None]:
# Read both tab-separated rating files with our own column names (the first
# line of each file is skipped), then drop every row containing a null.
columns = ['id', 'text', 'label']
read_options = dict(sep='\t', names=columns, skiprows=1)

train_data = pd.read_csv('ratings_train.txt', **read_options)
train_data = train_data.dropna()

test_data = pd.read_csv('ratings_test.txt', **read_options)
test_data = test_data.dropna()

In [None]:
# Show five rows of the training frame (positions 5 through 9).
train_data.iloc[5:10]

In [None]:
# Row counts after null removal: training frame first, then test frame.
for frame in (train_data, test_data):
    print(len(frame))

In [None]:
# Save both frames in CSV format (without the index column) so that the
# torchtext data loader can consume them.
csv_targets = (
    (train_data, 'train_data.csv'),
    (test_data, 'test_data.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

In [None]:
# Split sentences into morphemes with MeCab.

from konlpy.tag import Mecab

mecab = Mecab()

# The label field yields float targets; the text field is tokenised by
# MeCab's morpheme splitter.
LABEL = torchtext.data.LabelField(dtype=torch.float)
TEXT = torchtext.data.Field(tokenize=mecab.morphs)

# Dictionary format: {csv column name: (dataset attribute name, Field object)}
fields = {
    'text': ('text', TEXT),
    'label': ('label', LABEL),
}

dataset_options = dict(
    path='.',
    train='train_data.csv',
    test='test_data.csv',
    format='csv',
    fields=fields,
)
train_dataset, test_dataset = torchtext.data.TabularDataset.splits(**dataset_options)

In [None]:
# Inspect the attributes stored on one parsed training example.
sample_example = train_dataset[5]
vars(sample_example)

In [None]:
# Carve a validation set out of the training data, reproducibly.
# random.seed() returns None, so writing `random_state=random.seed(2024)`
# passes None and only works through the side effect of evaluating the
# argument. Seed first, then hand split() the RNG state explicitly
# (split() documents random_state as a value returned by random.getstate()).
random.seed(2024)
train_dataset, valid_dataset = train_dataset.split(random_state=random.getstate())
print(f'훈련 데이터 수 : {len(train_dataset)}')
print(f'검증 데이터 수 : {len(valid_dataset)}')
print(f'테스트 데이터 수 : {len(test_dataset)}')

In [None]:
# Build the text vocabulary from the training split with no size limit and
# print how many entries it ends up with.
TEXT.build_vocab(train_dataset)
full_vocab_size = len(TEXT.vocab)
print(full_vocab_size)

In [None]:
# Rebuild the vocabularies from the training split, this time capping the
# text vocabulary at MAX_VOCAB_SIZE entries.
MAX_VOCAB_SIZE = 10_000

LABEL.build_vocab(train_dataset)
TEXT.build_vocab(train_dataset, max_size=MAX_VOCAB_SIZE)

In [None]:
# Print the 20 most frequent tokens together with their counts.
token_counts = TEXT.vocab.freqs
print(token_counts.most_common(20))

In [None]:
batchsize = 64


def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# One iterator per split, all sharing the same batch size and placing
# batches on the GPU.
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=batchsize,
    device='cuda',
    sort_key=_text_length,
    sort_within_batch=False,
)

In [None]:
# Draw a batch from a fresh pass over the training iterator and print it.
first_batch = next(iter(train_iterator))
print(first_batch)

# Draw again from another fresh pass and print only its text tensor.
second_batch = next(iter(train_iterator))
print(second_batch.text)

# RNN 모델 정의 및 학습

In [None]:
class RNN(nn.Module):
    """Sentence classifier: embedding -> nn.RNN -> linear layer.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of values produced per sentence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token-index sequences to one output row each.

        Args:
            text: Token indices shaped [sent_len, batch_size].

        Returns:
            Tensor shaped [batch_size, output_dim].
        """
        embedded = self.embedding(text)
        # embedded : [sent_len, batch_size, emb_dim]
        _, hidden = self.rnn(embedded)
        # hidden : [1, batch_size, hidden_dim]

        # hidden[-1] is already [batch_size, hidden_dim]. It must not be
        # squeezed: squeeze(0) would remove the batch axis whenever
        # batch_size == 1 and break the [batch_size, output_dim] contract.
        return self.fc(hidden[-1])

In [None]:
# Model sizes: the embedding table covers the whole text vocabulary and the
# network emits a single value per sentence.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 256, 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.cuda()

# Total number of parameter elements in the model.
model_param_total = sum(weight.numel() for weight in model.parameters())
print(model_param_total)

# Loss on raw outputs (sigmoid is applied inside the loss) and Adam optimizer.
learning_rate = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
# ---- Training: num_epochs full passes over the training iterator ----
num_epochs = 5
model.train()
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}')
    for batch in train_iterator:
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        loss.backward()
        optimizer.step()

# ---- Evaluation: count test predictions that match the labels ----
model.eval()
num_correct = 0
with torch.no_grad():
    for batch in test_iterator:
        probabilities = torch.sigmoid(model(batch.text).squeeze(1))
        matches = probabilities.round().eq(batch.label)
        num_correct += matches.sum()

print(f'Accuracy : {num_correct / len(test_dataset) * 100:.2f}%')

# 사전 훈련된 단어 임베딩 사용

In [None]:
MAX_VOCAB_SIZE = 10000

# Attach pretrained 300-d FastText vectors to the vocabulary.
# The 'fasttext.simple.300d' alias is the Simple English Wikipedia model, so
# Korean morphemes would almost never be found and would fall back to random
# initialisation. Load the Korean Wikipedia vectors instead.
# NOTE(review): this downloads wiki.ko.vec, which is considerably larger than
# the Simple English file -- expect a longer first run.
# unk_init is passed to the Vectors object itself, because build_vocab only
# forwards it when `vectors` is given as a string alias.
korean_vectors = torchtext.vocab.FastText(language='ko',
                                          unk_init=torch.Tensor.normal_)

TEXT.build_vocab(train_dataset,
                max_size = MAX_VOCAB_SIZE,
                vectors = korean_vectors)

LABEL.build_vocab(train_dataset)

In [None]:
# Grab the vector matrix attached to the vocabulary and print its shape.
pretrained_embeddings = TEXT.vocab.vectors
embedding_matrix_shape = pretrained_embeddings.shape
print(embedding_matrix_shape)

# RNN 대신 LSTM 모델을 사용하여 학습

In [None]:
class LSTM(nn.Module):
    """Sentence classifier: embedding -> nn.LSTM -> linear layer.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced per sentence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token-index sequences to one output row each.

        Args:
            text: Token indices; the second axis is the batch axis.

        Returns:
            Tensor shaped [batch_size, output_dim].
        """
        embedded = self.embedding(text)
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] is already [batch_size, hidden_dim]. It must not be
        # squeezed: squeeze(0) would remove the batch axis whenever
        # batch_size == 1.
        return self.fc(hidden[-1])

In [None]:
# Model sizes: the embedding table covers the whole text vocabulary and the
# network emits a single value per sentence.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 256, 1

model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
# Overwrite the freshly initialised embedding table with the pretrained
# vectors before moving the model to the GPU.
embedding_table = model.embedding.weight.data
embedding_table.copy_(pretrained_embeddings)
model = model.cuda()

# Total number of parameter elements in the model.
model_param_total = sum(weight.numel() for weight in model.parameters())
print(model_param_total)

# Loss on raw outputs (sigmoid is applied inside the loss) and Adam optimizer.
learning_rate = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
# ---- Training: num_epochs full passes over the training iterator ----
num_epochs = 5
model.train()
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}')
    for batch in train_iterator:
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        loss.backward()
        optimizer.step()

# ---- Evaluation: count test predictions that match the labels ----
model.eval()
num_correct = 0
with torch.no_grad():
    for batch in test_iterator:
        probabilities = torch.sigmoid(model(batch.text).squeeze(1))
        matches = probabilities.round().eq(batch.label)
        num_correct += matches.sum()

print(f'Accuracy : {num_correct / len(test_dataset) * 100:.2f}%')

# 임의의 텍스트에 대해 모델 예측 Score 확인

In [None]:
def predict_sentiment(sentence):
    """Score one raw sentence with the current global ``model``.

    The sentence is split into morphemes with ``mecab``, mapped to indices
    through ``TEXT.vocab.stoi``, and fed to the model as a batch of one.

    Args:
        sentence: Raw text to score.

    Returns:
        The sigmoid of the model output as a Python float in (0, 1).
        NOTE(review): which class a score near 1 stands for depends on the
        index LABEL.vocab assigned to each label -- presumably "positive";
        confirm with LABEL.vocab.stoi.

    Raises:
        ValueError: If tokenisation yields no tokens (e.g. an empty string),
            since the model cannot process a zero-length sequence.
    """
    model.eval()
    tokens = mecab.morphs(sentence)
    if not tokens:
        raise ValueError("sentence produced no tokens; nothing to score")
    indexed = [TEXT.vocab.stoi[tok] for tok in tokens]
    # Follow the model's own device rather than assuming 'cuda'.
    device = next(model.parameters()).device
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add the batch axis: [sent_len, 1]
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Score a sample review sentence.
review_text = "이 영화 진짜 재밌었다!!"
predict_sentiment(review_text)


In [None]:
# Score a second sample review sentence.
review_text = "이런걸 돈주고 보다니..."
predict_sentiment(review_text)