# RNN 텍스트 분류기

In [14]:
# Data preparation
# Each training sample is a (sentence, binary label) pair; keeping them
# together prevents the two parallel lists from drifting out of sync.
samples = [
    ("자연어 처리는 재미있다", 1),
    ("Python이 자연어 처리보다 쉽다", 0),
    ("자연어 처리 공부는 어렵다", 0),
    ("Python 활용법을 더 즐겁게 찾아보자", 1),
]

corpus = [sentence for sentence, _ in samples]
labels = [label for _, label in samples]

In [15]:
# Data preprocessing

# Tokenization: split every sentence on whitespace.
tokenized_corpus = list(map(str.split, corpus))

# Vocabulary: assign ids in order of first appearance, starting at 1
# (id 0 is left free and is used as the padding value further below).
vocab = {}
for tokens in tokenized_corpus:
    for token in tokens:
        vocab.setdefault(token, len(vocab) + 1)

# Indexing: replace every token with its vocabulary id.
indexed_corpus = [
    [vocab[token] for token in tokens]
    for tokens in tokenized_corpus
]

# Padding: the longest sentence determines the common sequence length.
max_seq_len = max(map(len, indexed_corpus))
def pad_sequences(seq, max_len):
    """Return a copy of ``seq`` that is exactly ``max_len`` items long.

    Sequences shorter than ``max_len`` are right-padded with 0; longer
    sequences are truncated to their first ``max_len`` items so that every
    result can be stacked into one rectangular batch.

    Args:
        seq: Sequence of token ids for a single sentence.
        max_len: Target length of the returned list.

    Returns:
        A new list of length ``max_len``; the input is never modified.
    """
    # A negative repeat count yields an empty list, so over-long input is
    # simply truncated by the slice and receives no padding.
    return list(seq[:max_len]) + [0] * (max_len - len(seq))
# Bring every indexed sentence to the shared length max_seq_len.
padded_corpus = []
for seq in indexed_corpus:
    padded_corpus.append(pad_sequences(seq, max_seq_len))

In [None]:
# Conversion to torch tensors
import torch

# Token ids are stored as long integers because they are used as indices
# into the embedding table.
inputs = torch.tensor(padded_corpus, dtype=torch.long)

# Targets become a float column of shape (num_samples, 1).
# as_tensor + reshape keeps this cell safe to re-run: when `labels` is
# already the converted tensor it is reused as-is (no copy warning) and the
# shape stays (num_samples, 1) instead of gaining another axis.
labels = torch.as_tensor(labels, dtype=torch.float32).reshape(-1, 1)

In [17]:
# RNN 기반 텍스트 분류기 모델 정의
import torch.nn as nn

class RNNClassifier(nn.Module):
    """Embedding -> single-layer RNN -> linear -> sigmoid text classifier.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id
            plus one).
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the RNN hidden state.
        num_classes: Number of output units of the final linear layer.
        padding_idx: Optional token id treated as padding by the embedding
            layer; its vector is initialised to zeros and receives no
            gradient. Defaults to ``None`` (every id is trainable), which
            matches the previous behaviour.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_classes,
                 padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embed_size, padding_idx=padding_idx
        )
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch dimension first.

        Returns:
            Tensor with one row per input sequence and ``num_classes``
            columns, each value in (0, 1).
        """
        embedded = self.embedding(x)
        # Only the final hidden state is used; per-step outputs are dropped.
        # NOTE: sequences are not packed, so trailing padding steps are also
        # fed through the RNN before this state is read.
        _, hidden = self.rnn(embedded)
        # hidden[-1] selects the last RNN layer's final state.
        return self.sigmoid(self.fc(hidden[-1]))

In [None]:
# Model creation
# One extra embedding row because vocabulary ids start at 1 and id 0 is
# used for padding.
VOCAB_SIZE = 1 + len(vocab)
EMBED_SIZE, HIDDEN_SIZE = 128, 64
NUM_CLASSES = 1  # single sigmoid output for binary classification

model = RNNClassifier(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_classes=NUM_CLASSES,
)

print(model)

RNNClassifier(
  (embedding): Embedding(15, 128)
  (rnn): RNN(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
# Model training
import torch.optim as optim

epochs = 20
criterion = nn.BCELoss()  # the model already applies a sigmoid
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training mode is set once up front; nothing inside the loop switches it.
model.train()

# Full-batch training: every epoch is one forward/backward pass over the
# whole corpus.
for epoch in range(epochs):
    optimizer.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs} | Loss: {loss.item():.4f}')

Epoch 1/20 | Loss: 0.6612
Epoch 2/20 | Loss: 0.4007
Epoch 3/20 | Loss: 0.1937
Epoch 4/20 | Loss: 0.1123
Epoch 5/20 | Loss: 0.0588
Epoch 6/20 | Loss: 0.0324
Epoch 7/20 | Loss: 0.0196
Epoch 8/20 | Loss: 0.0129
Epoch 9/20 | Loss: 0.0092
Epoch 10/20 | Loss: 0.0070
Epoch 11/20 | Loss: 0.0056
Epoch 12/20 | Loss: 0.0046
Epoch 13/20 | Loss: 0.0038
Epoch 14/20 | Loss: 0.0032
Epoch 15/20 | Loss: 0.0028
Epoch 16/20 | Loss: 0.0024
Epoch 17/20 | Loss: 0.0021
Epoch 18/20 | Loss: 0.0018
Epoch 19/20 | Loss: 0.0016
Epoch 20/20 | Loss: 0.0014


In [None]:
# Test data preprocessing
# The first sentence also appears in the training corpus; the second one
# contains a token ("Python은") that is not in the vocabulary.
test_texts = ["자연어 처리는 재미있다", "Python은 어렵다"]

def preprocess_sentence(sentence, vocab, max_len):
    """Convert a raw sentence into a fixed-length tensor of token ids.

    The sentence is split on whitespace, each token is looked up in
    ``vocab``, and the id list is truncated or right-padded with 0 to
    exactly ``max_len`` entries so results can be stacked into a batch.

    Args:
        sentence: Raw input text.
        vocab: Mapping from token string to integer id.
        max_len: Length of the returned tensor.

    Returns:
        1-D ``torch.long`` tensor of length ``max_len``.
    """
    tokens = sentence.split()
    # Out-of-vocabulary tokens fall back to 0, the same id used for padding.
    indices = [vocab.get(token, 0) for token in tokens]
    # Truncate first: sentences longer than max_len would otherwise produce
    # tensors of differing lengths that cannot be stacked.
    indices = pad_sequences(indices[:max_len], max_len)
    return torch.tensor(indices, dtype=torch.long)

# Encode every test sentence, then stack the equal-length id tensors into
# one batch tensor with one row per sentence.
test_inputs = torch.stack(
    [preprocess_sentence(sent, vocab, max_seq_len) for sent in test_texts]
)

In [None]:
# Model prediction (evaluation)
model.eval()  # switch the model to evaluation mode

# Gradients are not needed for inference, so tracking is disabled.
with torch.no_grad():
    outputs = model(test_inputs)

# One sigmoid score per test sentence.
print(outputs)

tensor([[0.9978],
        [0.3683]])
