#### Library

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchtext.data import get_tokenizer
import torchtext
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
import random
import torch.backends.cudnn as cudnn

In [2]:
def random_seed(seed_num):
    torch.manual_seed(seed_num)
    torch.cuda.manual_seed(seed_num)
    torch.cuda.manual_seed_all(seed_num)
    np.random.seed(seed_num)
    cudnn.benchmark = False
    cudnn.deterministic = True
    random.seed(seed_num)

random_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

#### Data Load

In [3]:
data_csv = pd.read_csv("../data/Medium/medium_data.csv")
data = data_csv['title'].values # title 만 사용
data

array(['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model',
       'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric',
       'How to Use ggplot2 in\xa0Python', ...,
       'Content and Marketing Beyond Mass Consumption',
       '5 Questions All Copywriters Should Ask Clients Before Their Pen Hits the\xa0Paper',
       'How To Write a Good Business Blog\xa0Post'], dtype=object)

In [4]:
class CustomDataset(Dataset):
    def __init__(self, data, vocab, tokenizer, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len
        self.tokenizer = tokenizer
        seq = self.make_sequence(self.data, self.vocab, self.tokenizer) # next word prediction을 하기 위한 형태로 변환
        self.seq = self.pre_zeropadding(seq, self.max_len) # zero padding으로 채워줌
        self.X = torch.tensor(self.seq[:,:-1])
        self.label = torch.tensor(self.seq[:,-1])

    def make_sequence(self, data, vocab, tokenizer):
        seq = []
        for i in data:
            token_id = vocab.lookup_indices(tokenizer(i))
            for j in range(1, len(token_id)):
                sequence = token_id[:j+1]
                seq.append(sequence)
        return seq

    def pre_zeropadding(self, seq, max_len): # max_len 길이에 맞춰서 0 으로 padding 처리 (앞부분에 padding 처리)
        return np.array([i[:max_len] if len(i) >= max_len else [0] * (max_len - len(i)) + i for i in seq])


    def __len__(self): # dataset의 전체 길이 반환
        return len(self.X)

    def __getitem__(self, idx): # dataset 접근
        X = self.X[idx]
        label = self.label[idx]

        return X, label

In [5]:
def cleaning_text(text):
    cleaned_text = re.sub( r"[^a-zA-Z0-9.,@#!\s']+", "", text) # 특수문자 를 모두 지우는 작업을 수행합니다.
    cleaned_text = cleaned_text.replace(u'\xa0',u' ') # No-break space를 unicode 빈칸으로 변환
    cleaned_text = cleaned_text.replace('\u200a',' ') # unicode 빈칸을 빈칸으로 변환
    return cleaned_text

In [6]:
data = list(map(cleaning_text, data))
tokenizer = get_tokenizer("basic_english")
vocab = torchtext.vocab.build_vocab_from_iterator(map(tokenizer, data))
vocab.insert_token('<pad>',0)
max_len = 20

In [7]:
# train set과 validation set, test set을 각각 나눕니다. 8 : 1 : 1 의 비율로 나눕니다.
train, test = train_test_split(data, test_size = .2, random_state = 42)
val, test = train_test_split(test, test_size = .5, random_state = 42)

print("Train 개수: ", len(train))
print("Validation 개수: ", len(val))
print("Test 개수: ", len(test))

train_dataset = CustomDataset(train, vocab, tokenizer, max_len)
valid_dataset = CustomDataset(val, vocab, tokenizer, max_len)
test_dataset = CustomDataset(test, vocab, tokenizer, max_len)

Train 개수:  5206
Validation 개수:  651
Test 개수:  651


In [8]:
batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

#### Vanilla RNN 구현

In [9]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True) # batch_first=True는 입력의 첫 번째 차원이 batch 크기임을 나타냅니다.

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        '''
        INPUT:
           x: [batch_size, seq_len]
        OUTPUT:
           output: [batch_size, vocab_size]
        '''
        x = self.embedding(x) # [batch_size, sequence_len, embedding_dim]

        # 첫 번째 리턴값인 output은 모든 time step의 hidden state를 포함한 출력입니다.
        # 두 번째 리턴값인 h_0 는 마지막 time step의 hidden state를 의미합니다.
        output, h_0 = self.rnn(x) # output: [batch_size, seq_len, hidden_dim] / h_0: [1, batch_size, hidden_dim]
        return self.fc(output[:,-1,:]) # [batch_size, vocab_size]

In [10]:
# training 코드, evaluation 코드, training_loop 코드
def training(model, dataloader, train_dataset, criterion, optimizer, device, epoch, num_epochs):
    model.train()  # 모델을 학습 모드로 설정
    train_loss = 0.0
    train_accuracy = 0

    tbar = tqdm(dataloader)
    for texts, labels in tbar:
        texts = texts.to(device)
        labels = labels.to(device)
        # 순전파

        outputs = model(texts)

        loss = criterion(outputs, labels)

        # 역전파 및 가중치 업데이트
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 손실과 정확도 계산
        train_loss += loss.item()
        # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
        _, predicted = torch.max(outputs, dim=1)


        train_accuracy += (predicted == labels).sum().item()

        # tqdm의 진행바에 표시될 설명 텍스트를 설정
        tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}")

    # 에폭별 학습 결과 출력
    train_loss = train_loss / len(dataloader)
    train_accuracy = train_accuracy / len(train_dataset)

    return model, train_loss, train_accuracy

def evaluation(model, dataloader, valid_dataset, criterion, device, epoch, num_epochs):
    model.eval()  # 모델을 평가 모드로 설정
    valid_loss = 0.0
    valid_accuracy = 0

    with torch.no_grad(): # model의 업데이트 막기
        tbar = tqdm(dataloader)
        for texts, labels in tbar:
            texts = texts.to(device)
            labels = labels.to(device)

            # 순전파
            outputs = model(texts)
            loss = criterion(outputs, labels)

            # 손실과 정확도 계산
            valid_loss += loss.item()
            # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
            _, predicted = torch.max(outputs, 1)
            # _, true_labels = torch.max(labels, dim=1)
            valid_accuracy += (predicted == labels).sum().item()


            # tqdm의 진행바에 표시될 설명 텍스트를 설정
            tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Valid Loss: {loss.item():.4f}")

    valid_loss = valid_loss / len(dataloader)
    valid_accuracy = valid_accuracy / len(valid_dataset)

    return model, valid_loss, valid_accuracy


def training_loop(model, train_dataloader, valid_dataloader, train_dataset, val_dataset, criterion, optimizer, device, num_epochs, patience, model_name):
    best_valid_loss = float('inf')  # 가장 좋은 validation loss를 저장
    early_stop_counter = 0  # 카운터
    valid_max_accuracy = -1

    for epoch in range(num_epochs):
        model, train_loss, train_accuracy = training(model, train_dataloader, train_dataset, criterion, optimizer, device, epoch, num_epochs)
        model, valid_loss, valid_accuracy = evaluation(model, valid_dataloader, val_dataset, criterion, device, epoch, num_epochs)

        if valid_accuracy > valid_max_accuracy:
            valid_max_accuracy = valid_accuracy

        # validation loss가 감소하면 모델 저장 및 카운터 리셋
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f"../models/model_{model_name}.pt")
            early_stop_counter = 0

        # validation loss가 증가하거나 같으면 카운터 증가
        else:
            early_stop_counter += 1

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}")

        # 조기 종료 카운터가 설정한 patience를 초과하면 학습 종료
        if early_stop_counter >= patience:
            print("Early stopping")
            break

    return model, valid_max_accuracy

In [11]:
num_epochs = 100
patience = 3
model_name = 'RNN'

vocab_size = len(vocab)
embedding_dim = 512
hidden_size = 256
model = RNN(vocab_size, embedding_dim, hidden_size).to(device)

lr = 1e-3
criterion = nn.CrossEntropyLoss(ignore_index = 0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model, valid_max_accuracy = training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, patience, model_name)
print('Valid max accuracy : ', valid_max_accuracy)

  0%|          | 0/1159 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

Epoch [1/100], Train Loss: 7.0220, Train Accuracy: 0.1110 Valid Loss: 6.8250, Valid Accuracy: 0.1334


  0%|          | 0/1159 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

Epoch [2/100], Train Loss: 5.3991, Train Accuracy: 0.1644 Valid Loss: 6.9675, Valid Accuracy: 0.1361


  0%|          | 0/1159 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

Epoch [3/100], Train Loss: 4.2824, Train Accuracy: 0.2240 Valid Loss: 7.2146, Valid Accuracy: 0.1323


  0%|          | 0/1159 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

Epoch [4/100], Train Loss: 3.3850, Train Accuracy: 0.3315 Valid Loss: 7.4921, Valid Accuracy: 0.1273
Early stopping
Valid max accuracy :  0.13613338961587168


In [12]:
model.load_state_dict(torch.load("../models/model_RNN.pt")) # 모델 불러오기
model = model.to(device)
model.eval()
total_labels = []
total_preds = []
with torch.no_grad():
    for texts, labels in tqdm(test_dataloader):
        texts = texts.to(device)
        labels = labels

        outputs = model(texts)
        # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
        _, predicted = torch.max(outputs.data, 1)

        total_preds.extend(predicted.detach().cpu().tolist())
        total_labels.extend(labels.tolist())

total_preds = np.array(total_preds)
total_labels = np.array(total_labels)
nwp_rnn_acc = accuracy_score(total_labels, total_preds) # 정확도 계산
print("Next word prediction RNN model accuracy : ", nwp_rnn_acc)

  0%|          | 0/143 [00:00<?, ?it/s]

Next word prediction RNN model accuracy :  0.12860892388451445
