#### Library

In [1]:
import re
import numpy as np
import pandas as pd
import torch
import torchtext
import matplotlib.pyplot as plt
import random
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torchtext.data import get_tokenizer

In [2]:
# seed 고정
def random_seed(seed_num):
    torch.manual_seed(seed_num)
    torch.cuda.manual_seed(seed_num)
    torch.cuda.manual_seed_all(seed_num)
    np.random.seed(seed_num)
    cudnn.benchmark = False
    cudnn.deterministic = True
    random.seed(seed_num)

random_seed(42)

#### Data Load

In [3]:
df = pd.read_csv("../data/Medium/medium_data.csv")
df = df['title'].values # title 만 사용
df

array(['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model',
       'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric',
       'How to Use ggplot2 in\xa0Python', ...,
       'Content and Marketing Beyond Mass Consumption',
       '5 Questions All Copywriters Should Ask Clients Before Their Pen Hits the\xa0Paper',
       'How To Write a Good Business Blog\xa0Post'], dtype=object)

In [4]:
def cleaning_text(text):
    cleaned_text = re.sub( r"[^a-zA-Z0-9.,@#!\s']+", "", text) # 특수문자 를 모두 지우는 작업을 수행
    cleaned_text = cleaned_text.replace(u'\xa0',u' ') # No-break space를 unicode 빈칸으로 변환
    cleaned_text = cleaned_text.replace('\u200a',' ') # unicode 빈칸을 빈칸으로 변환
    return cleaned_text

cleaned_data = list(map(cleaning_text, df)) # 모든 특수문자와 공백을 지움
print('Before preprocessing')
print(df[:5])
print('After preprocessing')
print(cleaned_data[:5])

Before preprocessing
['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model'
 'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric'
 'How to Use ggplot2 in\xa0Python'
 'Databricks: How to Save Files in CSV on Your Local\xa0Computer'
 'A Step-by-Step Implementation of Gradient Descent and Backpropagation']
After preprocessing
['A Beginners Guide to Word Embedding with Gensim Word2Vec Model', 'Handson Graph Neural Networks with PyTorch  PyTorch Geometric', 'How to Use ggplot2 in Python', 'Databricks How to Save Files in CSV on Your Local Computer', 'A StepbyStep Implementation of Gradient Descent and Backpropagation']


#### build_vocab_from_iterator

- `torchtext.vocab.build_vocab_from_iterator`는 iterator를 이용하여 Vocab 클래스(단어사전)를 만드는 함수

In [5]:
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(cleaned_data[0])
print(f"Original text: {cleaned_data[0]}") # 원본
print(f"Token: {tokens}") # tokenizer 후

Original text: A Beginners Guide to Word Embedding with Gensim Word2Vec Model
Token: ['a', 'beginners', 'guide', 'to', 'word', 'embedding', 'with', 'gensim', 'word2vec', 'model']


In [6]:
vocab = torchtext.vocab.build_vocab_from_iterator(map(tokenizer, cleaned_data)) # 단어 사전 생성
vocab.insert_token('<pad>', 0) # 패딩 토큰 추가

In [7]:
# index > 단어
id2token = vocab.get_itos()
id2token[:10] # 각 인덱스의 단어 출력, list 형태

['<pad>', 'to', 'the', 'a', 'of', 'and', 'how', 'in', 'your', 'for']

In [8]:
# 단어 > index
token2id = vocab.get_stoi()
token2id = dict(sorted(token2id.items(), key=lambda item: item[1]))
for idx, (k, v) in enumerate(token2id.items()):
    print(k, v)
    if idx == 5:
        break

<pad> 0
to 1
the 2
a 3
of 4
and 5


- `lookup_indices()`: 리스트 형태의 토큰을 받아서 해당 토큰의 인덱스로 변환

In [9]:
# 문장을 token 화하고 각 token 을 index로 변환
vocab.lookup_indices(tokenizer(cleaned_data[0]))

[3, 273, 66, 1, 467, 1582, 12, 2884, 8549, 99]

- Padding: NLP에서는 문장의 길이를 맞추기 위해 짧은 문장 끝에 0을 채워 넣는 작업. 주로 미니배치 학습을 위해 사용됨
- ex
    ```
    문장1: 나는 밥을 먹었다      → 토큰 수: 5
    문장2: 밥 먹자               → 토큰 수: 2
    문장3: 오늘 날씨 어때?       → 토큰 수: 4
    ```
    ERROR 발생

    ```
    [나는, 밥을, 먹었다]       → [나는, 밥을, 먹었다, 0, 0]
    [밥, 먹자]                → [밥, 먹자, 0, 0, 0]
    ```
    해결: Padding 사용

In [10]:
seq = []
for i in cleaned_data:
    token_id = vocab.lookup_indices(tokenizer(i)) # 문장을 토큰화 → 인덱스로 변환
    for j in range(1, len(token_id)): # 2개 단어부터 슬리이싱
        sequence = token_id[:j+1]
        seq.append(sequence)

seq[:5]

[[3, 273],
 [3, 273, 66],
 [3, 273, 66, 1],
 [3, 273, 66, 1, 467],
 [3, 273, 66, 1, 467, 1582]]

In [11]:
max_len = max(len(sublist) for sublist in seq)
print(max_len)

24


In [12]:
def pre_zeropadding(seq, max_len): # max_len 길이에 맞춰서 0 으로 padding 처리 (앞부분에 padding 처리)
    return np.array([i[:max_len] if len(i) >= max_len else [0] * (max_len - len(i)) + i for i in seq])

zero_padding_data = pre_zeropadding(seq, max_len)
zero_padding_data[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3, 273])

In [13]:
input_x = zero_padding_data[:,:-1]
label = zero_padding_data[:,-1]

print(input_x[:3], "\n")
print(label[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   3]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   3 273]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   3 273  66]] 

[273  66   1]


#### Custom Dataset 구현

In [14]:
class CustomDataset(Dataset):
    def __init__(self, data, vocab, tokenizer, max_len):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len
        seq = self.make_sequence(self.data, self.vocab, self.tokenizer)
        self.seq = self.pre_zeropadding(seq, self.max_len) # zero padding 처리
        self.X = torch.tensor(self.seq[:,:-1])
        self.label = torch.tensor(self.seq[:,-1])

    # sequence 생성
    def make_sequence(self, data, vocab, tokenizer):
        seq = []
        for i in data:
            token_id = vocab.lookup_indices(tokenizer(i))
            for j in range(1, len(token_id)):
                sequence = token_id[:j+1]
                seq.append(sequence)
        return seq
    
    # padding
    def pre_zeropadding(self, seq, max_len):
        return np.array([i[:max_len] if len(i) >= max_len else [0] * (max_len - len(i)) + i for i in seq])
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        X = self.X[idx]
        label = self.label[idx]
        return X, label

In [15]:
def cleaning_text(text):
    cleaned_text = re.sub( r"[^a-zA-Z0-9.,@#!\s']+", "", text) # 특수문자 를 모두 지우는 작업을 수행합니다.
    cleaned_text = cleaned_text.replace(u'\xa0',u' ') # No-break space를 unicode 빈칸으로 변환
    cleaned_text = cleaned_text.replace('\u200a',' ') # unicode 빈칸을 빈칸으로 변환
    return cleaned_text

data = list(map(cleaning_text, df))
tokenizer = get_tokenizer("basic_english")
vocab = torchtext.vocab.build_vocab_from_iterator(map(tokenizer, data))
vocab.insert_token('<pad>',0)
max_len = 20

In [16]:
# 분리
train, test = train_test_split(data, test_size=.2, random_state=42)
val, test = train_test_split(test, test_size=.5, random_state=42)

print(f"train: {len(train)}, val: {len(val)}, test: {len(test)}")

train: 5206, val: 651, test: 651


In [17]:
train_dataset = CustomDataset(train, vocab, tokenizer, max_len)
val_dataset = CustomDataset(val, vocab, tokenizer, max_len)
test_dataset = CustomDataset(test, vocab, tokenizer, max_len)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#### Next word prediction Model

- DNN Model 구현

In [18]:
class Next_Word_Prediction_Model(nn.Module):
    def __init__(self, vocab_size, embedding_dims, hidden_dims, num_classes, dropout_ratio, set_super):
        if set_super:
            super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dims, padding_idx = 0) # padding index 설정 => gradient 계산에서 제외
        self.hidden_dims = hidden_dims
        self.layers = nn.ModuleList()
        self.num_classes = num_classes
        for i in range(len(self.hidden_dims) - 1):
            self.layers.append(nn.Linear(self.hidden_dims[i], self.hidden_dims[i+1]))
            self.layers.append(nn.BatchNorm1d(self.hidden_dims[i+1])) # Normalization
            self.layers.append(nn.ReLU()) # Activation Function
            self.layers.append(nn.Dropout(dropout_ratio))

        self.classifier = nn.Linear(self.hidden_dims[-1], self.num_classes)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, x):
        '''
        INPUT:
            x: [batch_size, sequence_len] # padding 제외
        OUTPUT:
            output : [batch_size, vocab_size]
        '''
        x = self.embedding(x) # [batch_size, sequence_len, embedding_dim]
        x = torch.sum(x, dim=1) # [batch_size, embedding_dim] 각 문장에 대해 임베딩된 단어들을 합쳐서, 해당 문장에 대한 임베딩 벡터로 만듬
        for layer in self.layers:
            x = layer(x)

        output = self.classifier(x) # [batch_size, num_classes]
        output = self.softmax(output) # [batch_size, num_classes]
        return output

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

#### Inference

In [19]:
# training 코드, evaluation 코드, training loop 코드
def training(model, dataloader, train_dataset, criterion, optimizer, device, epoch, num_epochs):
    model.train()  # 모델을 학습 모드로 설정
    train_loss = 0.0
    train_accuracy = 0

    tbar = tqdm(dataloader)
    for texts, labels in tbar:
        texts = texts.to(device)
        labels = labels.to(device)

        # 순전파
        outputs = model(texts)

        loss = criterion(outputs, labels)

        # 역전파 및 가중치 업데이트
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 손실과 정확도 계산
        train_loss += loss.item()
        # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
        _, predicted = torch.max(outputs, dim=1)


        train_accuracy += (predicted == labels).sum().item()

        # tqdm의 진행바에 표시될 설명 텍스트를 설정
        tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}")

    # 에폭별 학습 결과 출력
    train_loss = train_loss / len(dataloader)
    train_accuracy = train_accuracy / len(train_dataset)

    return model, train_loss, train_accuracy

def evaluation(model, dataloader, val_dataset, criterion, device, epoch, num_epochs):
    model.eval()  # 모델을 평가 모드로 설정
    valid_loss = 0.0
    valid_accuracy = 0

    with torch.no_grad(): # model의 업데이트 막기
        tbar = tqdm(dataloader)
        for texts, labels in tbar:
            texts = texts.to(device)
            labels = labels.to(device)

            # 순전파
            outputs = model(texts)
            loss = criterion(outputs, labels)

            # 손실과 정확도 계산
            valid_loss += loss.item()
            # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
            _, predicted = torch.max(outputs, 1)
            # _, true_labels = torch.max(labels, dim=1)
            valid_accuracy += (predicted == labels).sum().item()


            # tqdm의 진행바에 표시될 설명 텍스트를 설정
            tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Valid Loss: {loss.item():.4f}")

    valid_loss = valid_loss / len(dataloader)
    valid_accuracy = valid_accuracy / len(val_dataset)

    return model, valid_loss, valid_accuracy


def training_loop(model, train_dataloader, valid_dataloader, train_dataset, val_dataset, criterion, optimizer, device, num_epochs, patience, model_name):
    best_valid_loss = float('inf')  # 가장 좋은 validation loss를 저장
    early_stop_counter = 0  # 카운터
    valid_max_accuracy = -1

    for epoch in range(num_epochs):
        model, train_loss, train_accuracy = training(model, train_dataloader, train_dataset, criterion, optimizer, device, epoch, num_epochs)
        model, valid_loss, valid_accuracy = evaluation(model, valid_dataloader, val_dataset, criterion, device, epoch, num_epochs)

        if valid_accuracy > valid_max_accuracy:
            valid_max_accuracy = valid_accuracy

        # validation loss가 감소하면 모델 저장 및 카운터 리셋
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f"./model_{model_name}.pt")
            early_stop_counter = 0

        # validation loss가 증가하거나 같으면 카운터 증가
        else:
            early_stop_counter += 1

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}")

        # 조기 종료 카운터가 설정한 patience를 초과하면 학습 종료
        if early_stop_counter >= patience:
            print("Early stopping")
            break

    return model, valid_max_accuracy

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lr = 1e-3
vocab_size = len(vocab.get_stoi())
embedding_dims = 512
hidden_dims = [embedding_dims, embedding_dims*4, embedding_dims*2, embedding_dims]
model = Next_Word_Prediction_Model(vocab_size = vocab_size, embedding_dims = embedding_dims, hidden_dims = hidden_dims, num_classes = vocab_size, \
            dropout_ratio = 0.2, set_super = True).to(device)

num_epochs = 100
patience = 3
model_name = 'next'

optimizer = optim.Adam(model.parameters(), lr = lr)
criterion = nn.NLLLoss(ignore_index=0) # padding 한 부분 제외
model, valid_max_accuracy = training_loop(model, train_loader, val_loader, train_dataset, val_dataset, criterion, optimizer, device, num_epochs, patience, model_name)
print('Valid max accuracy : ', valid_max_accuracy)

Epoch [1/100], Train Loss: 5.9816: 100%|██████████| 1159/1159 [01:02<00:00, 18.41it/s]
Epoch [1/100], Valid Loss: 7.8286: 100%|██████████| 149/149 [00:00<00:00, 156.19it/s]


Epoch [1/100], Train Loss: 7.3780, Train Accuracy: 0.0637 Valid Loss: 7.2292, Valid Accuracy: 0.0665


Epoch [2/100], Train Loss: 6.6932: 100%|██████████| 1159/1159 [01:00<00:00, 19.25it/s]
Epoch [2/100], Valid Loss: 8.3096: 100%|██████████| 149/149 [00:00<00:00, 164.00it/s]


Epoch [2/100], Train Loss: 6.7416, Train Accuracy: 0.0751 Valid Loss: 7.2252, Valid Accuracy: 0.0751


Epoch [3/100], Train Loss: 7.1709: 100%|██████████| 1159/1159 [01:03<00:00, 18.30it/s]
Epoch [3/100], Valid Loss: 8.7249: 100%|██████████| 149/149 [00:00<00:00, 159.53it/s]


Epoch [3/100], Train Loss: 6.3790, Train Accuracy: 0.0851 Valid Loss: 7.2903, Valid Accuracy: 0.0848


Epoch [4/100], Train Loss: 5.4916: 100%|██████████| 1159/1159 [00:59<00:00, 19.54it/s]
Epoch [4/100], Valid Loss: 8.9234: 100%|██████████| 149/149 [00:00<00:00, 162.95it/s]


Epoch [4/100], Train Loss: 6.0734, Train Accuracy: 0.0938 Valid Loss: 7.4722, Valid Accuracy: 0.0878


Epoch [5/100], Train Loss: 5.3046: 100%|██████████| 1159/1159 [00:57<00:00, 20.14it/s]
Epoch [5/100], Valid Loss: 10.0305: 100%|██████████| 149/149 [00:00<00:00, 151.80it/s]

Epoch [5/100], Train Loss: 5.8038, Train Accuracy: 0.1038 Valid Loss: 7.6084, Valid Accuracy: 0.0922
Early stopping
Valid max accuracy :  0.09223300970873786





In [21]:
model.load_state_dict(torch.load("./model_next.pt")) # 모델 불러오기
model = model.to(device)
model.eval()
total_labels = []
total_preds = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader):
        texts = texts.to(device)
        labels = labels

        outputs = model(texts)
        # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
        _, predicted = torch.max(outputs.data, 1)

        total_preds.extend(predicted.detach().cpu().tolist())
        total_labels.extend(labels.tolist())

total_preds = np.array(total_preds)
total_labels = np.array(total_labels)
nwp_dnn_acc = accuracy_score(total_labels, total_preds) # 정확도 계산
print("Next word prediction DNN model accuracy : ", nwp_dnn_acc)

100%|██████████| 143/143 [00:00<00:00, 239.82it/s]

Next word prediction DNN model accuracy :  0.0726159230096238





In [22]:
print(vocab_size)

8618
