### 문장 분류 모델

In [38]:
from torch import nn

class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent layer (per direction).
        embedding_dim: Size of each token embedding vector.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability applied between stacked recurrent
            layers and before the linear head.
        bidirectional: If True, the recurrent layer runs in both directions
            and the linear head receives ``hidden_dim * 2`` features.
        model_type: Which recurrent layer to use, ``'rnn'`` or ``'lstm'``.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout = 0.5,
        bidirectional = True,
        model_type = 'lstm'):

        super().__init__()

        # Fail fast on an unsupported type instead of leaving `self.model`
        # undefined and crashing later inside `forward`.
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            raise ValueError(
                f"model_type must be one of {sorted(recurrent_layers)}, got {model_type!r}"
            )

        # Index 0 is the padding id; its embedding stays at zero and is not trained.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout has no effect with a single layer and only
            # triggers a PyTorch warning, so disable it in that case.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional encoder concatenates both directions' outputs.
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.classifier = nn.Linear(classifier_in, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self,inputs):
        """Compute one logit per input sequence.

        Args:
            inputs: Integer token ids. Assumed to be shaped (batch, seq_len),
                since the recurrent layer is built with ``batch_first=True``
                and the output is indexed along dimension 1.

        Returns:
            Tensor of raw (pre-sigmoid) logits with a trailing dimension of 1.
        """
        embeddings = self.embedding(inputs)
        output,_ = self.model(embeddings)
        # Use the encoder output at the final time step as the sentence
        # representation. NOTE: for right-padded inputs this position may be
        # a padding token.
        last_output = output[:,-1,:]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

### 데이터세트 불러오기

In [39]:
import pandas as pd
from Korpora import Korpora

In [40]:
# Load the NSMC corpus through Korpora and build a DataFrame from its
# `test` split.
corpus = Korpora.load('nsmc')
corpus_df = pd.DataFrame(corpus.test)

# Sample 90% of the rows (fixed seed) for training; the rows that were not
# sampled become the held-out set.
train = corpus_df.sample(frac=0.9, random_state=42)
test = corpus_df.drop(index=train.index)

# Optional preview (left disabled): print(train.head(5).to_markdown())
print("Traing Data Size : ", train.shape[0])
print("Test Data Size : ", test.shape[0])


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-14\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

### 데이터 토큰화 및 단어사전 구축 <hr>

In [41]:
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenized documents.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep, chosen by
            descending frequency.
        special_tokens: Tokens placed at the front of the vocabulary, in the
            given order. This sequence is not modified.

    Returns:
        A new list: the special tokens followed by up to ``n_vocab`` of the
        most frequent tokens in ``corpus``.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy so that appending below does not mutate the caller's list.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Tokenize every review into morphemes with the Okt tagger.
tokenizer = Okt()
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# Build the vocabulary from the training tokens only: two special tokens
# followed by the 5000 most frequent tokens.
vocab = build_vocab(train_tokens, 5000, ["<pad>", "<unk>"])

# Lookup tables between tokens and their integer ids (position in `vocab`).
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [42]:
# Display the tokenized form of the first training review as a sanity check.
train_tokens[:1]

[['모든',
  '편견',
  '을',
  '날려',
  '버리는',
  '가슴',
  '따뜻한',
  '영화',
  '.',
  '로버트',
  '드',
  '니',
  '로',
  ',',
  '필립',
  '세이모어',
  '호프만',
  '영원하라',
  '.']]

### 정수 인코딩 및 패딩 <hr>

In [43]:
import numpy as np

def pad_sequences(sequences,max_length, pad_value):
    """Truncate or right-pad each sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of lists of token ids.
        max_length: Target length of every returned row.
        pad_value: Value appended to sequences shorter than ``max_length``.

    Returns:
        A NumPy array built from the list of fixed-length rows.
    """
    padded_rows = []
    for seq in sequences:
        clipped = seq[:max_length]
        # The fill is empty when the clipped sequence already has full length.
        filler = [pad_value] * (max_length - len(clipped))
        padded_rows.append(clipped + filler)
    return np.asarray(padded_rows)

# Ids of the special tokens and the fixed sequence length used for padding.
unk_id = token_to_id['<unk>']
pad_id = token_to_id['<pad>']
max_length = 32

# Map every token to its id; tokens missing from the vocabulary become <unk>.
train_ids = [
    [token_to_id.get(token, unk_id) for token in review]
    for review in train_tokens
]
test_ids = [
    [token_to_id.get(token, unk_id) for token in review]
    for review in test_tokens
]

# Truncate / right-pad every encoded review to `max_length`.
train_ids = pad_sequences(train_ids, max_length, pad_id)
test_ids = pad_sequences(test_ids, max_length, pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


### 데이터 로더 적용 <hr>

In [44]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the padded id arrays into tensors.
train_ids, test_ids = torch.tensor(train_ids), torch.tensor(test_ids)

# Labels are stored as float32, which BCEWithLogitsLoss expects for targets.
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

# Pair each id sequence with its label.
train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Shuffle only the training batches; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

### 손실 함수와 최적화 함수 정의 <hr>

In [45]:
from torch import optim

# Model hyperparameters; the vocabulary size comes from the token lookup table.
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

classifier = SentenceClassifier(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
)
classifier = classifier.to(device)

# Binary cross-entropy on raw logits (the model applies no sigmoid itself).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)


### 모델 학습 및 테스트 <hr>

In [46]:
# NOTE: defining this function rebinds the notebook-level name `train`, which
# earlier cells use for the training DataFrame.
def train(model,datasets,criterion,optimizer,device,interval):
    """Run one pass over ``datasets``, updating ``model`` after every batch.

    Args:
        model: Module called with a batch of input ids; returns logits.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that owns the model's parameters.
        device: Device the batches are moved to before the forward pass.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    step_losses = []

    for step, batch in enumerate(datasets):
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        # Add a trailing dimension so the labels line up with the logits.
        labels = labels.to(device).unsqueeze(dim=1)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss = criterion(model(input_ids), labels)
        loss.backward()
        optimizer.step()

        step_losses.append(loss.item())

        if not step % interval:
            print(f'Train Loss {step} : {np.mean(step_losses)}')

def test(model,datasets,criterion,device):
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids,labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits,labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat,labels).cpu().tolist()
        )
    print(f'Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}')


In [47]:
# Number of full passes over the training data, and how often (in steps)
# the running training loss is printed.
epochs = 5
interval = 500

# Each epoch: one training pass, then an evaluation pass on the held-out loader.
for epoch in range(epochs):
    train(classifier,train_loader,criterion,optimizer,device,interval)
    test(classifier,test_loader,criterion,device)

Train Loss 0 : 0.6927856802940369
Train Loss 500 : 0.6938404606249994
Train Loss 1000 : 0.6865301315898781
Train Loss 1500 : 0.6721761725808206
Train Loss 2000 : 0.6626201713609433
Train Loss 2500 : 0.6550581421746297
Val Loss : 0.6011598525336757, Val Accuracy : 0.6978
Train Loss 0 : 0.4339098632335663
Train Loss 500 : 0.5744141944630179
Train Loss 1000 : 0.5592193121200317
Train Loss 1500 : 0.5391078550644035
Train Loss 2000 : 0.5238093916235538
Train Loss 2500 : 0.5139217551042442
Val Loss : 0.45824491520659233, Val Accuracy : 0.7834
Train Loss 0 : 0.4779515266418457
Train Loss 500 : 0.41523371975935863
Train Loss 1000 : 0.41677073037975676
Train Loss 1500 : 0.41582335488030625
Train Loss 2000 : 0.4114315731824189
Train Loss 2500 : 0.4093450354468055
Val Loss : 0.417715231855266, Val Accuracy : 0.8128
Train Loss 0 : 0.5705158114433289
Train Loss 500 : 0.3585417429487148
Train Loss 1000 : 0.36271222055955726
Train Loss 1500 : 0.3651913345321705
Train Loss 2000 : 0.36353966742865684
T

### 학습된 모델로부터 임베딩 추출 <hr>

In [48]:
# Copy the trained embedding table out of the model as a NumPy array
# (detached from autograd and moved to the CPU first).
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()

# Row i of the matrix is the vector for vocab[i]; pair them up by position.
token_to_embedding = dict(zip(vocab, embedding_matrix))

# Show one example token together with its learned vector.
token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [-2.13147354e+00 -9.73338604e-01 -7.23130524e-01  8.38033617e-01
  2.92781144e-01  4.16226745e-01  1.17933583e-02 -1.00700533e+00
 -2.69620836e-01  7.94402435e-02  5.83757579e-01  7.15165377e-01
 -5.73228717e-01  1.74757552e+00 -1.28089988e+00  2.19189852e-01
 -1.48195565e+00  6.85004532e-01 -1.71868369e-01  1.21109508e-01
  6.44406319e-01  1.33566356e+00  6.51631832e-01  5.33246994e-01
  2.14045838e-01 -1.61191058e+00  7.25900084e-02 -1.36394358e+00
 -4.77418393e-01  7.22214222e-01  3.01264822e-01 -1.42810106e-01
  2.77615190e-01 -9.24553692e-01 -6.60686970e-01 -8.07152867e-01
 -1.94810987e+00  7.05322981e-01 -9.50230122e-01 -1.23959124e+00
  1.25406277e+00 -1.51167369e+00  9.22580957e-01 -1.43614089e+00
 -1.67827860e-01 -6.22930765e-01 -1.26506555e+00  5.57834804e-02
 -1.46782899e+00  9.10904408e-01 -8.59702826e-01 -4.51323569e-01
 -1.33559191e+00 -1.22612394e-01  1.56435668e-01  1.49809372e+00
 -8.72326076e-01  2.04337025e+00 -1.29371965e+00 -8.08906615e-01
  1.34814525e+00 -1.

### 사전 학습된 모델로 임베딩 계층 초기화 <hr>

In [None]:
from gensim.models import Word2Vec

# Load a previously trained Word2Vec model.
# NOTE(review): `load()` is called without a path here and this cell has not
# been executed; gensim's loader presumably needs the saved model's file
# path as its argument — TODO supply it and confirm.
word2vec = Word2Vec.load()