In [1]:
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soynlp.hangle import jamo_levenshtein

In [2]:
from kor2vec import Kor2Vec

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data as D

In [4]:
import re
import codecs
import pandas as pd
import pickle

In [5]:
# Hyper-parameters shared by the dataset loaders, the classifier and the trainer.

# Input feature size of the classifier's LSTM (default `embedding_dim` of TrainModel);
# the notebook's comments describe the Kor2Vec output as (batch_size, seq_len, 128).
EMBEDDING_DIM = 128
# Size of the LSTM hidden state (default `hidden_size` of TrainModel).
HIDDEN_SIZE = 64
# Number of output classes produced by the final linear layer (default `label_size`).
LABEL_SIZE = 7
# Mini-batch size used by both DataLoaders and as the default batch dimension
# of the classifier's initial hidden state.
BATCH_SIZE = 54
# Number of passes over the training split (default `epoch` of TrainModel).
EPOCH = 50
# Value passed as `seq_len` to Kor2Vec.to_seqs (default `seq_len` of TrainModel).
SEQ_LEN = 10

In [6]:
class SentenceDataset(D.Dataset):
    """Dataset of (question, label) pairs read from an Excel workbook.

    The workbook ``fileName + '.xlsx'`` must contain a ``question`` column
    (text) and a ``label`` column. Every question is reduced to spaces and
    Korean characters before being stored.

    Attributes:
        fileName: Path of the workbook without the ``.xlsx`` extension.
        len: Number of samples.
        x_data: NumPy array of cleaned question strings.
        y_data: NumPy array of labels, aligned with ``x_data``.
    """

    # Matches every run of characters that is neither a space, a Hangul
    # jamo (ㄱ-ㅣ) nor a Hangul syllable (가-힣). Compiled once for the class
    # instead of on every call.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣]+')

    def __init__(self, fileName):
        """Load and clean the dataset.

        Args:
            fileName: Workbook path without the ``.xlsx`` extension.

        Raises:
            KeyError: If the ``label`` or ``question`` column is missing.
            TypeError: If a question cell is not a string (e.g. an empty cell).
        """
        self.fileName = fileName
        # Parse the workbook once; both columns come from the same frame.
        frame = pd.read_excel(self.fileName + '.xlsx')
        label = frame['label']
        print(' set dataset')
        print('read data from ', self.fileName)

        # Produces a new Series instead of assigning element by element,
        # so it does not depend on the frame having a 0..n-1 index.
        sentence = frame['question'].map(self.onlyKorean)
        print('delete punctuation marks from data')

        self.len = len(sentence)
        self.x_data = sentence.values
        self.y_data = label.values

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything except spaces and Korean characters removed.

        Args:
            sentence: Text to clean.

        Returns:
            The cleaned string; punctuation, digits and Latin letters are dropped.
        """
        return self._NON_KOREAN.sub('', sentence)

    def __getitem__(self, index):
        """Return the ``(question, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [7]:
# embedding 모델은 따로
class SentenceClassifier(nn.Module):
    """Single-layer LSTM sentence classifier over pre-computed embeddings.

    The module receives already embedded sentences, runs them through an LSTM
    and maps the output of the last timestep to log-probabilities over the
    labels.

    Attributes:
        hidden_dim: Size of the LSTM hidden state.
        lstm: ``nn.LSTM`` created with ``batch_first=True``.
        hidden2label: Linear layer projecting the hidden state to label scores.
        hidden: ``(h, c)`` state handed to the LSTM on the next ``forward`` call.
    """

    def __init__(self, embedding_dim, hidden_dim, label_size):
        """Build the layers.

        Args:
            embedding_dim: Feature size of each input timestep.
            hidden_dim: Size of the LSTM hidden state.
            label_size: Number of output classes.
        """
        super(SentenceClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a zeroed ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: Number of sequences in the batch. Defaults to the
                module-level ``BATCH_SIZE`` so existing ``init_hidden()``
                callers keep their behavior.

        Returns:
            Tuple of two tensors shaped ``(1, batch_size, hidden_dim)``,
            i.e. (num_layers, minibatch_size, hidden_dim).
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

    def forward(self, x):
        """Classify a batch of embedded sentences.

        Args:
            x: Tensor shaped ``(batch, seq_len, embedding_dim)``; the LSTM is
                batch-first.

        Returns:
            Tensor shaped ``(batch, label_size)`` of log-probabilities.
        """
        batch_size = x.size(0)
        hidden = self.hidden
        # A stored state sized for a different batch (e.g. the last, smaller
        # batch of an epoch) would make the LSTM raise; start from zeros instead.
        if hidden is None or hidden[0].size(1) != batch_size:
            hidden = self.init_hidden(batch_size)

        lstm_out, new_hidden = self.lstm(x, hidden)
        # Keep the state for inspection/continuation but cut it from the
        # autograd graph so a later backward() never walks into this batch.
        self.hidden = tuple(state.detach() for state in new_hidden)

        # Use the last timestep whatever the sequence length is
        # (previously hard-coded to index 9, i.e. seq_len == 10).
        last_step = lstm_out[:, -1, :]
        y = self.hidden2label(last_step)

        return F.log_softmax(y, dim=1)

In [8]:
class TrainModel():
    """Train, evaluate and persist a SentenceClassifier on an Excel dataset.

    Construction loads the NLP resources (tokenizer, Kor2Vec embedding model,
    vocabulary), reads the dataset, splits it 80/20 into train and test
    loaders, and creates the classifier, loss and optimizer.
    """

    def __init__(self, fileName, 
                 vocabFileName = "./nlp/vocab.txt", tokenizerFileName = "./nlp/tokenizer.pkl", 
                 kor2vecFileName = "./nlp/embedding.model", classifierFileName = "./nlp/classifier.model", 
                 embedding_dim = EMBEDDING_DIM, hidden_size = HIDDEN_SIZE, label_size = LABEL_SIZE, epoch = EPOCH, seq_len = SEQ_LEN):
        """Load resources and data, then build the model.

        Args:
            fileName: Dataset workbook path without the ``.xlsx`` extension.
            vocabFileName: Text file with one vocabulary word per line.
            tokenizerFileName: Pickled tokenizer.
            kor2vecFileName: Saved Kor2Vec embedding model.
            classifierFileName: Destination of the pickled classifier.
            embedding_dim: Input feature size of the classifier.
            hidden_size: LSTM hidden size of the classifier.
            label_size: Number of classes.
            epoch: Number of training epochs.
            seq_len: Sequence length passed to ``Kor2Vec.to_seqs``.
        """
        self.fileName = fileName
        self.vocabFileName = vocabFileName
        self.tokenizerFileName = tokenizerFileName
        self.kor2vecFileName = kor2vecFileName  
        self.classifierFileName = classifierFileName
        self.label_size = label_size
        self.seq_len = seq_len
        self.epoch = epoch

        self.readNLP()
        self.readDataset()        

        self.model = SentenceClassifier(embedding_dim, hidden_size, label_size)

        # The classifier already returns log-probabilities (log_softmax), so
        # NLLLoss is the matching criterion. CrossEntropyLoss would apply a
        # second, redundant log_softmax and yield the same value.
        self.loss_function = nn.NLLLoss()
        # Only the classifier is optimized; the Kor2Vec embedding stays fixed.
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1)

    def trainStart(self):
        """Train the classifier, then save it to ``classifierFileName``."""
        self.trainModel()
        self.saveModel()

    def readNLP(self):
        """Load the tokenizer, the Kor2Vec model and the vocabulary list."""
        # The tokenizer is used to cope with spacing problems.
        # NOTE(review): pickle.load executes arbitrary code contained in the
        # file; only load tokenizer files from a trusted source.
        with open(self.tokenizerFileName,'rb') as f:
            self.tokenizer = pickle.load(f)

        # Embedding model.
        self.kor2vec = Kor2Vec.load(self.kor2vecFileName)

        # Vocabulary: one word per line. Strip only the newline so a final
        # line without a trailing newline keeps its last character.
        # NOTE(review): the file is read with the platform default encoding,
        # as before — confirm the vocab file's encoding if this runs elsewhere.
        with open(self.vocabFileName, 'r') as f:
            self.vocab = [line.rstrip('\n') for line in f]

    def readDataset(self):    
        """Read the dataset and build shuffled-train / ordered-test loaders (80/20 split)."""
        self.dataset = SentenceDataset(self.fileName)

        # Split into train and test.
        total = len(self.dataset)
        train_len = int(round(total * 0.8))
        test_len = total - train_len

        print("train len : ", train_len)
        print("test len : ", test_len)

        self.train_data, self.test_data = D.random_split(self.dataset, lengths=[train_len, test_len])

        self.train_loader = D.DataLoader(dataset = self.train_data,
                                  batch_size = BATCH_SIZE,
                                  shuffle = True)
        # Accuracy does not depend on sample order, so evaluation is not shuffled.
        self.test_loader = D.DataLoader(dataset = self.test_data,
                                  batch_size = BATCH_SIZE,
                                  shuffle = False)

    def _embed(self, sentences):
        """Turn a batch of raw sentences into the Kor2Vec tensor fed to the classifier."""
        # The embedding is not trained here (it is not in the optimizer), so
        # no autograd graph is needed for it.
        with torch.no_grad():
            seqs = self.kor2vec.to_seqs(list(sentences), seq_len = self.seq_len)
            return self.kor2vec(seqs)

    def _zeroHidden(self, batch_size):
        """Return a zeroed (h_0, c_0) pair sized for ``batch_size`` sequences."""
        shape = (1, batch_size, self.model.hidden_dim)
        return (torch.zeros(shape), torch.zeros(shape))

    def trainModel(self):
        """Run ``self.epoch`` epochs of SGD, printing the last batch loss of each epoch."""
        self.model.train()
        for e in range(self.epoch):
            loss = None  # stays None when the training loader yields no batch
            for sentences, y in self.train_loader:
                x = self._embed(sentences)

                # Clear gradients and start every batch from a fresh state
                # sized for this batch (the last batch may be smaller).
                self.model.zero_grad()
                self.model.hidden = self._zeroHidden(len(sentences))

                result = self.model(x)

                loss = self.loss_function(result, y)
                loss.backward()
                self.optimizer.step()
            print(e, "(loss : ", loss, ")")

    def makeLabeltoTensor(self, label):
        """One-hot encode an iterable of class indices.

        Args:
            label: Iterable of values convertible to ``int`` class indices.

        Returns:
            Long tensor shaped ``(len(label), label_size)``; an empty input
            returns ``torch.zeros(0, 0)`` as before.

        Raises:
            IndexError: If an index is outside ``label_size``.
        """
        indices = [int(l) for l in label]
        if not indices:
            return torch.zeros(0, 0)

        # Allocate once and set the hot entries, instead of concatenating row by row.
        result = torch.zeros(len(indices), self.label_size, dtype=torch.long)
        rows = torch.arange(len(indices))
        result[rows, torch.tensor(indices, dtype=torch.long)] = 1
        return result

    def saveModel(self):
        """Pickle the whole classifier module to ``classifierFileName``."""
        # NOTE(review): pickling the module ties the file to this class
        # definition; saving ``state_dict()`` is the more portable option but
        # would change the file format existing loaders expect.
        with open(self.classifierFileName, 'wb') as f:
            pickle.dump(self.model, f)

    def test(self):
        """Print the classifier's accuracy on the test split."""
        correct = 0
        total = 0

        self.model.eval()
        # Evaluation needs no gradients.
        with torch.no_grad():
            for sentences, y in self.test_loader:
                x = self._embed(sentences)

                self.model.hidden = self._zeroHidden(len(sentences))
                result = self.model(x)

                _, predicted = torch.max(result, 1)

                total += len(sentences)
                correct += int((predicted == y).sum().item())

        print("model test result : ", correct)
        print(correct)
        print("model test result : ", correct, "/", total)
        if total:
            print((correct / total) * 100, "%")
        else:
            print("no test samples")

    def debug(self):
        """Print a fixed marker string."""
        print("debug")

In [9]:
#kor2vec = Kor2Vec.load("../model/path")
## or kor2vec = SejongVector()

#lstm = nn.LSTM(128, 64, batch_first=True)
#dense = nn.Linear(64, 1)

## Make tensor input
#sentences = ["이 영화는 정말 대박이에요", "우와 진짜 재미있었어요"]

#x = kor2vec.to_seqs(sentences, seq_len=10)
## >>> tensor(batch_size, seq_len, char_seq_len)

#x = kor2vec(x) # tensor(batch_size, seq_len, 128)
#_, (x, xc) = lstm(x) # tensor(batch_size, 64)
#x = dense(x) # tensor(batch_size, 1)

## test = vocab.kor2vec.embedding("김동호 교수님 수업 어때?")

##input = vocab.kor2vec.to_seqs(["김동호 교수님 수업 어때?", "컴퓨터보안"], seq_len=6)
## vocab.kor2vec.forward(input)

In [10]:
# Build the trainer with the default NLP resource paths; the dataset is read
# from "./dataset/2019_01_06_10차_RAN.xlsx" (the ".xlsx" suffix is appended by
# SentenceDataset) and split 80/20 into train and test loaders.
tm = TrainModel(fileName = "./dataset/2019_01_06_10차_RAN")

 set dataset
read data from  ./dataset/2019_01_06_10차_RAN
delete punctuation marks from data
train len :  4320
test len :  1080


In [11]:
# Train for tm.epoch epochs (one loss line is printed per epoch), then pickle
# the classifier to tm.classifierFileName.
tm.trainStart()

0 (loss :  tensor(1.7813, grad_fn=<NllLossBackward>) )
1 (loss :  tensor(1.7190, grad_fn=<NllLossBackward>) )
2 (loss :  tensor(1.5357, grad_fn=<NllLossBackward>) )
3 (loss :  tensor(1.0777, grad_fn=<NllLossBackward>) )
4 (loss :  tensor(0.6396, grad_fn=<NllLossBackward>) )
5 (loss :  tensor(0.3683, grad_fn=<NllLossBackward>) )
6 (loss :  tensor(0.1014, grad_fn=<NllLossBackward>) )
7 (loss :  tensor(0.1436, grad_fn=<NllLossBackward>) )
8 (loss :  tensor(0.0552, grad_fn=<NllLossBackward>) )
9 (loss :  tensor(0.0248, grad_fn=<NllLossBackward>) )
10 (loss :  tensor(0.0328, grad_fn=<NllLossBackward>) )
11 (loss :  tensor(0.0135, grad_fn=<NllLossBackward>) )
12 (loss :  tensor(0.0167, grad_fn=<NllLossBackward>) )
13 (loss :  tensor(0.0105, grad_fn=<NllLossBackward>) )
14 (loss :  tensor(0.0087, grad_fn=<NllLossBackward>) )
15 (loss :  tensor(0.0100, grad_fn=<NllLossBackward>) )
16 (loss :  tensor(0.0060, grad_fn=<NllLossBackward>) )
17 (loss :  tensor(0.0045, grad_fn=<NllLossBackward>) )
18

In [12]:
# Print the accuracy on the test split.
# NOTE(review): the saved output of this cell is a NameError on `test_loader`,
# while TrainModel.test as defined in this notebook reads `self.test_loader` —
# the output looks stale; confirm by re-running on a fresh kernel.
tm.test()

NameError: name 'test_loader' is not defined

In [18]:
# Measure accuracy of the trained classifier on the test split.
correct = 0
total = 0  # named `total` so the builtin `all` is not shadowed in the kernel

# Evaluation needs no gradients.
with torch.no_grad():
    for sentences, y in tm.test_loader:
        x = tm.kor2vec.to_seqs(list(sentences), seq_len = tm.seq_len) # tensor(batch_size, seq_len, char_seq_len)
        x = tm.kor2vec(x) # tensor(batch_size, seq_len, 128)

        # Fresh zero state sized for this batch, so a smaller last batch works too.
        tm.model.hidden = (torch.zeros(1, len(sentences), tm.model.hidden_dim),
                           torch.zeros(1, len(sentences), tm.model.hidden_dim))
        result = tm.model(x)

        _, predicted = torch.max(result, 1)

        total += len(sentences)
        correct += int((predicted == y).sum().item())

print("model test result : ", correct)
print(correct)
print("model test result : ", correct, "/", total)
if total:
    print((correct / total) * 100, "%")
else:
    print("no test samples")

model test result :  1074
1074
model test result :  1074 / 1080
99.44444444444444 %


In [24]:
# Classify a couple of hand-written questions with the trained model.
sentences = ["김동호 교수님 강의 들을만 해요?", "자료구조와실습은 언제 듣는게 좋은가요?"]

# Inference needs no gradients.
with torch.no_grad():
    x = tm.kor2vec.to_seqs(sentences, seq_len = tm.seq_len) # tensor(batch_size, seq_len, char_seq_len)
    x = tm.kor2vec(x) # tensor(batch_size, seq_len, 128)

    # Size the initial state from the input instead of hard-coding a batch of 2.
    tm.model.hidden = (torch.zeros(1, len(sentences), tm.model.hidden_dim),
                       torch.zeros(1, len(sentences), tm.model.hidden_dim))
    # Call the module rather than .forward() so registered hooks still run.
    result = tm.model(x)

# Index of the highest log-probability = predicted label per sentence.
_, result = torch.max(result, 1)

print(result)

tensor([6, 0])
