## 준비된 데이터셋을 사용하여 vocab, tokenizer, kor2vec, Classifier를 학습, 저장하는 코드

In [1]:
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soynlp.hangle import jamo_levenshtein

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data as D

In [3]:
from kor2vec import Kor2Vec

In [4]:
import re
import codecs
import pandas as pd

In [5]:
import pickle

In [6]:
import sys

In [7]:
class TrainVocab:
    """Build and persist the text-processing artifacts used by the classifier.

    Starting from an Excel file that has a ``question`` column, the pipeline
    (see ``setEverything``) does the following, in order:

    1. reads the questions and strips everything except spaces and Hangul,
    2. trains a soynlp ``MaxScoreTokenizer`` and pickles it,
    3. writes a tokenized corpus file,
    4. writes a vocabulary file (one unique token per line),
    5. trains a ``Kor2Vec`` embedding on the corpus file and saves it.

    Args:
        fileName: path of the Excel dataset to read.
        corpusFileName: where the tokenized corpus is written.
        logFileName: log path handed to ``Kor2Vec.train``.
        vocabFileName: where the vocabulary is written.
        tokenizerFileName: where the pickled tokenizer is written.
        kor2vecFileName: where the trained embedding is saved.
    """

    # Matches every run of characters that is neither a space nor inside the
    # ranges ㄱ-ㅣ / 가-힣.  Compiled once instead of once per sentence.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣 ]+')

    def __init__(self, fileName, corpusFileName = "./nlp/train_data.corpus", 
                 logFileName = "./nlp/training_log/kor2vec_log", vocabFileName = "./nlp/vocab.txt",
                tokenizerFileName = "./nlp/tokenizer.pkl", kor2vecFileName = "./nlp/embedding.model"):
        self.fileName = fileName
        self.corpusFileName = corpusFileName
        self.logFileName = logFileName
        self.vocabFileName = vocabFileName
        self.tokenizerFileName = tokenizerFileName
        self.kor2vecFileName = kor2vecFileName

    def setEverything(self):
        """Run the whole pipeline; each step relies on the previous one."""
        self.readDataset()
        self.setTokenizer()
        self.makeCorpusFile()
        self.makeVocabFile()
        self.setKor2Vec()

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything but spaces and Hangul removed.

        Spaces are kept, so removing a token that sat between two spaces
        leaves both spaces behind.
        """
        return self._NON_KOREAN.sub('', sentence)

    def readDataset(self):
        """Load the ``question`` column into ``self.question`` and clean it."""
        questions = pd.read_excel(self.fileName)['question']
        print(' read question data from ', self.fileName)
        # Element-wise map instead of ``series[i] = ...`` in a range loop:
        # the loop looked values up by label and therefore only worked with
        # the default 0..n-1 index.
        self.question = questions.map(self.onlyKorean)

        print('delete punctuation marks from data')

    def calWordScores(self):
        """Train a soynlp ``WordExtractor`` on the questions.

        Returns:
            The mapping produced by ``WordExtractor.extract()``.
        """
        word_extractor = WordExtractor(
            max_left_length=20,
            max_right_length=20,
            min_frequency = 30,
            min_cohesion_forward = 0.05,
            min_right_branching_entropy = 0.0
        )
        word_extractor.train(self.question)
        word_scores = word_extractor.extract()
        print('extract and calculate ', len(word_scores), ' words')
        return word_scores

    def setTokenizer(self):
        """Build the tokenizer, keep it on ``self.tokenizer`` and pickle it."""
        print(' set Tokenizer')
        word_scores = self.calWordScores()
        self.tokenizer = self.trainTokenizer(word_scores)
        with open(self.tokenizerFileName, 'wb') as f:
            pickle.dump(self.tokenizer, f)
        print('Tokenizer saved in ',self.tokenizerFileName)

    def trainTokenizer(self, word_scores):
        """Create a ``MaxScoreTokenizer`` scored by forward cohesion.

        Args:
            word_scores: mapping of word -> score object exposing
                ``cohesion_forward`` (as returned by ``calWordScores``).
        """
        cohesion_scores = {word:score.cohesion_forward for word, score in word_scores.items()}
        tokenizer = MaxScoreTokenizer(scores = cohesion_scores)
        print('train tokenizer')
        return tokenizer

    def makeCorpusFile(self):
        """Write one space-joined, tokenized question per line (UTF-8)."""
        print(' make corpus file')
        # Tokenize everything first so a tokenizer failure does not leave a
        # truncated corpus file behind.
        lines = [" ".join(self.tokenizer.tokenize(q)) for q in self.question]
        # codecs.open keeps the explicit "\r\n" terminator untouched on every
        # platform (no newline translation); ``with`` closes the handle even
        # if a write fails.
        with codecs.open(self.corpusFileName, 'w', encoding='utf8') as f:
            for line in lines:
                f.write(line + "\r\n")
        print('corpus file saved in ', self.corpusFileName)

    def makeVocabFile(self):
        """Write every distinct token, in first-seen order, one per line."""
        print(' make vocab file')
        vocab = []
        seen = set()  # O(1) membership test instead of scanning the list
        for q in self.question:
            # Unlike the corpus, the vocabulary is built from the question
            # with all spaces removed.
            for w in self.tokenizer.tokenize(q.replace(" ", "")):
                if w not in seen:
                    seen.add(w)
                    vocab.append(w)

        # NOTE(review): written with the platform default encoding, matching
        # how the rest of this file reads it back; switch both sides together
        # if an explicit encoding is wanted.
        with open(self.vocabFileName, 'w') as f:
            for v in vocab:
                f.write(v + "\n")
        print('vocab file saved in ', self.vocabFileName)

    def setKor2Vec(self):
        """Train a 128-dimensional ``Kor2Vec`` on the corpus file and save it."""
        self.kor2vec = Kor2Vec(embed_size=128)
        self.kor2vec.train(self.corpusFileName, self.logFileName, batch_size=128)
        self.kor2vec.save(self.kor2vecFileName) # saving embedding
        print('Kor2Vec saved in ', self.kor2vecFileName)

In [8]:
# Driver cell: build the tokenizer, corpus, vocab and Kor2Vec embedding from
# the full dataset. The commented-out lines are the command-line variant
# that takes the dataset path from sys.argv[1] instead of a fixed path.
# datasetFileName = sys.argv[1]
vocab = TrainVocab("./dataset/TotalDataset.xlsx")
# vocab = TrainVocab(datasetFileName)
vocab.setEverything()

 read question data from  ./dataset/TotalDataset.xlsx
delete punctuation marks from data
 set Tokenizer
training was done. used memory 0.163 Gbory 0.149 Gb
all cohesion probabilities was computed. # words = 2530
all branching entropies was computed # words = 8882
all accessor variety was computed # words = 8882
extract and calculate  1473  words
train tokenizer
Tokenizer saved in  ./nlp/tokenizer.pkl
 make corpus file
corpus file saved in  ./nlp/train_data.corpus
 make vocab file
vocab file saved in  ./nlp/vocab.txt
Reading Corpus lines


Spliting Lines: 100%|████████████████████████████████████████████████████████| 15732/15732 [00:00<00:00, 192347.44it/s]
Corpus Sampling: 100%|█████████████████████████████████████████████████████████| 15732/15732 [00:02<00:00, 6052.94it/s]


Training kor2vec
Loading Word_sample corpus
Loading corpus finished
CUDA Available/count: False 0
training on  cpu


EP 0: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:22<00:00,  3.72it/s]


{'epoch': 0, 'train_ep_loss': 1.3620213416626412}


EP 1: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:17<00:00,  3.80it/s]


{'epoch': 1, 'train_ep_loss': 1.084679505106304}


EP 2: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:16<00:00,  3.83it/s]


{'epoch': 2, 'train_ep_loss': 1.0286114386828298}


EP 3: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:32<00:00,  3.55it/s]


{'epoch': 3, 'train_ep_loss': 0.994489808481528}


EP 4: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:14<00:00,  3.87it/s]


{'epoch': 4, 'train_ep_loss': 0.9701039365642415}


EP 5: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:23<00:00,  3.69it/s]


{'epoch': 5, 'train_ep_loss': 0.9521244436621191}


EP 6: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:24<00:00,  3.69it/s]


{'epoch': 6, 'train_ep_loss': 0.9323679735144454}


EP 7: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:23<00:00,  3.70it/s]


{'epoch': 7, 'train_ep_loss': 0.9172853212432558}


EP 8: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:22<00:00,  3.71it/s]


{'epoch': 8, 'train_ep_loss': 0.9033639309494935}


EP 9: 100%|██████████████████████████████████████████████████████████████████████████| 753/753 [03:22<00:00,  3.72it/s]


{'epoch': 9, 'train_ep_loss': 0.890222750534891}
Kor2Vec saved in  ./nlp/embedding.model


In [9]:
# Hyper-parameters shared by the classifier cells below.
# Input feature size of the LSTM; same value as the embed_size=128 passed to
# Kor2Vec in TrainVocab.setKor2Vec.
EMBEDDING_DIM = 128
# Default LSTM hidden size (TrainModel's hidden_size default).
HIDDEN_SIZE = 64
# Default number of output classes (TrainModel's label_size default).
LABEL_SIZE = 7
# Used for both DataLoaders and for the shape of the LSTM's initial hidden
# state, so every batch must contain exactly this many samples.
BATCH_SIZE = 23
# Default number of passes over the training split.
EPOCH = 15

# The bare string below is a no-op expression kept as a note of an earlier
# run with Epoch = 50.
'''
Epoch = 50
    model test result :  2618
    2618
    model test result :  2618 / 2622
    99.84744469870328 %
'''

# seq_len passed to Kor2Vec.to_seqs; SentenceClassifier.forward reads LSTM
# output at index 9, i.e. SEQ_LEN - 1.
SEQ_LEN = 10

In [10]:
class SentenceDataset(D.Dataset):
    """Dataset of (cleaned question string, label) pairs read from Excel.

    The file must contain a ``question`` column and a ``label`` column.
    Questions are cleaned with ``onlyKorean`` at construction time.

    Args:
        fileName: path of the Excel file to load.
    """

    # Compiled once; matches runs of characters that are neither a space nor
    # inside the ranges ㄱ-ㅣ / 가-힣.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣 ]+')

    def __init__(self, fileName):
        self.fileName = fileName
        # Parse the workbook once and take both columns from the same frame
        # (it used to be read from disk twice).
        frame = pd.read_excel(self.fileName)
        label = frame['label']
        sentence = frame['question']
        print(' set dataset')
        print('read data from ', self.fileName)

        # Element-wise map instead of positional ``sentence[i] = ...``, which
        # depended on the default 0..n-1 index.
        sentence = sentence.map(self.onlyKorean)
        print('delete punctuation marks from data')

        self.len = len(sentence)
        self.x_data = sentence.values
        self.y_data = label.values

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything but spaces and Hangul removed.

        Spaces are preserved. The previous implementation also called
        ``result.replace(" ", "")`` but discarded the return value, so spaces
        were never actually stripped; that dead statement is removed and the
        effective behaviour is kept unchanged.
        """
        return self._NON_KOREAN.sub('', sentence)

    def __getitem__(self, index):
        """Return the ``(question, label)`` pair at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows loaded from the file."""
        return self.len

In [11]:
# The embedding model is kept separate; this module only sees embedded input.
class SentenceClassifier(nn.Module):
    """Single-layer LSTM followed by a linear layer and log-softmax.

    Args:
        embedding_dim: size of each input time-step vector.
        hidden_dim: LSTM hidden size.
        label_size: number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, label_size):
        super(SentenceClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a zeroed ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: number of sequences in the batch. When omitted, the
                module-level ``BATCH_SIZE`` is used, as before.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

    def forward(self, x):
        """Classify a batch of embedded sentences.

        Args:
            x: tensor indexed as (batch, time step, feature), since the LSTM
                is built with ``batch_first=True``.

        Returns:
            Log-probabilities with one row per sequence and one column per
            label.
        """
        batch_size = x.size(0)
        # The stored state was sized for BATCH_SIZE; a batch of any other size
        # (e.g. the final partial batch of a DataLoader) used to crash inside
        # the LSTM. Re-create the state when the sizes disagree.
        if self.hidden is None or self.hidden[0].size(1) != batch_size:
            self.hidden = self.init_hidden(batch_size)

        lstm_out, self.hidden = self.lstm(x, self.hidden)
        # Use the final time step. This was hard-coded as index 9, which is
        # the final step only when the sequence length is exactly 10.
        last_step = lstm_out[:, -1, :]
        y = self.hidden2label(last_step)

        result = F.log_softmax(y, dim=1)

        return result

In [12]:
class TrainModel():
    """Train, save and evaluate a ``SentenceClassifier`` on an Excel dataset.

    Construction loads the pickled tokenizer, the Kor2Vec embedding and the
    vocabulary, reads the dataset, splits it 5:1 into train/test loaders and
    builds the model, loss and optimizer.

    Args:
        fileName: Excel dataset path (``question`` and ``label`` columns).
        vocabFileName: vocabulary file, one token per line.
        tokenizerFileName: pickled tokenizer.
        kor2vecFileName: saved Kor2Vec embedding.
        classifierFileName: where ``saveModel`` pickles the classifier.
        embedding_dim: LSTM input size.
        hidden_size: LSTM hidden size.
        label_size: number of classes.
        epoch: number of training epochs.
        seq_len: sequence length passed to ``Kor2Vec.to_seqs``.
    """

    def __init__(self, fileName, 
                 vocabFileName = "./nlp/vocab.txt", tokenizerFileName = "./nlp/tokenizer.pkl", 
                 kor2vecFileName = "./nlp/embedding.model", classifierFileName = "./nlp/classifier.model", 
                 embedding_dim = EMBEDDING_DIM, hidden_size = HIDDEN_SIZE, label_size = LABEL_SIZE, epoch = EPOCH, seq_len = SEQ_LEN):

        self.fileName = fileName
        self.vocabFileName = vocabFileName
        self.tokenizerFileName = tokenizerFileName
        self.kor2vecFileName = kor2vecFileName
        self.classifierFileName = classifierFileName

        self.readNLP()
        self.readDataset()

        self.model = SentenceClassifier(embedding_dim, hidden_size, label_size)

        # NOTE(review): the model already returns log_softmax output and
        # CrossEntropyLoss applies log_softmax again. log_softmax is
        # idempotent, so the loss value is the same as NLLLoss on the model
        # output; left as is.
        self.loss_function = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1)
        self.seq_len = seq_len
        self.epoch = epoch

    def trainStart(self):
        """Train the classifier, then pickle it to ``classifierFileName``."""
        self.trainModel()
        self.saveModel()

    def readNLP(self):
        """Load the tokenizer, the Kor2Vec embedding and the vocabulary."""
        # The tokenizer is used to cope with inconsistent spacing.
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load tokenizer files produced by a trusted run.
        with open(self.tokenizerFileName,'rb') as f:
            self.tokenizer = pickle.load(f)

        # embedding model
        self.kor2vec = Kor2Vec.load(self.kor2vecFileName)

        # vocab: one token per line. Strip only the newline; slicing off the
        # last character would eat a real character when the final line has
        # no trailing newline.
        with open(self.vocabFileName, 'r') as f:
            self.vocab = [line.rstrip("\n") for line in f]

    def readDataset(self):
        """Load the dataset and build shuffled train/test ``DataLoader``s."""
        self.dataset = SentenceDataset(self.fileName)

        # Split into train and test: five sixths (rounded) go to training.
        total_len = len(self.dataset)
        train_len = int(round(float((total_len / 6) * 5)))
        test_len = total_len - train_len

        print("train len : ", train_len)
        print("test len : ", test_len)

        self.train_data, self.test_data = D.random_split(self.dataset, lengths=[train_len, test_len])

        self.train_loader = D.DataLoader(dataset = self.train_data,
                                  batch_size = BATCH_SIZE,
                                  shuffle = True)
        self.test_loader = D.DataLoader(dataset = self.test_data,
                                  batch_size = BATCH_SIZE,
                                  shuffle = True)

    def trainModel(self):
        """Run ``self.epoch`` epochs of SGD over the training loader.

        After each epoch the loss of the *last* batch is printed (``None``
        when the loader yielded no batch).
        """
        for e in range(self.epoch):
            # Defined up front so an empty loader does not raise NameError at
            # the print below.
            loss = None
            for data in self.train_loader:
                x = list(data[0])
                y = data[1]
                x = self.kor2vec.to_seqs(x, seq_len = self.seq_len)
                # tensor(batch_size, seq_len, char_seq_len)
                x = self.kor2vec(x)
                # tensor(batch_size, seq_len, 128)

                self.model.zero_grad()
                # Fresh hidden state per batch so no history carries over.
                self.model.hidden = self.model.init_hidden()
                # run our forward pass.
                result = self.model(x)

                # compute the loss, gradients, and update the parameters by
                #  calling optimizer.step()
                loss = self.loss_function(result, y)
                loss.backward()
                self.optimizer.step()
            print(e, "(loss : ", loss, ")")

    def makeLabeltoTensor(self, label, label_size = 7):
        """One-hot encode an iterable of integer class ids.

        Args:
            label: iterable of values convertible with ``int``.
            label_size: width of each one-hot row (defaults to the previously
                hard-coded 7).

        Returns:
            A ``torch.long`` tensor with one row per label, or an empty
            ``torch.zeros(0, 0)`` tensor when ``label`` is empty.
        """
        indices = [int(l) for l in label]
        if not indices:
            return torch.zeros(0, 0)

        # Allocate once instead of growing the tensor with repeated torch.cat.
        result = torch.zeros([len(indices), label_size], dtype=torch.long)
        for row, idx in enumerate(indices):
            result[row][idx] = 1

        return result

    def saveModel(self):
        """Pickle the whole classifier object to ``classifierFileName``."""
        with open(self.classifierFileName, 'wb') as f:
            pickle.dump(self.model, f)

    def test(self):
        """Evaluate on the test loader and print the hit count and accuracy."""
        correct = 0
        total = 0  # renamed from ``all`` to stop shadowing the builtin
        # Evaluation needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for data in self.test_loader:
                x = list(data[0])
                y = data[1]
                x = self.kor2vec.to_seqs(x, seq_len = self.seq_len)
                # tensor(batch_size, seq_len, char_seq_len)
                x = self.kor2vec(x)
                # tensor(batch_size, seq_len, 128)

                self.model.hidden = self.model.init_hidden()
                result = self.model(x)

                _, predicted = torch.max(result, 1)

                correct += int((predicted == y).sum())
                total += len(y)

        print("model test result : ", correct)
        print(correct)
        print("model test result : ", correct, "/", total)
        # Guard against an empty test split instead of dividing by zero.
        if total:
            print((correct / total) * 100, "%")

    def debug(self):
        """Print a fixed marker string."""
        print("debug")

In [13]:
# Driver cell: train the classifier on the full dataset and pickle it.
# The commented-out line is an alternative run against a different workbook.
# tm = TrainModel(fileName = "./dataset/2019_01_06_10차_RAN.xlsx")
tm = TrainModel(fileName = "./dataset/TotalDataset.xlsx")
tm.trainStart()

 set dataset
read data from  ./dataset/TotalDataset.xlsx
delete punctuation marks from data
train len :  13110
test len :  2622
0 (loss :  tensor(1.0175, grad_fn=<NllLossBackward>) )
1 (loss :  tensor(0.1352, grad_fn=<NllLossBackward>) )
2 (loss :  tensor(0.2829, grad_fn=<NllLossBackward>) )
3 (loss :  tensor(0.0957, grad_fn=<NllLossBackward>) )
4 (loss :  tensor(0.0043, grad_fn=<NllLossBackward>) )
5 (loss :  tensor(0.0048, grad_fn=<NllLossBackward>) )
6 (loss :  tensor(0.0048, grad_fn=<NllLossBackward>) )
7 (loss :  tensor(0.0018, grad_fn=<NllLossBackward>) )
8 (loss :  tensor(0.0131, grad_fn=<NllLossBackward>) )
9 (loss :  tensor(0.0010, grad_fn=<NllLossBackward>) )
10 (loss :  tensor(0.0007, grad_fn=<NllLossBackward>) )
11 (loss :  tensor(0.0012, grad_fn=<NllLossBackward>) )
12 (loss :  tensor(0.0004, grad_fn=<NllLossBackward>) )
13 (loss :  tensor(0.0004, grad_fn=<NllLossBackward>) )
14 (loss :  tensor(0.0005, grad_fn=<NllLossBackward>) )


In [14]:
# Evaluate the trained classifier on the held-out split and print accuracy.
tm.test()

model test result :  2616
2616
model test result :  2616 / 2622
99.77116704805492 %
