In [1]:
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soynlp.hangle import jamo_levenshtein

In [2]:
from kor2vec import Kor2Vec

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data as D

In [4]:
import re
import codecs
import pandas as pd
import pickle

In [5]:
# Default hyper-parameters; TrainModel uses them as the defaults for its
# embedding_dim / hidden_size / label_size arguments.
# Width of each embedded token fed to the LSTM (the training loop's comment
# describes the Kor2Vec output as (batch_size, seq_len, 128)).
EMBEDDING_DIM = 128
# Size of the LSTM hidden state.
HIDDEN_SIZE = 10
# Output width of the final linear layer, i.e. how many labels are scored.
LABEL_SIZE = 7

In [7]:
class SentenceDataset(D.Dataset):
    """Question/label pairs loaded from an Excel workbook.

    The workbook must contain a ``question`` column (raw text) and a
    ``label`` column. Questions are stripped of everything except spaces and
    Hangul and kept as plain Python strings, because strings cannot be stored
    in a tensor; embedding them is left to the consumer of the dataset.

    Args:
        filename: Path of the workbook *without* the ``.xlsx`` extension.

    Attributes:
        fileName: The ``filename`` argument, kept for reference.
        x_data: List of cleaned question strings.
        y_data: 1-D ``torch.long`` tensor of labels, aligned with ``x_data``.
        len: Number of samples.
    """

    # Every run of characters that is neither a space, a Hangul jamo (ㄱ-ㅣ)
    # nor a Hangul syllable (가-힣). Compiled once instead of once per call.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣]+')

    def __init__(self, filename):
        self.fileName = filename

        # Read the workbook once and take both columns from the same frame.
        frame = pd.read_excel(filename + '.xlsx')
        print(' set dataset')
        print('read data from ', filename)

        # Empty cells come back as NaN; turn them into empty strings so the
        # regex below always receives text.
        questions = frame['question'].fillna('').astype(str)
        self.x_data = [self.onlyKorean(question) for question in questions]
        print('delete punctuation marks from data')

        self.len = len(self.x_data)
        # NLLLoss-style criteria expect integer class indices.
        # NOTE(review): assumes the label column is numeric with no blanks — confirm.
        self.y_data = torch.tensor(frame['label'].to_numpy(), dtype=torch.long)

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything but spaces and Hangul removed."""
        return self._NON_KOREAN.sub('', sentence)

    def __getitem__(self, index):
        """Return the ``(question string, label tensor)`` pair at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [8]:
# The embedding model is kept separate from this classifier.
class SentenceClassifier(nn.Module):
    """LSTM sentence classifier over pre-computed token embeddings.

    Args:
        embedding_dim: Width of each input embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        label_size: Number of output labels.

    Attributes:
        hidden: ``(h, c)`` pair holding the (detached) final LSTM state of the
            most recent forward pass; zeros before the first call.
    """

    def __init__(self, embedding_dim, hidden_dim, label_size):
        super(SentenceClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        # batch_first=True so the input layout is (batch, seq_len, embedding_dim).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=1, device=None):
        """Return a zeroed ``(h_0, c_0)`` pair.

        Each tensor has shape ``(num_layers=1, batch_size, hidden_dim)``; this
        layout is the same regardless of ``batch_first``.
        """
        return (torch.zeros(1, batch_size, self.hidden_dim, device=device),
                torch.zeros(1, batch_size, self.hidden_dim, device=device))

    # x = embedded sentences, e.g. the output of the embedding model
    def forward(self, x):
        """Score every sentence in the batch.

        Args:
            x: Float tensor of shape ``(batch, seq_len, embedding_dim)``; a
                single un-batched ``(seq_len, embedding_dim)`` sentence is
                also accepted.

        Returns:
            Tensor of shape ``(batch, label_size)`` with log-probabilities,
            suitable for ``nn.NLLLoss``.
        """
        if x.dim() == 2:
            x = x.unsqueeze(0)

        # Start every call from a zero state sized for this batch, so batch
        # sizes may vary and no autograd graph leaks from a previous call.
        initial_state = self.init_hidden(x.size(0), device=x.device)
        _, (h_n, c_n) = self.lstm(x, initial_state)
        self.hidden = (h_n.detach(), c_n.detach())

        # Classify from the final hidden state of the (single) LSTM layer.
        label_space = self.hidden2label(h_n[-1])
        return F.log_softmax(label_space, dim=1)

In [9]:
class TrainModel():
    """Load the NLP resources and the dataset, then train a SentenceClassifier.

    Constructing an instance performs the whole pipeline, including training.

    Args:
        fileName: Dataset path without extension, handed to SentenceDataset.
        vocabFileName: Text file with one vocabulary word per line.
        tokenizerFileName: Pickled tokenizer.
        kor2vecFileName: Saved Kor2Vec embedding model.
        embedding_dim: Width of the vectors produced by the embedding model.
        hidden_size: LSTM hidden-state size of the classifier.
        label_size: Number of labels the classifier scores.
    """

    def __init__(self, fileName, 
                 vocabFileName = "./korean_embedding/vocab.txt", tokenizerFileName = "./korean_embedding/tokenizer.pkl", 
                 kor2vecFileName = "./korean_embedding/embedding.model", 
                 embedding_dim = EMBEDDING_DIM, hidden_size = HIDDEN_SIZE, label_size = LABEL_SIZE):

        self.fileName = fileName
        self.vocabFileName = vocabFileName
        self.tokenizerFileName = tokenizerFileName
        self.kor2vecFileName = kor2vecFileName

        self.readNLP()
        self.readDataset()

        # The model has to exist before an optimizer can be bound to its parameters.
        self.model = SentenceClassifier(embedding_dim, hidden_size, label_size)
        self.loss_function = nn.NLLLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1)

        self.trainModel()

    # Load the tokenizer, the Kor2Vec model and the vocabulary.
    def readNLP(self):
        """Populate ``self.tokenizer``, ``self.kor2vec`` and ``self.vocab``
        from the paths given to the constructor."""
        # The tokenizer is used to cope with spacing problems in the text.
        # SECURITY: pickle.load can execute arbitrary code; only load tokenizer
        # files that come from a trusted source.
        # NOTE(review): the AttributeError about torch._utils._rebuild_parameter
        # recorded in this notebook presumably means a stored artifact was
        # saved with a newer torch than the one installed — confirm.
        with open(self.tokenizerFileName, 'rb') as f:
            self.tokenizer = pickle.load(f)

        # model
        self.kor2vec = Kor2Vec.load(self.kor2vecFileName)

        # vocab: one word per line. Only the line terminator is stripped, so a
        # final line without a trailing newline keeps its last character.
        # NOTE(review): the file is opened with the platform default encoding,
        # as before — confirm it matches how vocab.txt was written.
        with open(self.vocabFileName, 'r') as f:
            self.vocab = [line.rstrip('\n') for line in f]

    def readDataset(self):
        """Build the dataset, split it 80/20 and create both data loaders."""
        self.dataset = SentenceDataset(self.fileName)

        # Split into train and test; random_split needs integer lengths that
        # add up to the dataset size.
        total_len = len(self.dataset)
        train_len = int(total_len * 0.8)
        test_len = total_len - train_len

        self.train_data, self.test_data = D.random_split(self.dataset, lengths=[train_len, test_len])

        # num_workers=0: worker processes cannot import a Dataset class that
        # is defined inside a notebook on platforms that spawn workers (Windows).
        self.train_loader = D.DataLoader(dataset = self.train_data,
                                         batch_size = 32,
                                         shuffle = True,
                                         num_workers = 0)
        self.test_loader = D.DataLoader(dataset = self.test_data,
                                        batch_size = 32,
                                        shuffle = True,
                                        num_workers = 0)

    def trainModel(self, epochs = 300):
        """Train ``self.model`` on ``self.train_loader``.

        Args:
            epochs: Number of passes over the training data (default 300, the
                value that used to be hard-coded).
        """
        self.model.train()
        for epoch in range(epochs):
            for sentences, targets in self.train_loader:
                # The embedding model is not part of the optimizer, so no
                # gradients are needed while embedding the batch.
                with torch.no_grad():
                    x = self.kor2vec.to_seqs(list(sentences), seq_len=10) # tensor(batch_size, seq_len, char_seq_len)
                    x = self.kor2vec(x) # tensor(batch_size, seq_len, 128)

                # Clear gradients left over from the previous batch.
                self.model.zero_grad()
                # Clear out the hidden state of the LSTM.
                self.model.hidden = self.model.init_hidden()

                # Forward pass: log-probabilities per label for each sentence.
                tag_scores = self.model(x)

                # Compute the loss, back-propagate and update the parameters.
                loss = self.loss_function(tag_scores, targets.long())
                loss.backward()
                self.optimizer.step()

In [None]:
#kor2vec = Kor2Vec.load("../model/path")
## or kor2vec = SejongVector()

#lstm = nn.LSTM(128, 64, batch_first=True)
#dense = nn.Linear(64, 1)

## Make tensor input
#sentences = ["이 영화는 정말 대박이에요", "우와 진짜 재미있었어요"]

#x = kor2vec.to_seqs(sentences, seq_len=10)
## >>> tensor(batch_size, seq_len, char_seq_len)

#x = kor2vec(x) # tensor(batch_size, seq_len, 128)
#_, (x, xc) = lstm(x) # tensor(batch_size, 64)
#x = dense(x) # tensor(batch_size, 1)

## test = vocab.kor2vec.embedding("김동호 교수님 수업 어때?")

##input = vocab.kor2vec.to_seqs(["김동호 교수님 수업 어때?", "컴퓨터보안"], seq_len=6)
## vocab.kor2vec.forward(input)

In [10]:
# Build the trainer for this dataset. The path is given without an extension
# (SentenceDataset appends '.xlsx'). TrainModel.__init__ also loads the
# embedding resources and runs training, so this single call does all the work.
tm = TrainModel("./dataset/2019_01_06_10차_RAN")

AttributeError: Can't get attribute '_rebuild_parameter' on <module 'torch._utils' from 'C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\torch\\_utils.py'>