In [1]:
import pickle
import re

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import pandas as pd

from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec

In [3]:
class Embedding:
    """Korean sentence embedding: a soynlp tokenizer plus a gensim Word2Vec model.

    ``setWord2Vec`` must be called once (or a trained instance must be
    unpickled) before any tokenize/vectorize method is used; it is what
    creates ``self.tokenizer`` and ``self.word2vec``.
    """

    # One or more characters that are neither a space nor Hangul
    # (compatibility jamo U+3131-U+3163, syllables U+AC00-U+D7A3).
    # Compiled once here instead of on every onlyKorean() call.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣]+')

    def setWord2Vec(self, fileName):
        """Train the tokenizer and the Word2Vec model from ``<fileName>.xlsx``.

        Args:
            fileName: Path of the Excel workbook WITHOUT the ``.xlsx``
                extension. The workbook must contain a ``question`` column.

        Side effects:
            Sets ``self.tokenizer`` and ``self.word2vec``.
        """
        raw_question = pd.read_excel(fileName + '.xlsx')['question']
        print(' read question data from ', fileName)

        # Build a fresh, cleaned Series rather than assigning element by
        # element into a column of the DataFrame. Empty cells (NaN) are
        # dropped and non-string cells are stringified, because the regex
        # substitution only accepts str.
        question = (
            raw_question
            .dropna()
            .astype(str)
            .map(self.onlyKorean)
            .reset_index(drop=True)
        )

        word_scores = self.calWordScores(question)
        self.tokenizer = self.trainTokenizer(word_scores)
        self.word2vec = self.trainWord2Vec(question)

    def calWordScores(self, question):
        """Run soynlp's ``WordExtractor`` over the sentences and return its scores.

        Args:
            question: Iterable of cleaned sentences (str).

        Returns:
            The mapping returned by ``WordExtractor.extract()``
            (word -> score object).
        """
        word_extractor = WordExtractor(
            max_left_length=20,
            max_right_length=20,
            min_frequency=20,
            min_cohesion_forward=0.05,
            min_right_branching_entropy=0.0
        )

        word_extractor.train(question)
        word_scores = word_extractor.extract()
        print(' extract and calculate ', len(word_scores), ' words')
        return word_scores

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything except spaces and Hangul removed.

        Args:
            sentence: Input text (str).

        Returns:
            str: The filtered text. Spaces are kept, so removing a word that
            sits between two spaces leaves both spaces in place.
        """
        return self._NON_KOREAN.sub('', sentence)

    def trainTokenizer(self, word_scores):
        """Build a ``MaxScoreTokenizer`` from forward-cohesion scores.

        Args:
            word_scores: Mapping word -> score object exposing
                ``cohesion_forward`` (as produced by ``calWordScores``).

        Returns:
            The constructed ``MaxScoreTokenizer``.
        """
        cohesion_scores = {
            word: score.cohesion_forward for word, score in word_scores.items()
        }
        tokenizer = MaxScoreTokenizer(scores=cohesion_scores)
        print(' train tokenizer')
        return tokenizer

    def trainWord2Vec(self, question):
        """Tokenize every sentence and train a skip-gram Word2Vec model on them.

        Requires ``self.tokenizer`` to be set already.

        Args:
            question: Iterable of cleaned sentences (str).

        Returns:
            The trained ``Word2Vec`` model (50-dimensional vectors).
        """
        tQuestion = [self.tokenizeSentence(q) for q in question]

        # NOTE(review): `size` / `iter` are the gensim 3.x keyword names;
        # gensim 4 renamed them to `vector_size` / `epochs`. Kept as-is to
        # match the gensim version this notebook was written against.
        word2vec = Word2Vec(
            tQuestion,
            size=50,
            window=2,
            min_count=1,
            iter=100,
            sg=1  # 1 selects skip-gram
        )
        print(' train word2vec')
        return word2vec

    def tokenizeSentence(self, sent):
        """Tokenize a single sentence with the trained tokenizer.

        Args:
            sent: One sentence (str).

        Returns:
            Whatever ``self.tokenizer.tokenize`` returns for ``sent``.
        """
        return self.tokenizer.tokenize(sent)

    def vectorizeWord(self, words):
        """Look up the word vectors for ``words``.

        Args:
            words: A list of words, e.g. ["김동호", "교수님"].

        Returns:
            The result of ``self.word2vec.wv[words]``.
            NOTE(review): words missing from the trained vocabulary are
            expected to make this lookup fail — confirm how callers should
            handle out-of-vocabulary input.
        """
        return self.word2vec.wv[words]

    def vectorizeSentence(self, sent):
        """Convert a list of raw sentences into a list of FloatTensors.

        Each sentence is stripped to Korean-only text, tokenized, and its
        tokens are looked up in the Word2Vec model.

        Args:
            sent: Iterable of sentences (str).

        Returns:
            list[torch.FloatTensor]: One tensor per input sentence, in order.
        """
        result = []
        for s in sent:
            s = self.onlyKorean(s)
            tSent = self.tokenizeSentence(s)
            vec = self.vectorizeWord(tSent)
            result.append(torch.FloatTensor(vec))
        return result

    def vocabSize(self):
        """Return the number of entries in the Word2Vec vocabulary."""
        # NOTE(review): `wv.vocab` is the gensim 3.x attribute, consistent
        # with the keyword names used in trainWord2Vec.
        return len(self.word2vec.wv.vocab)

In [11]:
# The embedding model is kept separate from the classifier.
class SentenceClassifier(nn.Module):
    """LSTM sentence classifier on top of externally produced word vectors.

    Each sentence is turned into a sequence of word vectors by an embedder
    object (anything exposing ``vectorizeSentence(list_of_sentences)``), run
    through a single-layer LSTM, and the LSTM output at the last time step is
    projected to ``label_size`` log-probabilities.

    Args:
        embedding_dim: Dimension of each word vector fed to the LSTM.
        hidden_dim: LSTM hidden-state size.
        vocabsize: Accepted for interface compatibility; not used, because
            word vectors come from the embedder rather than an nn.Embedding.
        label_size: Number of output classes.
        embedder: Optional object providing ``vectorizeSentence``. When
            omitted, ``forward`` falls back to the module-level ``embedding``
            variable, which must exist by the time ``forward`` is called.
    """

    def __init__(self, embedding_dim, hidden_dim, vocabsize, label_size, embedder=None):
        super(SentenceClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedder = embedder
        # Was stored as `self.lsm` while forward() read `self.lstm`.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero (h_0, c_0) pair for the LSTM."""
        # Before we've done anything, we don't have any hidden state.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim).
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, x):
        """Classify a list of raw sentences.

        Args:
            x: List of sentences (str); passed to the embedder's
                ``vectorizeSentence``.

        Returns:
            torch.Tensor of shape (len(x), label_size) holding
            log-probabilities (log_softmax over the label axis), which is the
            input format ``nn.NLLLoss`` expects.
        """
        # Resolve the embedder at call time so the notebook-global fallback
        # still works when no embedder was passed to the constructor.
        embedder = self.embedder if self.embedder is not None else embedding
        sentence_vectors = embedder.vectorizeSentence(x)

        scores = []
        for vectors in sentence_vectors:
            # Sentences are independent: start each one from a zero state so
            # no state (or autograd graph) leaks from the previous sentence
            # or from a previous forward() call.
            self.hidden = self.init_hidden()
            # nn.LSTM expects (seq_len, batch, input_size); batch is 1 here.
            lstm_in = vectors.view(vectors.size(0), 1, -1)
            lstm_out, self.hidden = self.lstm(lstm_in, self.hidden)
            # Use the output at the final time step as the sentence summary.
            scores.append(self.hidden2label(lstm_out[-1]))

        y = torch.cat(scores, dim=0)
        return F.log_softmax(y, dim=1)

In [12]:
# Load the trained Embedding FIRST: the model construction below calls
# embedding.vocabSize(), so `embedding` must exist before that line runs.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# an Embedding.pkl that you produced yourself or otherwise trust.
with open('Embedding.pkl', 'rb') as f:
    embedding = pickle.load(f)

EMBEDDING_DIM = 50  # must match `size = 50` in Embedding.trainWord2Vec
HIDDEN_DIM = 30
LABEL_SIZE = 6

model = SentenceClassifier(EMBEDDING_DIM, HIDDEN_DIM, embedding.vocabSize(), LABEL_SIZE)

loss_function = nn.NLLLoss()
# `optim` was never imported in this notebook; reach it through the already
# imported `torch` package instead.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)