In [1]:
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soynlp.hangle import jamo_levenshtein

In [2]:
import re
import codecs
import pandas as pd

In [2]:
from kor2vec import Kor2Vec

In [4]:
import pickle

In [5]:
class Vocab:
    """Builds a tokenizer, corpus file, vocabulary file and Kor2Vec embedding from a question dataset.

    The pipeline (see ``setEverything``) reads the ``question`` column of an
    Excel workbook, strips everything except spaces and Hangul, trains a
    soynlp ``MaxScoreTokenizer`` on the result, writes the tokenized corpus
    and the vocabulary to disk, and finally trains and saves a ``Kor2Vec``
    model on the corpus file.

    Attributes set by the pipeline:
        question:  cleaned questions (pandas Series), set by ``readDataset``.
        tokenizer: trained tokenizer, set by ``setTokenizer``.
        kor2vec:   trained embedding model, set by ``setKor2Vec``.
    """

    # Matches every run of characters that is NOT a space, a jamo in the
    # ㄱ-ㅣ range or a syllable in the 가-힣 range. Compiled once for the class
    # instead of once per sentence.
    _NON_KOREAN = re.compile('[^ ㄱ-ㅣ가-힣]+')

    def __init__(self, fileName, corpusFileName = "./korean_embedding/train_data.corpus", 
                 logFileName = "./korean_embedding/training_log/kor2vec_log", vocabFileName = "./korean_embedding/vocab.txt",
                 tokenizerFileName = "./korean_embedding/tokenizer.pkl", kor2vecFileName = "./korean_embedding/embedding.model"):
        """Store the input and output paths; no I/O happens here.

        Args:
            fileName: path of the Excel workbook WITHOUT the ``.xlsx`` suffix
                (``readDataset`` appends it).
            corpusFileName: where ``makeCorpusFile`` writes the tokenized corpus.
            logFileName: log path handed to ``Kor2Vec.train``.
            vocabFileName: where ``makeVocabFile`` writes the vocabulary.
            tokenizerFileName: where ``setTokenizer`` pickles the tokenizer.
            kor2vecFileName: where ``setKor2Vec`` saves the embedding model.
        """
        self.fileName = fileName
        self.corpusFileName = corpusFileName
        self.logFileName = logFileName
        self.vocabFileName = vocabFileName
        self.tokenizerFileName = tokenizerFileName
        self.kor2vecFileName = kor2vecFileName

    def setEverything(self):
        """Run the full pipeline and persist the tokenizer and the Kor2Vec model.

        Order matters: the tokenizer needs the cleaned questions, the corpus
        and vocabulary files need the tokenizer, and Kor2Vec trains on the
        corpus file.
        """
        self.readDataset()
        self.setTokenizer()
        self.makeCorpusFile()
        self.makeVocabFile()
        self.setKor2Vec()

    def onlyKorean(self, sentence):
        """Return ``sentence`` with everything except spaces and Hangul removed.

        Punctuation, digits, Latin letters and any other character outside the
        ``_NON_KOREAN`` whitelist are deleted; spaces are kept as they are.
        """
        return self._NON_KOREAN.sub('', sentence)

    def readDataset(self):
        """Load the ``question`` column of ``<fileName>.xlsx`` and clean every entry.

        The result is stored in ``self.question`` as a pandas Series with the
        same index as the column that was read.
        """
        raw = pd.read_excel(self.fileName + '.xlsx')['question']
        print(' read question data from ', self.fileName)
        # Empty cells are read as NaN (a float) and numeric cells as numbers;
        # handing either to the regex raises TypeError. Normalize to str first
        # so such rows become (possibly empty) strings instead of crashing.
        self.question = raw.fillna('').astype(str).map(self.onlyKorean)

        print('delete punctuation marks from data')

    def calWordScores(self):
        """Train a soynlp ``WordExtractor`` on the questions and return its word scores.

        Returns:
            The mapping produced by ``WordExtractor.extract()``; its values
            expose ``cohesion_forward``, which ``trainTokenizer`` consumes.
        """
        word_extractor = WordExtractor(
            max_left_length=20, 
            max_right_length=20, 
            min_frequency = 20,
            min_cohesion_forward = 0.05,
            min_right_branching_entropy = 0.0
        )
        word_extractor.train(self.question)
        word_scores = word_extractor.extract()
        print('extract and calculate ', len(word_scores), ' words')
        return word_scores

    def setTokenizer(self):
        """Build the tokenizer from the word scores and pickle it to ``tokenizerFileName``."""
        print(' set Tokenizer')
        word_scores = self.calWordScores()
        self.tokenizer = self.trainTokenizer(word_scores)
        with open(self.tokenizerFileName, 'wb') as f:
            pickle.dump(self.tokenizer, f)
        print('Tokenizer saved in ',self.tokenizerFileName)

    def trainTokenizer(self, word_scores):
        """Create a ``MaxScoreTokenizer`` scored by each word's forward cohesion.

        Args:
            word_scores: mapping of word -> score object with a
                ``cohesion_forward`` attribute (as returned by ``calWordScores``).

        Returns:
            The constructed ``MaxScoreTokenizer``.
        """
        cohesion_scores = {word:score.cohesion_forward for word, score in word_scores.items()}
        # LTokenizer(scores = cohesion_scores) was the alternative tried here.
        tokenizer = MaxScoreTokenizer(scores = cohesion_scores)
        print('train tokenizer')
        return tokenizer

    def _tokenizedQuestions(self):
        """Yield the token list of every question, in dataset order."""
        for q in self.question:
            yield self.tokenizer.tokenize(q)

    def makeCorpusFile(self):
        """Write one space-joined, tokenized question per line to ``corpusFileName``.

        The file is UTF-8 encoded and every line ends with ``\\r\\n``.
        """
        print(' make corpus file')
        sentences = [" ".join(words) for words in self._tokenizedQuestions()]
        # ``with`` guarantees the handle is closed even if a write fails.
        with codecs.open(self.corpusFileName, 'w', encoding='utf8') as f:
            for s in sentences:
                f.write(s + "\r\n")
        print('corpus file saved in ', self.corpusFileName)

    def makeVocabFile(self):
        """Write every distinct token, one per line in first-seen order, to ``vocabFileName``."""
        print(' make vocab file')
        vocab = []
        seen = set()  # O(1) membership test; ``vocab`` keeps first-seen order
        for words in self._tokenizedQuestions():
            for w in words:
                if w not in seen:
                    seen.add(w)
                    vocab.append(w)

        # Explicit UTF-8: the platform default (e.g. a Windows code page) may
        # be unable to encode Hangul and would not match the corpus file.
        with open(self.vocabFileName, 'w', encoding='utf8') as f:
            for v in vocab:
                f.write(v + "\n")
        print('vocab file saved in ', self.vocabFileName)

    def setKor2Vec(self, embed_size=128, batch_size=128):
        """Train a ``Kor2Vec`` model on the corpus file and save it to ``kor2vecFileName``.

        Args:
            embed_size: embedding dimension passed to ``Kor2Vec`` (default 128).
            batch_size: batch size passed to ``Kor2Vec.train`` (default 128).
        """
        self.kor2vec = Kor2Vec(embed_size=embed_size)
        self.kor2vec.train(self.corpusFileName, self.logFileName, batch_size=batch_size)
        self.kor2vec.save(self.kor2vecFileName)  # persist the trained embedding
        print('Kor2Vec saved in ', self.kor2vecFileName)

In [6]:
# Question workbook path, given without the ".xlsx" suffix because
# Vocab.readDataset appends it.
DATASET_PATH = "./dataset/2019_01_06_10차_RAN"

# Run the whole pipeline: tokenizer, corpus file, vocab file and the
# Kor2Vec model (embedding dim = 128).
vocab = Vocab(DATASET_PATH)
vocab.setEverything()

 read question data from  ./dataset/2019_01_06_10차_RAN
delete punctuation marks from data
 set Tokenizer
training was done. used memory 0.145 Gbry 0.139 Gb
all cohesion probabilities was computed. # words = 2043
all branching entropies was computed # words = 4634
all accessor variety was computed # words = 4634
extract and calculate  1031  words
train tokenizer
Tokenizer saved in  ./korean_embedding/tokenizer.pkl
 make corpus file
corpus file saved in  ./korean_embedding/train_data.corpus
 make vocab file
vocab file saved in  ./korean_embedding/vocab.txt
Reading Corpus lines


Spliting Lines: 100%|███████████████████████████████████████████████████████████| 5424/5424 [00:00<00:00, 94314.59it/s]
Corpus Sampling: 100%|███████████████████████████████████████████████████████████| 5424/5424 [00:01<00:00, 5128.53it/s]


Training kor2vec
Loading Word_sample corpus
Loading corpus finished
CUDA Available/count: False 0
training on  cpu


EP 0: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:25<00:00,  4.28it/s]


{'epoch': 0, 'train_ep_loss': 1.640703519641376}


EP 1: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:27<00:00,  4.51it/s]


{'epoch': 1, 'train_ep_loss': 1.1105290965955765}


EP 2: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:21<00:00,  4.50it/s]


{'epoch': 2, 'train_ep_loss': 1.046385027932339}


EP 3: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:16<00:00,  4.35it/s]


{'epoch': 3, 'train_ep_loss': 1.0149116371498734}


EP 4: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:14<00:00,  4.62it/s]


{'epoch': 4, 'train_ep_loss': 0.9907120352885762}


EP 5: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:15<00:00,  4.49it/s]


{'epoch': 5, 'train_ep_loss': 0.9692070892599762}


EP 6: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:14<00:00,  4.38it/s]


{'epoch': 6, 'train_ep_loss': 0.9514710465415579}


EP 7: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:14<00:00,  4.66it/s]


{'epoch': 7, 'train_ep_loss': 0.9296866516597936}


EP 8: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:15<00:00,  4.65it/s]


{'epoch': 8, 'train_ep_loss': 0.9136798186380355}


EP 9: 100%|██████████████████████████████████████████████████████████████████████████| 305/305 [01:13<00:00,  4.42it/s]


{'epoch': 9, 'train_ep_loss': 0.898882993322904}


In [13]:
# Embed one sample question with the model trained by vocab.setEverything().
sample_question = "김동호 교수님 수업 어때?"
test = vocab.kor2vec.embedding(sample_question)

# Batch variant, kept for reference:
#input = vocab.kor2vec.to_seqs(["김동호 교수님 수업 어때?", "컴퓨터보안"], seq_len=6)
#vocab.kor2vec.forward(input)

torch.Size([4, 128])

In [4]:
# Reload the embedding model saved by Vocab.setKor2Vec.
# NOTE(review): in this environment (torch 0.4.1, see the version cell below)
# the call raises "AttributeError: Can't get attribute '_rebuild_parameter'
# on <module 'torch._utils'>". Presumably the model file was written by a
# newer PyTorch than the one installed here - confirm, then save and load
# with the same PyTorch version.
kv = Kor2Vec.load("./korean_embedding/embedding.model")

AttributeError: Can't get attribute '_rebuild_parameter' on <module 'torch._utils' from 'C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\torch\\_utils.py'>

In [5]:
import torch
# Show the installed PyTorch version; useful when diagnosing the
# Kor2Vec.load failure in the previous cell.
print(torch.__version__)

0.4.1
