# Korean word embedding

한국어 데이터셋을 읽어 word score 계산, tokenizing, word2vec 모델 학습을 수행하고, 단어에 대한 벡터를 반환하는 클래스를 구현합니다.

## 준비
아래의 코드를 돌리기 위해서는 3가지의 pip install이 필요합니다

    pip install soynlp
    pip install gensim
    pip install numpy==1.13    # numpy must be downgraded to 1.13

In [None]:
import pandas as pd

from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec

# pip install soynlp
# pip install gensim
# pip install numpy==1.13  (numpy must be downgraded to 1.13)

# https://github.com/lovit/soynlp/
# https://lovit.github.io/nlp/2018/04/09/three_tokenizers_soynlp/
# https://ratsgo.github.io/natural%20language%20processing/2017/03/08/word2vec/
# https://radimrehurek.com/gensim/models/word2vec.html

# TODO: fix word-spacing errors in the data (use a spell-correction tool)
# TODO: decide how to encode words the model was not trained on

## EmbedQAData

EmbedQAData의 생성자에는 열고자 하는 데이터셋의 파일 이름이 확장자를 제외한 형태로 매개변수로 필요하며(예: '2018_11_10'), 생성자가 '.xlsx' 확장자를 붙여 엑셀 파일을 읽도록 되어있습니다.

생성자 실행 시, 읽어온 데이터셋을 바탕으로 word score 계산 -> tokenizer 설정 -> word2vec 모델 학습 및 저장이 이루어지게 됩니다

In [2]:
class EmbedQAData:
    """Train and query a word2vec embedding built from a question dataset.

    Constructing an instance reads the ``question`` column of
    ``<fileName>.xlsx``, extracts word scores with soynlp's ``WordExtractor``,
    builds an ``LTokenizer`` from the forward cohesion scores, trains a
    word2vec model on the tokenized questions and saves it to
    ``<fileName>_word2vec.model``.

    Attributes set along the way: ``question``, ``word_scores``,
    ``tokenizer``, ``tQuestion`` and ``word2vec``.
    """

    def __init__(self, fileName):
        """Load the dataset and run the whole training pipeline.

        Args:
            fileName: Path of the dataset without the ``.xlsx`` extension;
                the extension is appended here.
        """
        self.open_fileName = fileName
        self.save_fileName = self.open_fileName + '_word2vec'

        self.question = pd.read_excel(self.open_fileName + '.xlsx')['question']
        print(' read question data from ', self.open_fileName)

        self.setWordScores()
        self.setTokenizer()
        self.saveWord2Vec()

    def setWordScores(self):
        """Train a ``WordExtractor`` on the questions and keep its scores.

        The extracted scores are stored in ``self.word_scores``.
        """
        word_extractor = WordExtractor(
            max_left_length=20,
            max_right_length=20,
            min_frequency=20,
            min_cohesion_forward=0.05,
            min_right_branching_entropy=0.0
        )

        word_extractor.train(self.question)
        self.word_scores = word_extractor.extract()
        print(' extract and calculate ', len(self.word_scores), ' words in ', self.open_fileName)

    def setTokenizer(self):
        """Build ``self.tokenizer`` from the forward cohesion of each word."""
        cohesion_scores = {word: score.cohesion_forward for word, score in self.word_scores.items()}
        self.tokenizer = LTokenizer(scores=cohesion_scores)
        print(' set tokenizer')

    def tokenizeSentence(self, sent):
        """Tokenize every sentence in ``sent`` with ``self.tokenizer``.

        Args:
            sent: Iterable of sentences.

        Returns:
            A list with one tokenizer result per input sentence, in order.
        """
        return [self.tokenizer.tokenize(s) for s in sent]

    def saveWord2Vec(self):
        """Tokenize the questions, train word2vec on them and save the model.

        The tokenized corpus is kept in ``self.tQuestion`` and the trained
        model in ``self.word2vec``; the model is written to
        ``<save_fileName>.model``.
        """
        import inspect

        self.tQuestion = self.tokenizeSentence(self.question)

        # gensim 4.0 renamed ``size`` -> ``vector_size`` and ``iter`` ->
        # ``epochs``. Use whichever spelling the installed version accepts so
        # the same hyper-parameters work on both old and new releases.
        accepted = inspect.signature(Word2Vec.__init__).parameters
        if 'vector_size' in accepted:
            version_kwargs = {'vector_size': 100, 'epochs': 100}
        else:
            version_kwargs = {'size': 100, 'iter': 100}

        self.word2vec = Word2Vec(
            self.tQuestion,
            window=2,
            min_count=30,
            sg=1,
            **version_kwargs
        )

        self.word2vec.save(self.save_fileName + '.model')
        print(' train word2vec model and save model in ', self.save_fileName)

    def vectorizeWord(self, words):
        """Return the trained vector(s) for ``words``.

        Args:
            words: A word or a list of words, passed straight to
                ``self.word2vec.wv[...]``.

        Returns:
            Whatever ``self.word2vec.wv[words]`` yields.

        NOTE(review): words missing from the trained vocabulary are not
        handled here; the lookup error from gensim propagates to the caller.
        """
        return self.word2vec.wv[words]

## EmbedQAData 객체 생성

현재, '2018_11_10.xlsx' 데이터셋을 사용하도록 되어있습니다. 다른 데이터셋을 사용하고자 하실 경우 아래의 셀에서 파일명을 수정하여 주시기 바랍니다. 데이터셋과 본 코드는 같은 디렉토리에 있어야 합니다.

In [3]:
# Run the full pipeline on '2018_11_10.xlsx' (EmbedQAData appends the extension)
dataset_name = '2018_11_10'
embed = EmbedQAData(fileName=dataset_name)

 read question data from  2018_11_10
training was done. used memory 0.139 Gbry 0.135 Gb
all cohesion probabilities was computed. # words = 1592
all branching entropies was computed # words = 4190
all accessor variety was computed # words = 4190
 extract and calculate  611  words in  2018_11_10
 set tokenizer
 train word2vec model and save model in  2018_11_10_word2vec


## EmbedQAData 내의 word2vec 모델 활용

embed.word2vec.wv.most_similar(word, topn = n) 함수는 데이터셋 내에 포함된 단어 중, word와 가장 유사한 n개의 단어와 유사도를 반환합니다.

In [4]:
# Query the 45 nearest words to '김동호' and print them with their similarities
neighbours = embed.word2vec.wv.most_similar('김동호', topn=45)
print(neighbours)

[('엄기현', 0.9874568581581116), ('주태우', 0.9869372248649597), ('이창환', 0.98565673828125), ('임대운', 0.985440194606781), ('김현우', 0.9849758744239807), ('문봉교', 0.9849180579185486), ('홍정모', 0.984811544418335), ('손윤식', 0.9847736358642578), ('최은만', 0.9847699999809265), ('정영식', 0.9846036434173584), ('정대원', 0.9845662117004395), ('류철', 0.9843294620513916), ('최병석', 0.9842439293861389), ('양기주', 0.9841998815536499), ('김은정', 0.984149694442749), ('윤승현', 0.9841306805610657), ('박미화', 0.9836329817771912), ('임민중', 0.9833610653877258), ('김신우', 0.9833124876022339), ('이용규', 0.9827548265457153), ('신연순', 0.9825058579444885), ('한효준', 0.9824638962745667), ('서정열', 0.9824485778808594), ('정준호', 0.9818634986877441), ('조경은', 0.9818394184112549), ('김동환', 0.9815464019775391), ('송양의', 0.9815235137939453), ('최성연', 0.9814329147338867), ('서상현', 0.9813843965530396), ('주종화', 0.981324315071106), ('박상훈', 0.9803674221038818), ('박경원', 0.9801539182662964), ('장태무', 0.9799534678459167), ('성연식', 0.9797316789627075), ('이강만', 0.9794842600

## EmbedQAData를 통한 단어 vectorize


embed.vectorizeWord(word) 함수는 word에 대응하는 벡터를 반환하며, 이는 문자열 분류 딥러닝 모델의 입력으로 사용될 수 있습니다.

In [6]:
# Look up the embedding of one word, passed here as a one-element list
temp = ['이강우']
word_vectors = embed.vectorizeWord(temp)

print(word_vectors)

[[-0.43749407  0.19403827  0.01953617  0.46005103  0.10975784  0.54906934
  -0.30651376  0.02718642  0.08389693  0.22822829  0.4514327  -0.1546101
  -0.73814076  0.23028906  0.21702991 -0.21690753 -0.32788274  0.1296224
  -0.19887778 -0.41153488  0.38080642  0.19134855 -0.33456147 -0.21013761
  -0.10433828  0.51776755  0.23274334 -0.23867849  0.05514818 -0.02785624
  -0.56283003 -0.14178321 -0.1170719  -0.03393989  0.32663324 -0.10419425
   0.21866202 -0.28924525 -0.38299119 -0.02399184 -0.25657564 -0.04598802
   0.39985606  0.29622754 -0.27895808 -0.0392534  -0.43190688 -0.37441874
   0.15400191 -0.39620721 -0.79515046 -0.25475162 -0.38195917 -0.02682945
  -0.08283545 -0.0504769  -0.1838235   0.15670744  0.1259345   0.354864
  -0.89394176 -0.50966024 -0.17803793 -0.1674211   0.04039023 -0.08191461
   0.15071036 -0.08209065  0.02674407  0.00483787  0.05701429  0.49547362
  -0.19398405 -0.21337189  0.16053754  0.24728402  0.23066407 -0.1833156
   0.08786409  0.42928508 -0.01716709 -0.04