# Korean word embedding

한국어 데이터셋을 읽어 word score 계산, 토크나이징, word2vec 모델 학습을 수행하고, 단어에 대한 벡터를 반환하는 클래스를 구현합니다.

## 준비
아래의 코드를 실행하려면 다음 3가지 패키지를 pip으로 설치해야 합니다. (이 외에 코드에서 import하는 pandas도 필요합니다.)

    pip install soynlp
    pip install gensim
    pip install numpy==1.13  # numpy를 1.13 버전으로 다운그레이드해야 합니다

In [1]:
import pandas as pd

from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec

# pip install soynlp
# pip install gensim
# pip install numpy==1.13, should downgrade numpy

# https://github.com/lovit/soynlp/
# https://lovit.github.io/nlp/2018/04/09/three_tokenizers_soynlp/
# https://ratsgo.github.io/natural%20language%20processing/2017/03/08/word2vec/
# https://radimrehurek.com/gensim/models/word2vec.html

# 띄어쓰기 오류의 해결 (맞춤법 교정 툴 사용)
# 학습되지 않은 단어에 대한 encoding

## EmbedQAData

EmbedQAData의 생성자는 열고자 하는 데이터셋의 파일 이름(확장자 제외)을 매개변수로 받으며, 여기에 '.xlsx'를 붙여 엑셀 파일을 연 뒤 'question' 열을 읽어옵니다.

생성자 실행 시, 읽어온 데이터셋을 바탕으로 word score 계산 -> tokenizer 설정 -> word2vec 모델 학습 및 저장이 순서대로 이루어집니다.

In [3]:
class EmbedQAData:
    """Train a word2vec model on the ``question`` column of an Excel dataset.

    Constructing an instance runs the whole pipeline: read
    ``<fileName>.xlsx``, compute soynlp word scores, build a tokenizer from
    the forward cohesion scores, then train a gensim Word2Vec model and save
    it to ``<fileName>_word2vec.model``.

    Attributes set during construction:
        open_fileName: dataset file name without the ``.xlsx`` extension.
        save_fileName: base name of the saved model
            (``open_fileName + '_word2vec'``).
        question: pandas Series of question strings (missing cells removed).
        word_scores: result of ``WordExtractor.extract()``.
        tokenizer: ``MaxScoreTokenizer`` built from the cohesion scores.
        tQuestion: tokenized questions, one token list per question.
        word2vec: the trained ``Word2Vec`` model.
    """

    def __init__(self, fileName):
        """Load ``<fileName>.xlsx`` and run the full training pipeline.

        Args:
            fileName: path of the dataset without the ``.xlsx`` extension.
                The workbook must contain a ``question`` column.
        """
        self.open_fileName = fileName
        self.save_fileName = self.open_fileName + '_word2vec'

        raw_questions = pd.read_excel(self.open_fileName + '.xlsx')['question']
        # Blank or numeric Excel cells come back as NaN/float/int; the word
        # extractor and the tokenizer both need plain strings.
        self.question = self._clean_questions(raw_questions)
        print(' read question data from ', self.open_fileName)

        self.setWordScores()
        self.setTokenizer()
        self.saveWord2Vec()

    @staticmethod
    def _clean_questions(questions):
        """Return ``questions`` as a Series of ``str`` without missing values.

        Args:
            questions: pandas Series (or any iterable) of raw cell values.

        Returns:
            A pandas Series with None/NaN entries dropped, every remaining
            value converted with ``str``, and a fresh 0..n-1 index.
        """
        cleaned = pd.Series(questions).dropna().astype(str)
        return cleaned.reset_index(drop=True)

    def setWordScores(self):
        """Train a soynlp ``WordExtractor`` on the questions.

        Stores the extracted scores in ``self.word_scores``.
        """
        word_extractor = WordExtractor(
            max_left_length=20,
            max_right_length=20,
            min_frequency=20,
            min_cohesion_forward=0.05,
            min_right_branching_entropy=0.0
        )

        word_extractor.train(self.question)
        self.word_scores = word_extractor.extract()
        print(' extract and calculate ', len(self.word_scores), ' words in ', self.open_fileName)

    def setTokenizer(self):
        """Build ``self.tokenizer`` from the forward cohesion of each word."""
        cohesion_scores = {
            word: score.cohesion_forward
            for word, score in self.word_scores.items()
        }
        # NOTE(review): the keyword is `score` here, while the LTokenizer
        # alternative previously noted at this spot used `scores`; confirm the
        # keyword name against the installed soynlp version.
        self.tokenizer = MaxScoreTokenizer(score=cohesion_scores)
        print(' set tokenizer')

    def tokenizeSentence(self, sent):
        """Tokenize every sentence in ``sent`` with ``self.tokenizer``.

        Args:
            sent: iterable of sentences.

        Returns:
            A list with one ``self.tokenizer.tokenize(...)`` result per
            sentence, in input order.
        """
        return [self.tokenizer.tokenize(s) for s in sent]

    def saveWord2Vec(self):
        """Tokenize the questions, train Word2Vec and save the model.

        The model is written to ``self.save_fileName + '.model'``.
        """
        self.tQuestion = self.tokenizeSentence(self.question)
        # `size` and `iter` are the gensim < 4.0 keyword names (renamed to
        # `vector_size` and `epochs` in gensim 4.x); they are kept because
        # this notebook targets the legacy environment.
        self.word2vec = Word2Vec(
            self.tQuestion,
            size=100,
            window=2,
            min_count=30,
            iter=100,
            sg=1
        )

        self.word2vec.save(self.save_fileName + '.model')
        print(' train word2vec model and save model in ', self.save_fileName)

    def vectorizeWord(self, words):
        """Return ``self.word2vec.wv[words]`` for a word or a list of words.

        No fallback is implemented for words the model did not learn;
        presumably gensim raises ``KeyError`` for them — verify against the
        installed version.
        """
        return self.word2vec.wv[words]

## EmbedQAData 객체 생성

현재, '2018_11_10.xlsx' 데이터셋을 사용하도록 되어있습니다. 다른 데이터셋을 사용하고자 하실 경우 아래의 셀에서 파일명을 수정하여 주시기 바랍니다. 데이터셋과 본 코드는 같은 디렉토리에 있어야 합니다.

In [4]:
# Run the whole pipeline on '2018_11_10.xlsx' (the constructor appends the
# '.xlsx' extension): word scores -> tokenizer -> word2vec training and save.
embed = EmbedQAData('2018_11_10')

 read question data from  2018_11_10
training was done. used memory 0.139 Gbry 0.136 Gb
all cohesion probabilities was computed. # words = 1592
all branching entropies was computed # words = 4190
all accessor variety was computed # words = 4190
 extract and calculate  611  words in  2018_11_10
 set tokenizer
 train word2vec model and save model in  2018_11_10_word2vec


## EmbedQAData 내의 word2vec 모델 활용

embed.word2vec.wv.most_similar(word, topn = n) 함수는 학습된 모델의 어휘(데이터셋에서 min_count 이상 등장한 단어) 중, word와 가장 유사한 n개의 단어와 그 유사도를 반환합니다.

In [7]:
# Print up to 45 (word, similarity) pairs for the words closest to '운영체제'.
print(embed.word2vec.wv.most_similar('운영체제', topn=45))

[('확률및랜덤프로세스', 0.978531002998352), ('형식언어', 0.9780620336532593), ('공개sw프로젝트', 0.9758005738258362), ('ICT와소프트웨어', 0.9753122329711914), ('이산구조', 0.9732568860054016), ('종합설계', 0.9726034998893738), ('창의적공학설계', 0.9697827100753784), ('정보통신시스템시뮬레이션', 0.7893610596656799), ('sw비지니스와창업', 0.7745636701583862), ('네트워크보안', 0.7722445130348206), ('웹프로그래밍', 0.7705057859420776), ('자료구조와실습', 0.7702228426933289), ('시스템소프트웨어실습', 0.7693858742713928), ('비쥬얼프로그래밍', 0.7686390280723572), ('신호와시스템', 0.7679079174995422), ('무선통신및실험', 0.767259955406189), ('통신이론및실험', 0.766333281993866), ('인공지능', 0.7657416462898254), ('테크니컬프리젠테이션', 0.7636289000511169), ('웹플랫폼콘텐츠개발', 0.7617589235305786), ('증강혼합현실', 0.7609738111495972), ('안드로이드앱프로그래밍', 0.7601668834686279), ('OSS프로그래밍개발방법론', 0.7592142820358276), ('인터넷네트워킹', 0.7589564919471741), ('게임및로봇지능', 0.7549275159835815), ('초고속통신망', 0.7547352910041809), ('인간컴퓨터상호작용시스템', 0.7530588507652283), ('임베디드소프트웨어입문', 0.7521194219589233), ('가상현실', 0.752109944820404), ('인터넷프로그래밍', 0.75177353620

## EmbedQAData를 통한 단어 vectorize


embed.vectorizeWord(word) 함수는 word에 대응하는 벡터를 반환하며, 이는 문자열 분류 딥러닝 모델의 입력으로 사용될 수 있습니다.

In [8]:
# vectorizeWord indexes word2vec.wv with the given list, so a one-element
# list prints a 2-D array containing a single row for that word.
temp = ['김동호']

print(embed.vectorizeWord(temp))

[[ 0.14703494  0.07003424 -0.07532252  0.2090735   0.23807326 -0.39866677
   0.12116498  0.26162276  0.16038509  0.11868048 -0.09329566  0.04649542
   0.25153664  0.48589748 -0.12524371  0.51430911 -0.14833455  0.60317266
   0.16239899 -0.41450891  0.1486879  -0.05780003  0.0558086  -0.20388298
  -0.28821549 -0.43740484  0.55810088 -0.27375475 -0.38436699  0.46118206
  -0.02095545  0.22825168  0.08149022 -0.3376433  -0.46305653 -0.47918421
   0.14156145  0.21566027 -0.00966598 -0.57200462  0.05379961  0.23373818
   0.35211366 -0.49744651  0.08418851  0.2943913  -0.76034296 -0.10517261
   0.28527269  0.06699755 -0.5476442   0.07141912  0.17617524 -0.33091044
   0.23739529  0.01365994 -0.08067644  0.53982013 -0.10116026 -0.03791698
   0.11387751  0.08742219 -0.51501119 -0.22153682 -0.37704751 -0.23892644
  -0.05746894 -0.16092631  0.11809233  0.35424164  0.03903409  0.22147048
   0.13047653 -0.37080708  0.09219976 -0.10861404 -0.53251064  0.14750987
   0.08352289  0.09339508  0.16494358 