In [2]:
import gensim # version 4
# conda install -c anaconda gensim
# pip install gensim==4.0.1



# Step 1: Prepare the corpus for training

In [22]:
# Step 1. Train a word2vec model with gensim on the given data

# Data loading for training -- corpus preparation
class TextIterator(object):
    """Re-iterable view over a UTF-8 text corpus, one tokenised line at a time.

    Every call to ``__iter__`` reopens the file, so the same instance can be
    passed both to ``build_vocab`` and to ``train`` (each of which makes its
    own pass over the data).

    Parameters
    ----------
    fname : str
        Path of the corpus file; each line is treated as one sentence.
    """

    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens.

        Blank lines yield an empty list, exactly as ``str.split()`` returns.
        """
        # The context manager guarantees the handle is closed when the pass
        # finishes (or the generator is closed early), instead of leaking one
        # open file per pass over the corpus.
        with open(self.fname, encoding='utf-8') as corpus_file:
            for line in corpus_file:
                yield line.split()

# Corpus file; the relative path is resolved against the current working directory.
filename = 'newskor.txt'
# Re-iterable stream of tokenised lines; the file is only opened when iterated.
sentences = TextIterator(filename)

# Step 2, 3: Train & load the Word2Vec model

In [23]:
# Hyperparams
train = True # train flag (True: train model / False: load trained model)
SIZE = 300 # vector size (passed as `vector_size`)
WINDOW = 5 # context window: 5 words before + target + 5 words after
SG = 1 # 1 == skip-gram / 0 == cbow
MIN_COUNT = 10 # ignores all words appearing fewer than min_count times, so rare words are not trained
WORKERS = 20 # number of worker threads (passed as `workers`); sized for the CPU cores available

In [25]:
# Obtain `model`: load the saved one, or train from scratch when `train` is set.
if not train:
    # Reuse the model persisted by an earlier training run.
    model = gensim.models.Word2Vec.load('newskor.model')
else:
    # Create the model without a corpus so that vocabulary building and
    # training remain two explicit steps.
    model = gensim.models.Word2Vec(
        sg=SG,
        vector_size=SIZE,
        window=WINDOW,
        min_count=MIN_COUNT,
        workers=WORKERS,
    )
    # First pass over the corpus: collect the vocabulary.
    model.build_vocab(sentences)
    # Second pass: one training epoch over the same sentences.
    model.train(sentences, epochs=1, total_examples=model.corpus_count)
    # Persist the result so later runs can take the load branch.
    model.save('newskor.model')

In [26]:
# Vocabulary tokens, addressable by integer index.
vocab = model.wv.index_to_key
# Preview the first 31 entries (indices 0 through 30).
for i, v in enumerate(vocab[:31]):
    print("{}: {}".format(i, v))

0: 하
1: 이
2: .
3: 는
4: 을
5: ㄴ
6: 다
7: 의
8: 에
9: 를
10: 은
11: 어
12: 있
13: 고
14: 으로
15: 가
16: 였
17: ㄹ
18: 되
19: ,
20: 에서
21: 었
22: )
23: (
24: 로
25: 것
26: 도
27: 등
28: 과
29: 들
30: 지


In [27]:
## Inspect the learned embedding of one sample word
word = '버스'
embedding = model.wv[word]  # single lookup, reused for both prints below
print(embedding)
print('size of vector: ', len(embedding))

[-0.06196966  0.2443025  -0.00139966  0.02323417 -0.28431296 -0.18615228
 -0.03106055  0.09346455 -0.03404297 -0.02171713 -0.08730045 -0.03931028
 -0.2926308   0.07602247  0.07774882  0.12796213  0.00265447 -0.12028006
  0.10675357 -0.09822863  0.02753089 -0.27141872 -0.14659564 -0.00650539
 -0.07444834 -0.08077253 -0.13549362  0.11622631 -0.11447094  0.05133944
 -0.05963147 -0.04824286  0.19740534  0.09139168 -0.12114628 -0.02173519
  0.06208365 -0.18713741 -0.1348079   0.08764952 -0.05121287 -0.05168116
  0.24995574  0.18817267  0.12913667  0.13066295 -0.25786987  0.20005894
  0.42409542  0.18157567  0.09399304  0.31929448 -0.01467193  0.08827894
  0.2192231   0.35404733 -0.07174036  0.12566915 -0.05472526 -0.02516148
 -0.08312676 -0.0346135  -0.13047582  0.02981478 -0.06950638  0.20200075
  0.16968367  0.03333197 -0.08733575  0.19859171 -0.07983916  0.26091725
 -0.17213994 -0.30112422  0.19590712  0.11933541  0.12719485 -0.16143684
 -0.03746552 -0.05472356 -0.00304054 -0.15337983  0

# Step 4: Get word similarity

In [28]:
# Example inputs: word1 = '한국', word2 = '북한'
print ("Caculate the similarity between word 1 and word2")
# Strip surrounding whitespace: corpus tokens never contain it, so a stray
# space would otherwise make a valid word look out-of-vocabulary.
word1 = input("word1: ").strip()
word2 = input("word2: ").strip()

# Check that both words are in the vocabulary before querying the model.
# `key_to_index` is a dict, so each membership test is a hash lookup instead
# of a linear scan over the `index_to_key` list.
known_words = model.wv.key_to_index
no_problem = True

for candidate in (word1, word2):
    if candidate not in known_words:
        print ('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    # Cosine similarity between the two word vectors.
    similarity = model.wv.similarity(word1, word2)
    print ('the similarity between ' + word1 + ' and ' + word2 + ' : ', similarity)

Caculate the similarity between word 1 and word2
word1: 컴퓨터
word2: 치약
the similarity between 컴퓨터 and 치약 :  0.5219859


# Step 5: Find the mismatched word

In [32]:
# Example input: 소프트웨어 네트워크 프로그램 가방
print("Find mismatched word in the words")
text = input("text(words): ")
words = text.split()  # split() already discards leading/trailing whitespace

# Check that the words are in the vocabulary before querying the model.
# `key_to_index` is a dict, so each membership test is a hash lookup instead
# of a linear scan over the `index_to_key` list.
known_words = model.wv.key_to_index
no_problem = True

if not words:
    # doesnt_match() raises on an empty list, so stop here with a message.
    print('no words were entered')
    no_problem = False

# Report every unknown word (not just the first) so the user can fix the
# whole input in one go. A dedicated loop name avoids overwriting the
# notebook-level `word` variable.
for candidate in words:
    if candidate not in known_words:
        print('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    mismatched = model.wv.doesnt_match(words)
    print ('the mismatch word between ' + text +' is', mismatched)

Find mismatched word in the words
text(words): 한국 북한 김밥
the mismatch word between 한국 북한 김밥 is 김밥


# Step 6: Find the top-N most similar words

In [35]:
print("Print the most similar words")
# Strip surrounding whitespace: corpus tokens never contain it, so a stray
# space would otherwise make a valid word look out-of-vocabulary.
word = input("word: ").strip()

# `key_to_index` is a dict, so the membership test is a hash lookup instead
# of a linear scan over the `index_to_key` list.
no_problem = True

if word not in model.wv.key_to_index:
    print ('the word ' + word + ' is not in the vocabulary')
    no_problem = False

if no_problem:
    # Uses most_similar's default `topn`; prints a list of (word, similarity) pairs.
    print(model.wv.most_similar(positive=[word]))

Print the most similar words
word: 음식
[('옷', 0.8326328992843628), ('맵', 0.8317863345146179), ('빵', 0.8258019089698792), ('맛', 0.8165563344955444), ('아이들', 0.8155353665351868), ('화장실', 0.8116585612297058), ('습관', 0.8098694682121277), ('과일', 0.8042347431182861), ('사물', 0.8012840151786804), ('맛있', 0.7991383075714111)]


# Step 7: Vector calculation

In [37]:
# Example inputs: a = '한국', b = '아시아', c = '유럽'
print('Find the most similar word with the result of [ a - b + c ]')
# Strip surrounding whitespace: corpus tokens never contain it, so a stray
# space would otherwise make a valid word look out-of-vocabulary.
word_a = input("a: ").strip()
word_b = input("b: ").strip()
word_c = input("c: ").strip()

# Check that all three words are in the vocabulary before querying the model.
# `key_to_index` is a dict, so each membership test is a hash lookup instead
# of a linear scan over the `index_to_key` list.
known_words = model.wv.key_to_index
no_problem = True

for candidate in (word_a, word_b, word_c):
    if candidate not in known_words:
        print ('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    # a - b + c: `positive` vectors are added, `negative` ones subtracted.
    mostsimilar = model.wv.most_similar(positive=[word_a, word_c], negative=[word_b], topn=5)
    # Show the three best matches; slicing (rather than fixed indices) avoids
    # an IndexError if fewer than three neighbours come back.
    top_words = [neighbour for neighbour, _score in mostsimilar[:3]]
    print ('most similar word of ' + word_a + ' - ' + word_b + ' + ' + word_c + ' is', *top_words)

Find the most similar word with the result of [ a - b + c ]
a: 서울
b: 한국
c: 미국
most similar word of 서울 - 한국 + 미국 is 종로구 삼성동 세종로
