In [1]:
from gensim.models import Word2Vec
import pandas as pd
import pickle
import time
import logging
import multiprocessing as mp
import os
# Route INFO-level log records (gensim reports its training progress through
# the logging module) to the notebook output, prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Directory that holds the pickled tag corpora.
corpus_path = 'corpus/'
# Number of CPUs reported by the OS; passed to Word2Vec as its worker count.
cores = mp.cpu_count()

# name_corpus, name_model and params_tag are parallel lists that share the
# order: attraction, hotel, restaurant.
_categories = ('attraction', 'hotel', 'restaurant')
name_corpus = [category + '_tag.list' for category in _categories]
name_model = ['model/' + category + '_tag.model' for category in _categories]

# Word2Vec keyword arguments that are identical for every category.
# NOTE(review): window=99999 presumably makes the context cover the whole
# sentence -- confirm that this is the intent.
_shared_params = {'size': 300, 'window': 99999, 'min_count': 0,
                  'workers': cores, 'iter': 100, 'sg': 1}
# Only the `sample` value differs per category (attraction, hotel, restaurant).
_sample_rates = (1e-1, 1e-2, 1e-4)
params_tag = [dict(_shared_params, sample=rate) for rate in _sample_rates]

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    SECURITY: pickle.load can execute arbitrary code embedded in the file.
    Only use this on corpus files produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def _stringify_corpus_file(path):
    """Convert every id in the pickled corpus at `path` to str, in place.

    The file is rewritten with the converted corpus and the converted corpus
    is returned, so the caller does not have to read it back from disk.
    """
    corpus = [[str(pid) for pid in line] for line in _load_pickle(path)]
    _dump_pickle(corpus, path)
    return corpus


def _load_or_build_corpus(name):
    """Load the merged corpus `corpus_path + name`, building it if missing.

    When the merged file does not exist, the 'kor_' and 'eng_' variants of
    `name` are converted to string ids (both files are rewritten), their
    concatenation (Korean first, then English) is written to the merged path
    and returned.

    Raises:
        FileNotFoundError: if the merged file is missing and one of the
            'kor_' / 'eng_' source files is missing as well.
    """
    merged_path = corpus_path + name
    try:
        return _load_pickle(merged_path)
    except FileNotFoundError:
        # No cached merge yet; fall through and build it outside the handler
        # so a failure below is not reported as a chained exception.
        pass

    # int to string
    kor = _stringify_corpus_file(corpus_path + 'kor_' + name)
    eng = _stringify_corpus_file(corpus_path + 'eng_' + name)
    # Merge the Korean and English corpora.
    merged = kor + eng
    _dump_pickle(merged, merged_path)
    return merged


# One merged corpus per entry of name_corpus, in the same order.
corpora = [_load_or_build_corpus(name) for name in name_corpus]

In [3]:
# Sanity check on the first corpus: number of sentences, longest sentence,
# and total number of tokens.
corpus = corpora[0]
print('corpus 길이', len(corpus))
sentence_lengths = [len(sent) for sent in corpus]
# DataFrame.sum() yields a one-element Series, so `total` is printed together
# with its index label and dtype.
total = pd.DataFrame(sentence_lengths).sum()
# Longest sentence; 0 when the corpus is empty.
length = max(sentence_lengths, default=0)
print('corpus내 최대 길이', length)
print('corpus내 모든 장소 합', total)

corpus 길이 65862
corpus내 최대 길이 33995
corpus내 모든 장소 합 0    2256328
dtype: int64


In [None]:
# Train a Word2Vec model for each selected corpus and save its word vectors.
#
# `spent` collects one timing line per trained model. It has to be created
# here: with the initialisation commented out, a fresh kernel raised NameError
# only after the (very long) training run, and before the vectors were saved.
spent = []

# Subset: only index 1 (the second entry of name_corpus / name_model).
# To train every corpus, use range(len(corpora)) instead.
train_indices = range(1, 2)

for i in train_indices:
    # Create the output directory up front so that a missing directory cannot
    # make the save fail after training has already finished.
    model_dir = os.path.dirname(name_model[i])
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # NOTE(review): gensim's documentation states that the standard cython
    # routines truncate texts longer than 10000 words; if the corpora contain
    # longer sentences, part of each is ignored -- confirm this is acceptable.
    start = time.time()
    model = Word2Vec(corpora[i], **params_tag[i])
    elapsed = time.time() - start

    # Persist the vectors before any bookkeeping so they are never lost.
    model.wv.save(name_model[i])
    spent.append('Elapsed time: '+str(elapsed)+' sec'+' ['+name_model[i]+']')
print(spent)

2018-09-11 02:48:51,534 : INFO : collecting all words and their counts
2018-09-11 02:48:51,535 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-11 02:48:51,574 : INFO : PROGRESS: at sentence #10000, processed 261355 words, keeping 532 word types
2018-09-11 02:48:51,707 : INFO : PROGRESS: at sentence #20000, processed 1303452 words, keeping 972 word types
2018-09-11 02:48:51,716 : INFO : PROGRESS: at sentence #30000, processed 1356950 words, keeping 973 word types
2018-09-11 02:48:51,722 : INFO : PROGRESS: at sentence #40000, processed 1375840 words, keeping 975 word types
2018-09-11 02:48:51,725 : INFO : collected 980 word types from a corpus of 1383273 raw words and 45889 sentences
2018-09-11 02:48:51,726 : INFO : Loading a fresh vocabulary
2018-09-11 02:48:51,729 : INFO : effective_min_count=0 retains 980 unique words (100% of original 980, drops 0)
2018-09-11 02:48:51,730 : INFO : effective_min_count=0 leaves 1383273 word corpus (100% of original 13

2018-09-11 03:02:01,804 : INFO : EPOCH 1 - PROGRESS: at 26.44% examples, 874 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:04,732 : INFO : EPOCH 1 - PROGRESS: at 26.47% examples, 884 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:13,566 : INFO : EPOCH 1 - PROGRESS: at 26.52% examples, 884 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:15,372 : INFO : EPOCH 1 - PROGRESS: at 26.54% examples, 894 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:30,018 : INFO : EPOCH 1 - PROGRESS: at 26.55% examples, 890 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:32,493 : INFO : EPOCH 1 - PROGRESS: at 26.62% examples, 899 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:38,438 : INFO : EPOCH 1 - PROGRESS: at 26.66% examples, 905 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:50,162 : INFO : EPOCH 1 - PROGRESS: at 26.72% examples, 904 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:02:54,071 : INFO : EPOCH 1 - PROGRESS: at 26.77% examples, 911 words/s, in_qsize 7, out_qsize 0
2018-09-11

2018-09-11 03:07:54,744 : INFO : EPOCH 2 - PROGRESS: at 25.79% examples, 1961 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:09:11,397 : INFO : EPOCH 2 - PROGRESS: at 25.80% examples, 1325 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:09:13,831 : INFO : EPOCH 2 - PROGRESS: at 25.80% examples, 1356 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:09:18,327 : INFO : EPOCH 2 - PROGRESS: at 25.80% examples, 1372 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:09:36,288 : INFO : EPOCH 2 - PROGRESS: at 25.81% examples, 1307 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:10:07,635 : INFO : EPOCH 2 - PROGRESS: at 25.81% examples, 1178 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:10:45,996 : INFO : EPOCH 2 - PROGRESS: at 25.81% examples, 1061 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:11:10,420 : INFO : EPOCH 2 - PROGRESS: at 25.81% examples, 1012 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:11:27,808 : INFO : EPOCH 2 - PROGRESS: at 25.82% examples, 984 words/s, in_qsize 7, out_qsize 0
20

2018-09-11 03:21:39,618 : INFO : EPOCH 2 - PROGRESS: at 29.02% examples, 1063 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:21:44,445 : INFO : EPOCH 2 - PROGRESS: at 29.16% examples, 1068 words/s, in_qsize 8, out_qsize 0
2018-09-11 03:21:47,413 : INFO : EPOCH 2 - PROGRESS: at 29.51% examples, 1085 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:21:52,337 : INFO : EPOCH 2 - PROGRESS: at 29.74% examples, 1090 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:21:53,560 : INFO : EPOCH 2 - PROGRESS: at 29.82% examples, 1098 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:21:59,545 : INFO : EPOCH 2 - PROGRESS: at 30.51% examples, 1121 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:22:01,445 : INFO : EPOCH 2 - PROGRESS: at 31.22% examples, 1148 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:22:03,927 : INFO : EPOCH 2 - PROGRESS: at 31.65% examples, 1155 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:22:05,908 : INFO : EPOCH 2 - PROGRESS: at 33.00% examples, 1183 words/s, in_qsize 7, out_qsize 0
2

2018-09-11 03:34:07,133 : INFO : EPOCH 3 - PROGRESS: at 26.25% examples, 867 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:10,896 : INFO : EPOCH 3 - PROGRESS: at 26.28% examples, 877 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:14,854 : INFO : EPOCH 3 - PROGRESS: at 26.32% examples, 885 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:25,281 : INFO : EPOCH 3 - PROGRESS: at 26.34% examples, 883 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:43,807 : INFO : EPOCH 3 - PROGRESS: at 26.37% examples, 874 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:48,299 : INFO : EPOCH 3 - PROGRESS: at 26.40% examples, 879 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:34:56,264 : INFO : EPOCH 3 - PROGRESS: at 26.42% examples, 882 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:35:03,810 : INFO : EPOCH 3 - PROGRESS: at 26.42% examples, 886 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:35:20,132 : INFO : EPOCH 3 - PROGRESS: at 26.44% examples, 880 words/s, in_qsize 7, out_qsize 0
2018-09-11

2018-09-11 03:40:05,553 : INFO : EPOCH 4 - PROGRESS: at 0.87% examples, 1751 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:08,251 : INFO : EPOCH 4 - PROGRESS: at 1.03% examples, 1818 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:09,703 : INFO : EPOCH 4 - PROGRESS: at 1.28% examples, 1908 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:12,360 : INFO : EPOCH 4 - PROGRESS: at 1.72% examples, 2090 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:13,650 : INFO : EPOCH 4 - PROGRESS: at 2.58% examples, 2299 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:14,750 : INFO : EPOCH 4 - PROGRESS: at 19.75% examples, 3103 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:26,754 : INFO : EPOCH 4 - PROGRESS: at 25.75% examples, 2788 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:30,593 : INFO : EPOCH 4 - PROGRESS: at 25.76% examples, 2705 words/s, in_qsize 7, out_qsize 0
2018-09-11 03:40:44,994 : INFO : EPOCH 4 - PROGRESS: at 25.79% examples, 2442 words/s, in_qsize 7, out_qsize 0
2018-0