-
Notifications
You must be signed in to change notification settings - Fork 0
/
MainController.py
57 lines (47 loc) · 1.88 KB
/
MainController.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#/Users/inosphe/anaconda/bin/python
# -*- coding: utf-8 -*-
from DBConnector.db_tools import Pool, QueryPool
from base_config import WORD2VEC_LINE_TEXT, WORD2VEC_MODEL, PIPE_DUMPING
from TextProcedure.TextParser import law_contents_to_file
from MachineLearner import Trainer
import gensim
import os
# Module-level query pool shared by the functions in this file for db access.
# It is constructed at import time from a default-configured Pool().
query_pool = QueryPool(Pool())
# Disabled alternative pool built with host_type_local=False.
# NOTE(review): presumably targets a non-local (web server) database - confirm
# before re-enabling.
#webserver_query_pool = QueryPool(Pool(host_type_local=False))
def train_word2vec(delete=False):
    """Prepare the word2vec line-text corpus, then load or train the model.

    Args:
        delete: When true, an existing line-text file and an existing model
            file are removed and both are rebuilt. When false, files that
            already exist are reused.

    Returns:
        The model loaded from WORD2VEC_MODEL when that file exists and
        ``delete`` is false; otherwise the return value of
        ``Trainer.train_and_save_vector``.
    """
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    corpus_ready = os.path.exists(WORD2VEC_LINE_TEXT)
    if corpus_ready and delete:
        os.remove(WORD2VEC_LINE_TEXT)
        # The stale corpus is gone, so it has to be regenerated below;
        # otherwise training would be pointed at a file that no longer exists.
        corpus_ready = False

    if corpus_ready:
        print('line_file_exists')
    else:
        # Load the articles from the db.
        print('Load Article Contents.....')
        contents = query_pool.get_unparsed_content()
        # Run the text processing and write the word2vec line file.
        print('Proceed text filtering and mk word2vec_line_text')
        law_contents_to_file(contents, WORD2VEC_LINE_TEXT)

    if os.path.exists(WORD2VEC_MODEL):
        if not delete:
            # Reuse the model already on disk instead of retraining.
            return gensim.models.Word2Vec.load_word2vec_format(
                WORD2VEC_MODEL, binary=True)
        os.remove(WORD2VEC_MODEL)

    # No reusable model: train on the line file.
    # NOTE(review): presumably the trainer also writes the result to
    # WORD2VEC_MODEL - confirm in Trainer.train_and_save_vector.
    return Trainer.train_and_save_vector(WORD2VEC_LINE_TEXT, WORD2VEC_MODEL)
def task_clustering(word2vec=None, n_clusters=3):
    """Cluster the article-log tasks and attach the resulting labels in the db.

    Args:
        word2vec: Word-vector model handed to the clustering step. When None,
            it is loaded from WORD2VEC_MODEL in binary word2vec format.
        n_clusters: Cluster count forwarded to
            ``Trainer.decompose_and_cluster``. Defaults to 3, the value that
            used to be hard-coded here.
    """
    # Fetch the ids and their tasks through the shared query pool.
    _ids, tasks = query_pool.get_article_logs()

    # Load the word2vec model only when the caller did not supply one.
    if word2vec is None:
        word2vec = gensim.models.Word2Vec.load_word2vec_format(
            WORD2VEC_MODEL, binary=True)

    # Create and dump the pipe.
    # NOTE(review): the original comment here said "percentage is the share
    # one cluster takes up when all tasks are clustered", but no percentage
    # value appears in this function - likely a stale note; confirm.
    print('Start Clustering')
    # The returned pipe is not used here; only the labels are written back.
    _pipe, labels = Trainer.decompose_and_cluster(
        tasks, word2vec, PIPE_DUMPING, n_clusters=n_clusters)

    # Update the task rows in the db with their cluster labels.
    query_pool.attach_task_label(_ids=_ids, labels=labels)
def main():
    """Run the whole pipeline: get the word2vec model, then cluster tasks."""
    # The model returned by train_word2vec (called with delete=False) is fed
    # straight into the clustering step.
    task_clustering(word2vec=train_word2vec(delete=False))
# Run the pipeline only when this file is executed directly as a script.
if __name__ == '__main__':
    main()