In [1]:
from gensim.models import word2vec
import logging
import pandas as pd
import numpy as np
import scipy as sp
from pprint import pprint
from datetime import datetime
import math
import heapq 
import re
import os
import sys
from spherecluster import SphericalKMeans
from sklearn.cluster import KMeans

In [8]:
# 用iter读数据，确保内存不会炸
class MyCorpus(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for line in open(self.filename):
            # assume there's one document per line, tokens separated by whitespace
            yield (line.lower().split())

            
# 计算词组与文本的tf-idf，输入phrase和所有文本，返回和每个文本的tf-idf
def tf_idf(phrase,docs):
    phrase_exist = []
    for doc in docs:
        phrase = phrase.lower().strip()
        doc = doc.lower().strip()
        tf = doc.count(phrase)/(1+len(doc.split()))
        phrase_exist.append(tf)
    exist_num = len(phrase_exist)-phrase_exist.count(0)
#     print ('-'*30)
#     print ('Exist num: %s'%exist_num)
    idf = math.log(len(docs)/(1+exist_num))
#     print ('Idf: %s'%idf)
    return np.array([list(map(lambda i: i*idf, phrase_exist))])

# 替换原文本中所有话题为a_b
def replace_all(text, phrases):
    count = 0
    for phrase in phrases:
        if count % 100 == 0:
            print (count, datetime.now())
        text = text.replace(phrase, phrase.replace(' ','_'))
        count += 1
    return text

# 训练，更新话题向量
def local_embedding(phrases, docs):
    path = 'dataset/%s/%s/temp/docs.txt'%(dataset,level)
    if not os.path.isfile(path):
        print ('Docs not exists, need to generate docs first.')
        with open(path, 'w') as f_write:
            for doc in docs:
                for sentence in re.split(r'[.?!]', doc):
                    sentence_new = re.sub(",!@#$%^&*()", " ",sentence)  
                    f_write.write(sentence_new.strip()+'\n')
        f_write.close()
        
#     sys.exit()
    # 训练词向量
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # 加载语料
    model_path = 'dataset/%s/%s/model'%(dataset,level)
    sentences = MyCorpus(path)
    # 训练skip-gram模型; 设定维度为100，窗口5，词最小出现次数3，4核并行
    if not os.path.isfile(model_path):
        print ('Train model')
        model = word2vec.Word2Vec(sentences, size=size, window=5, min_count=2, workers=4)
        model.save(model_path)
    else:
        print ('Load model')
        model = word2vec.Word2Vec.load(model_path)
    # 将话题集合中的每一个话题求phrase embedding
    phrases_vector = {}
    total_count = 0
    none_zero_count = 0
    zero_count = 0
    for phrase in phrases:
        phrase = phrase.replace(' ','_')
        vector = np.zeros((size,), dtype=float)
        total_count += 1
        try:
            vector += model.wv[phrase]
            none_zero_count += 1
            if phrase not in phrases_vector:
                phrases_vector[phrase] = vector
        # 对于词典中不存在的词，直接取0
        except Exception as e:
            zero_count += 1
    return phrases_vector
    
def spherical_kmeans(phrases_vector, k):
    phrase_list = []
    vector_list = []
    for phrase in phrases_vector:
        vec = phrases_vector[phrase]
        phrase_list.append(phrase)
        vector_list.append(vec)
    X = np.array(vector_list)
    
    # 假设k=10，用spherical k-means进行聚类
    # Number of clusters
    skm = SphericalKMeans(n_clusters=k)
    # Fitting the input data
    kmeans = skm.fit(X)
    # Getting the cluster labels
    labels = skm.labels_
    # Centroid values
    centroids = skm.cluster_centers_
    
    # 获得每个话题的聚类结果
    phrases_labels = pd.DataFrame(columns=['label','phrase'])
    for i in range(len(phrases_vector)):
        phrases_labels = phrases_labels.append({'label': labels[i], 'phrase': phrase_list[i]}, ignore_index=True)
    
    print ('Write phrases_clustering File......')
    phrases_labels.to_csv('dataset/%s/%s/%s/temp/phrases_clustering.csv'%(dataset,model_name,level),index=False)
    print ('Completed')
    return phrases_labels
    
def kmeans(phrases_vector, k):
    phrase_list = []
    vector_list = []
    for phrase in phrases_vector:
        vec = phrases_vector[phrase]
        phrase_list.append(phrase)
        vector_list.append(vec)
    X = np.array(vector_list)
    # 假设k=10，用k-means进行聚类
    kmeans = KMeans(n_clusters=k)
    # Fitting the input data
    kmeans = kmeans.fit(X)
    # Getting the cluster labels
    labels = kmeans.predict(X)
    # Centroid values
    centroids = kmeans.cluster_centers_
    
    # 获得每个话题的聚类结果
    phrases_labels = pd.DataFrame(columns=['label','phrase'])
    for i in range(len(phrases_vector)):
        phrases_labels = phrases_labels.append({'label': labels[i], 'phrase': phrase_list[i]}, ignore_index=True)
    
    print ('Write phrases_clustering File......')
    phrases_labels.to_csv('dataset/%s/%s/%s/temp/phrases_clustering.csv'%(dataset,model_name,level),index=False)
    print ('Completed')
    return phrases_labels
    
    
def extract_docs_by_phrases(phrases_labesl, docs, k):
    model_path = 'dataset/%s/%s/model'%(dataset,level)
    model = word2vec.Word2Vec.load(model_path)
    top_tfidf = 1000
    # 利用tf-idf找到与phrase最相关的documents集合，并完成对于每个label下所有documents集合
    docs_cluster = []
#     for label in range(1):
    for label in range(k):
        print ('Label: %s'%label)
        phrases = phrases_labesl.loc[phrases_labesl['label']==label]['phrase'].tolist()

        # 计算每个词组对文档的tf-idf
#         for i in range(5):
        for i in range(len(phrases)):
            phrase = phrases[i].replace(' ','_')
            result = tf_idf(phrase,docs)
            if i == 0 or i == len(phrases)-1:
                print (phrase)
                print (result.shape, np.sum(result, axis=1))
            if i == 0:
                tf_idf_distribution = result
            else:
                tf_idf_distribution = np.insert(tf_idf_distribution, tf_idf_distribution.shape[0], values=result, axis=0)
        print ('Num of phrases: %s'%tf_idf_distribution.shape[0])
#         print ('Dim of vector: %s'%tf_idf_distribution.shape[1]) 
        '''
        # 取出所有词中tfidf top1000的documents下标
        docs_index = []
        for i in range(tf_idf_distribution.shape[0]):
            a = tf_idf_distribution[i]
            top_index = heapq.nlargest(min(top_tfidf,len(tf_idf_distribution[i])), range(len(a)), a.take)
            if top_index[min(top_tfidf,len(top_index)-1)] <= 0:
                # 取出所有tfidf不为0的document
                top_tfidf = np.count_nonzero(a)
                top_index = heapq.nlargest(top_tfidf, range(len(a)), a.take)
            docs_index = list(set(docs_index).union((set(top_index))))
        print ('Num of docs: %s'%len(docs_index))
        
        # 生成第i个聚类下的di
        docs_i = ''
        for i in range(len(docs_index)):
            docs_i += docs[docs_index[i]]
            docs_i += '$$$'
        docs_i = docs_i.strip()
        docs_cluster.append((label,docs_i))
        print ('-'*20)
        '''
        
        print ('计算平均词向量%s'%datetime.now())
        # 计算第k类话题的平均词向量
        phrases_embedding = np.zeros((0,size,), dtype=np.float32)
        for phrase in phrases:
            phrases_embedding = np.insert(phrases_embedding,phrases_embedding.shape[0],values=model.wv[phrase],axis=0)
        mean_phrase_embedding = np.mean(phrases_embedding,axis=0)
#         print (phrases,mean_phrase_embedding[:3])
        
        print ('计算文档向量%s'%datetime.now())
        # 计算文档向量，利用tf_idf加权的term embedding,并计算和该平均词向量相近的top1000个文本向量对应的文本
        phrase_doc_cos_dist = np.array([])
        for i in range(tf_idf_distribution.shape[1]):
            doc = docs[i]
            doc_embedding = np.zeros((size,), dtype=float)
            for j in range(tf_idf_distribution.shape[0]):
                phrase = phrases[j]
                phrase_embedding = model.wv[phrase]
                doc_embedding += phrase_embedding*tf_idf_distribution[j,i]
            if np.sum(tf_idf_distribution,axis=0)[i] != 0:
                doc_embedding = doc_embedding/np.sum(tf_idf_distribution,axis=0)[i]
                dist = 1-sp.spatial.distance.cosine(mean_phrase_embedding, doc_embedding)
            else:
                doc_embedding = doc_embedding/1
                dist = 0
            phrase_doc_cos_dist = np.insert(phrase_doc_cos_dist, len(phrase_doc_cos_dist), values=dist, axis=0)
        
        docs_index = heapq.nlargest(min(top_tfidf,np.count_nonzero(phrase_doc_cos_dist)), range(len(phrase_doc_cos_dist)), phrase_doc_cos_dist.take)
#         for index in docs_index:
#             print (index,phrase_doc_cos_dist[index])
        if len(docs_index) != 0:
            index = docs_index[-1]
            print (index,phrase_doc_cos_dist[index])
        else:
            print ('No Documents.')
#         print (phrase_doc_cos_dist[docs_index[-1]])
    
        print ('生成第i个聚类下的di%s'%datetime.now())
        # 生成第i个聚类下的di
        docs_i = ''
        for i in range(len(docs_index)):
            docs_i += docs[docs_index[i]]
            docs_i += '$$$'
        docs_i = docs_i.strip()
        docs_cluster.append((label,docs_i))
        print ('-'*20)
        
    print ('Write documents_clustering File......')
    # 将list转成df然后存下来
    df_labels = ['label','docs']
    df_dc = pd.DataFrame.from_records(docs_cluster,columns=df_labels)
    df_dc.to_csv('dataset/%s/%s/%s/temp/documents_clustering_1000.csv'%(dataset,model_name,level),index=False)
    print ('Completed')
    return df_dc.iloc[:,1]
    
    
def adaptive_spherical_kmeans(phrases, docs, threshold=0.5, k=10):
    general_topics = pd.DataFrame(columns=['label','phrase','rep'])
    sub_topics = pd.DataFrame(columns=['label','phrase','rep'])
    k = docs.shape[0]
    for i in range(k):
        print ('Label: %s'%i)
        phrases_i = phrases.loc[phrases['label']==i]['phrase']
        for j in range(phrases_i.shape[0]):
            phrase = phrases_i.iloc[j]
            rep = representativeness(phrase,i,docs)
            phrase_rep = {'label': i,'phrase': phrase,'rep': rep}
            if j == 0 or j == phrases_i.shape[0]-1:
                print (phrase_rep)
            if rep < threshold:
                general_topics = general_topics.append(phrase_rep, ignore_index=True)
            else:
                sub_topics = sub_topics.append(phrase_rep, ignore_index=True)
        print ('='*30)
    return general_topics, sub_topics

In [3]:
# 计算某个词出现在docs中的次数
def tf(docs_k, term=None):
    if term == None:
        return len(docs_k.split(' '))
    else:
#         print ('出现次数: %s'%docs_k.count(term))
        return docs_k.count(term)

# BM25相似
def rel(term, k, docs):
    k1 = 1
    b = 0.75
    docs_all = ''
    for doc in docs:
        docs_all += doc.strip()
    idf = math.log((len(docs_all.split(' '))-docs_all.count(term)+0.5)/(docs_all.count(term)+0.5))
    R = (docs[k].strip().count(term)*(k1+1))/\
        (docs[k].strip().count(term)+k1*(1-b+b*len(docs[k].strip().split(' ')))/(len(docs_all.split(' '))/len(docs)))
#     print ('doc_label:%s\nidf:%s\nR:%s'%(k,idf,R))
#     print ('='*20)
    return idf*R
    
# 计算某个词在docs集合中的popularity，即在docs_k中大量出现，越小表示越general
def popularity(term, k, docs):
    pop = math.log(tf(docs[k], term)+1)/math.log(tf(docs[k])+1)
#     print ('popularity: %s'%pop)
    return pop
  
# 计算某个词在docs集合中的concentration，即只在这个docs_k中出现而很少出现在docs_k-，越小表示越general
def concentration(term, k, docs):
    den = 1.0
    for i in range(len(docs)):
        den += math.exp(rel(term, i, docs))
    con = math.exp(rel(term, k, docs)/den)
#     print ('concentration: %s'%con)
    return con

# 计算某个词在docs集合中的representativeness，如果越高表示越属于这个cluster，反之为general的词                  
def representativeness(term, k, docs):
    rep = math.sqrt(popularity(term, k, docs)*concentration(term, k, docs))
    return rep

In [4]:
def topic_tree_generator(phrases, docs, threshold=0.5, k=10, phrases_labels=None, docs_labels=None):
    
    # 输入话题集合，文本集合；
    # 输出更新的话题向量
    start_time = datetime.now()
    print ('Start local embedding process.\nStart time: %s'%start_time)
    phrases_vector = local_embedding(phrases, docs)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入更新的话题向量
    # 输出贴有label的聚完k类的话题集合
    start_time = datetime.now()
    print ('Start spherical kmeans process.\nStart time: %s'%start_time)
    phrases_labels = spherical_kmeans(phrases_vector, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入k类话题集合和全体文本集合
    # 输出贴有label的聚完k类的文本集合
    start_time = datetime.now()
    print ('Start extract docs by phrases process.\nStart time: %s'%start_time)
    docs_labels = extract_docs_by_phrases(phrases_labels, docs, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    # sys.exit()
    
    # 输入k类话题集合和k类文本集合
    # 输出常见话题和k类子话题集合
    start_time = datetime.now()
    print ('Start adaptive spherical k means process.\nStart time: %s'%start_time)
    general_topics, sub_topics = adaptive_spherical_kmeans(phrases_labels, docs_labels, threshold, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))

    # 返回常见话题和k类子话题集合
    return general_topics, sub_topics


def topic_tree_generator_kmeans(phrases, docs, threshold=0.5, k=10, phrases_labels=None, docs_labels=None):
    
    # 输入话题集合，文本集合；
    # 输出更新的话题向量
    start_time = datetime.now()
    print ('Start local embedding process.\nStart time: %s'%start_time)
    phrases_vector = local_embedding(phrases, docs)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入更新的话题向量
    # 输出贴有label的聚完k类的话题集合
    start_time = datetime.now()
    print ('Start kmeans process.\nStart time: %s'%start_time)
    phrases_labels = kmeans(phrases_vector, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入k类话题集合和全体文本集合
    # 输出贴有label的聚完k类的文本集合
    start_time = datetime.now()
    print ('Start extract docs by phrases process.\nStart time: %s'%start_time)
    docs_labels = extract_docs_by_phrases(phrases_labels, docs, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    # sys.exit()
    
    # 输入k类话题集合和k类文本集合
    # 输出常见话题和k类子话题集合
    start_time = datetime.now()
    print ('Start adaptive spherical k means process.\nStart time: %s'%start_time)
    general_topics, sub_topics = adaptive_spherical_kmeans(phrases_labels, docs_labels, threshold, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))

    # 返回常见话题和k类子话题集合
    return general_topics, sub_topics


def topic_tree_generator_noac(phrases, docs, threshold=0.5, k=10, phrases_labels=None, docs_labels=None):
    # 不会啊...怎么分层啊。。。
    # 那就不分层，只做分类
    
    # 输入话题集合，文本集合；
    # 输出更新的话题向量
    start_time = datetime.now()
    print ('Start local embedding process.\nStart time: %s'%start_time)
    phrases_vector = local_embedding(phrases, docs)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入更新的话题向量
    # 输出贴有label的聚完k类的话题集合
    start_time = datetime.now()
    print ('Start spherical kmeans process.\nStart time: %s'%start_time)
    phrases_labels = spherical_kmeans(phrases_vector, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    
    # 输入k类话题集合和全体文本集合
    # 输出贴有label的聚完k类的文本集合
    start_time = datetime.now()
    print ('Start extract docs by phrases process.\nStart time: %s'%start_time)
    docs_labels = extract_docs_by_phrases(phrases_labels, docs, k)
    end_time = datetime.now()
    print ('Time end: %s'%(end_time))
    print ('Time costed: %s'%(end_time-start_time))
    # sys.exit()
    
    return phrases_labels,phrases_labels

In [531]:
# ieee\ vision\ dblp
dataset = 'ieee'
# top keywords
top = 1000
# word2vec size 
size = 128
# num of clustering
k = 5
# top tf-idf documents
top_tfidf = 1000
# threshold of representitiveness
threshold = 0.4
# choice of model TTG/TTG_k/NoAC/NoLE
model_name = 'TTG'
# level
level = '_0_0_k6'

In [560]:
# a = pd.read_csv('/Users/wh/Desktop/ieee_remote_k6/TTG/_0/sub_topics.csv')
# a = pd.read_csv('dataset/ieee/TTG/_0_1_k6/sub_topics.csv')
a = pd.read_csv('dataset/ieee/TTG/_0_0_2_k6/sub_topics.csv')
# a.loc[a['label']==5].sort_values(['label','rep'])
a

Unnamed: 0,label,phrase,rep
0,0,road_traffic,0.524361
1,0,visual_analytics,0.528118
2,0,video_surveillance,0.53169
3,0,risk_assessment,0.541453
4,0,smart_buildings,0.552684
5,0,cyber_physical_systems,0.555233
6,0,wearable_devices,0.555233
7,0,requirements_engineering,0.562366
8,0,intrusion_detection,0.564589
9,0,software_architecture,0.57086


In [545]:
for label_i in range(0,5):
    # level
    model_name = 'TTG'
    level = '_0_0_%s_k6'%label_i
    print (level)
#     phrases_path = '/Users/wh/Desktop/ieee_remote_k6/TTG/_0/sub_topics.csv'
    phrases_path = 'dataset/ieee/TTG_k/_0_0_k6/sub_topics.csv'
    phrases_file = pd.read_csv(phrases_path)
    phrases = phrases_file.loc[phrases_file['label']==label_i]['phrase'].tolist()

    #     docs_path = '/Users/wh/Desktop/ieee_remote_k6/TTG/_0/temp/documents_clustering_1000.csv'
    docs_path = 'dataset/ieee/TTG_k/_0_0_k6/temp/documents_clustering_1000.csv'
    docs_file = pd.read_csv(docs_path)
    docs_str = docs_file.loc[docs_file['label']==label_i].iloc[0,1]
    docs = replace_all(docs_str.lower(), phrases).split('$$$')
    pd.DataFrame(docs, columns=["docs"]).to_csv('dataset/%s/%s/docs_after_replace.csv'%(dataset,level),index=False)

    start_time = datetime.now()

    model_choice = {
        'TTG':topic_tree_generator,
        'NoAC':topic_tree_generator_noac,
        'NoLE':topic_tree_generator,
        'TTG_k':topic_tree_generator_kmeans
    }

    general_topics, sub_topics = model_choice[model_name](phrases, docs, threshold, k)
    end_time = datetime.now()

    print ('Time costed for %s model: %s'%(model_name,end_time-start_time))

    general_topics.to_csv('dataset/%s/%s/%s/general_topics.csv'%(dataset,model_name,level),index=False)
    sub_topics.to_csv('dataset/%s/%s/%s/sub_topics.csv'%(dataset,model_name,level),index=False)

_0_0_0_k6
0 2018-05-23 21:43:16.457142
Start local embedding process.
Start time: 2018-05-23 21:43:16.466885
Docs not exists, need to generate docs first.
Train model
Time end: 2018-05-23 21:43:16.809360
Time costed: 0:00:00.342475
Start spherical kmeans process.
Start time: 2018-05-23 21:43:16.809521
Write phrases_clustering File......
Completed
Time end: 2018-05-23 21:43:16.878856
Time costed: 0:00:00.069335
Start extract docs by phrases process.
Start time: 2018-05-23 21:43:16.878885
Label: 0
special_session
(1, 301) [0.26736199]
Num of phrases: 1
计算平均词向量2018-05-23 21:43:16.919648
计算文档向量2018-05-23 21:43:16.919739
294 1.000000015421169
生成第i个聚类下的di2018-05-23 21:43:16.934952
--------------------
Label: 1
visually_impaired
(1, 301) [0.12832971]
cyber_physical
(1, 301) [0.92089647]
Num of phrases: 29
计算平均词向量2018-05-23 21:43:17.051108
计算文档向量2018-05-23 21:43:17.052107
279 0.9944869426137765
生成第i个聚类下的di2018-05-23 21:43:17.141942
--------------------
Label: 2
keynote_speech
(1, 301) [0.14431

smart_home
(1, 1001) [3.40652492]
Num of phrases: 1
计算平均词向量2018-05-23 21:43:33.569739
计算文档向量2018-05-23 21:43:33.569841
999 1.0000000201184331
生成第i个聚类下的di2018-05-23 21:43:33.630899
--------------------
Label: 2
big_data
(1, 1001) [5.36102823]
Num of phrases: 1
计算平均词向量2018-05-23 21:43:33.653166
计算文档向量2018-05-23 21:43:33.653527
912 1.0000000155240822
生成第i个聚类下的di2018-05-23 21:43:33.721134
--------------------
Label: 3
smart_city
(1, 1001) [4.80256074]
Num of phrases: 1
计算平均词向量2018-05-23 21:43:33.736573
计算文档向量2018-05-23 21:43:33.736672
647 1.0000000129133375
生成第i个聚类下的di2018-05-23 21:43:33.809042
--------------------
Label: 4
smart_cities
(1, 1001) [4.01063444]
Num of phrases: 1
计算平均词向量2018-05-23 21:43:33.825538
计算文档向量2018-05-23 21:43:33.825683
905 0.9999999933645225
生成第i个聚类下的di2018-05-23 21:43:33.924951
--------------------
Write documents_clustering File......
Completed
Time end: 2018-05-23 21:43:33.967013
Time costed: 0:00:00.583754
Start adaptive spherical k means process.
Start time: 20

In [309]:
''' 测试ieee的sub_topics
phrases_data = pd.read_csv('dataset/%s/%s/sub_topics.csv'%(dataset,level),sep=',')
phrases = phrases_data.loc[phrases_data['label']==1]['phrase'].tolist()
docs_str = pd.read_csv('dataset/%s/%s/temp/documents_clustering.csv'%(dataset,level),sep=',').iloc[1,1]
docs = replace_all(docs_str.lower(), phrases).split('$$$')
'''

# 测试ieee根节点TOP5000的topics
start_time = datetime.now()
# phrases = pd.read_csv('dataset/%s/%s/nps_autophrase.csv'%(dataset,level),sep='\t').iloc[:top,1].tolist()
if model_name == 'NoLE':
    docs_path = 'dataset/%s/_0/docs_after_replace.csv'%(dataset)
else:
    docs_path = 'dataset/%s/%s/docs_after_replace.csv'%(dataset,level)
if os.path.isfile(docs_path):
    docs = pd.read_csv(docs_path)['docs'].tolist()
else:
    docs_data = pd.read_csv('dataset/%s/%s/papers.csv'%(dataset,level),sep='\t')
    docs_str = ''
    for i in range(docs_data.shape[0]): 
        if dataset == 'dblp':
            doc_str = str(docs_data.iloc[i,0])+'. $$$ '
            docs_str += doc_str
        else:
            doc_str = str(docs_data.iloc[i,1])+'. ### '+str(docs_data.iloc[i,2])+' $$$ '
            docs_str += doc_str
    print ('Start replace phrases in documents.')
    docs = replace_all(docs_str.lower(), phrases).split('$$$')
    pd.DataFrame(docs, columns=["docs"]).to_csv('dataset/%s/%s/docs_after_replace.csv'%(dataset,level),index=False)
    end_time = datetime.now()

    print ('Time costed for collection: %s'%(end_time-start_time))


In [408]:
start_time = datetime.now()

model_choice = {
    'TTG':topic_tree_generator,
    'NoAC':topic_tree_generator_noac,
    'NoLE':topic_tree_generator,
    'TTG_k':topic_tree_generator_kmeans
}

general_topics, sub_topics = model_choice[model_name](phrases, docs, threshold, k)
end_time = datetime.now()

print ('Time costed for %s model: %s'%(model_name,end_time-start_time))

Start local embedding process.
Start time: 2018-05-23 16:37:31.888615
Load model
Time end: 2018-05-23 16:37:34.914292
Time costed: 0:00:03.025677
Start spherical kmeans process.
Start time: 2018-05-23 16:37:34.914390


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [436]:
general_topics.to_csv('dataset/%s/%s/%s/general_topics.csv'%(dataset,model_name,level),index=False)
sub_topics.to_csv('dataset/%s/%s/%s/sub_topics.csv'%(dataset,model_name,level),index=False)

In [180]:
general_topics.sort_values(['label','rep'])

Unnamed: 0,label,phrase,rep
0,0,asynchronous_motor,0.387404
1,0,electromagnetic_interference,0.456226
2,2,modular_multilevel_converter,0.430138
3,3,inductive_power_transfer,0.404448
4,3,switched_capacitor,0.448969
11,4,phase_locked_loop,0.418646
8,4,equivalent_circuit,0.464958
9,4,wind_generator,0.464958
10,4,dual_active_bridge,0.471627
5,4,pfc_converter,0.471627


In [181]:
sub_topics.sort_values(['label','rep'])

Unnamed: 0,label,phrase,rep
4,0,synchronous_machines,0.500929
33,0,transient_stability,0.519250
5,0,solar_power,0.522404
19,0,transmission_line,0.528312
12,0,dc_microgrids,0.528312
7,0,control_strategies,0.531087
9,0,gate_driver,0.536320
3,0,bldc_motor,0.538794
29,0,circuit_breakers,0.541180
40,0,power_plants,0.541180


In [None]:
'''
start_time = datetime.now()

phrases_labels = pd.read_csv('dataset/dblp/temp/phrases_clustering.csv',sep=',')
docs_labels = pd.read_csv('dataset/dblp/temp/documents_clustering.csv',sep=',').iloc[:,1]

general_topics, sub_topics = topic_tree_generator(phrases, docs, 0.5, 5 , phrases_labels, docs_labels)
end_time = datetime.now()

print ('Time costed for TTG model: %s'%(end_time-start_time))

# phrases_labels = topic_tree_generator(phrases, docs, 0.4, 5)
'''

In [349]:
'''
for model_name in ['TTG','TTG_k','NoLE']:
    sub_topics = pd.read_csv('/Users/wh/Desktop/ieee_remote_k6/%s/_0/sub_topics'%model_name)
    general_topics = pd.read_csv('/Users/wh/Desktop/ieee_remote_k6/%s/_0/general_topics'%model_name)
    all_topics = pd.concat([sub_topics,general_topics])
    sub_topics_new = all_topics.loc[all_topics['rep']>=0.22].sort_values(['label','rep'])
    general_topics_new = all_topics.loc[all_topics['rep']<0.22].sort_values(['label','rep'])
    sub_topics_new.to_csv('/Users/wh/Desktop/ieee_remote_k6/%s/_0/sub_topics.csv'%model_name,index=False)
    general_topics_new.to_csv('/Users/wh/Desktop/ieee_remote_k6/%s/_0/general_topics.csv'%model_name,index=False)
'''