In [1]:
from urllib.request import urlopen
import json
import os
import pandas as pd
from gensim.models import LdaModel, LdaMulticore
from gensim.models import CoherenceModel
import jieba
import numpy as np

# Reading data back
def openJsonFile(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data
    # resp = json.loads(u.read().decode('utf-8'))
    
def FindAllFile(base):
    for root, ds, fs in os.walk(base):
        for f in fs:
            yield f
            
# 创建停用词列表
def stopwordslist():
    stopwords = [line.strip() for line in open('/home/featurize/data/stopwords.txt', 'r', encoding='UTF-8').readlines()]
    return stopwords


# 定义停词函数 对句子进行中文分词
def seg_depart(sentence):
    # 对文档中的每一行进行中文分词
    sentence_depart = jieba.cut(sentence.strip())
    # 创建一个停用词列表
    stopwords = stopwordslist()
    # 去停用词
    output = filter(lambda x: x not in stopwords and x != '\xa0' and not x.isnumeric() and len(x.strip())>1, sentence_depart)
    return output



In [2]:
database = "/home/featurize/data/resultJson/"

label_set = ['古代', '当代', '现代', '近代']
sample_set = dict(zip(label_set, [[], [], [], []]))
print(sample_set)

{'古代': [], '当代': [], '现代': [], '近代': []}


In [3]:
for filename in FindAllFile(database):
    label = filename.split("-")[1].split("(")[1][:-1]
    content = openJsonFile(database + filename)["人物简介"]["text"]
    sample_set[label].append(content)

In [4]:
# 分词测试
# seg = list(seg_depart(gudai[1]))
# # seg = " ".join(seg)
# print(seg)
# gudai[1]

In [5]:
# def seg_corpus(contents, save_path):
#     # 分词
#     path = save_path + "seg.csv"
#     if os.path.exists(path):
#         df = pd.read_csv(path, header=None, delimiter="\t", names=["raw", "seg"])
#     else:
#         result_seg = []
#         for content in contents:
#             seg = list(seg_depart(content))
#             seg = " ".join(seg)
#             if len(seg) > 0:
#                 result_seg.append([content, seg])

#         df = pd.DataFrame(result_seg, columns=['raw', 'seg'])
# #        df.to_csv("/home/featurize/data/seg_text.csv", index=False, header=None, sep="\t")
#         df.to_csv(path, index=False, header=None, sep="\t")
#     return df

In [6]:
def seg_corpus(contents, save_path):
    # 分词
    result_seg = []
    for content in contents:
        seg = list(seg_depart(content))
        seg = " ".join(seg)
        if len(seg) > 0:
            result_seg.append([content, seg])

    df = pd.DataFrame(result_seg, columns=['raw', 'seg'])
    # df.to_csv(path, index=False, header=None, sep="\t")
    return df

In [7]:
def get_corpus_dict(df):
    docs = df.seg.apply(lambda x: x.split())
    
    # Remove rare and common tokens.
    from gensim.corpora import Dictionary
    
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    
    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
                                                        
    corpus = [dictionary.doc2bow(list(doc)) for doc in docs]
    return dictionary, docs, corpus

In [8]:
def train_model(dictionary, docs, corpus):
    # Train LDA model.
    # Set training parameters.
    # num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.
    s = 2
    e = 40
    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    
    coherence_values = []
    model_list = []
    for num_topics in range(s, e, 2):
        lda = LdaMulticore(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
            iterations=iterations,
            eval_every=eval_every
        )
        model_list.append(lda)
        coherencemodel = CoherenceModel(model=lda, texts=docs,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(round(coherencemodel.get_coherence(), 3))
    return model_list, coherence_values

In [9]:
def get_best_model(dictionary, docs, corpus, save_path):
    from pprint import pprint
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    model_list, coherence_values = train_model(dictionary, docs, corpus)
    best_id = np.argmax(np.array(coherence_values))
    model = model_list[best_id]
    pprint(model.print_topics(num_words=20))
    # 保存最佳模型
    # model.save("/home/featurize/data/topics/best_model.lda")
    print(save_path)
    model.save(save_path + "best_model.lda")

    return model

In [10]:
def visualization(corpus, model, dictionary, save_path):
    '''
    可视化函数，输入语料、模型和字典文件，输出html文件
    corpus: 输入语料
    model: 模型
    dictionary: 字典
    save_path: html保存路径
    '''
    import pyLDAvis.gensim_models as gensimvis
    import pyLDAvis
    vis = gensimvis.prepare(model, corpus, dictionary)
#    pyLDAvis.show(vis)
    pyLDAvis.save_html(vis, save_path)

In [11]:
from gensim.corpora import Dictionary, MmCorpus

def main(save_path, df):
    dictionary, docs, corpus = get_corpus_dict(df)
    # 保存corpus
    MmCorpus.serialize(save_path + 'data_corpus.mm', corpus)
    # 保存dict
    dictionary.save(save_path + 'doc2bow.dict')
    model = get_best_model(dictionary, docs, corpus, save_path)
    visualization(corpus, model, dictionary, save_path+ "vis.html")

In [12]:
gudai = sample_set["古代"]
dangdai = sample_set["当代"]
xiandai = sample_set["现代"]
jindai = sample_set["近代"]


In [13]:
save_path = "/home/featurize/data/topics/jindai/"
df_jindai = seg_corpus(jindai, save_path)
main(save_path, df_jindai)
# df_jindai

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.858 seconds.
Prefix dict has been built successfully.


Number of unique tokens: 649
Number of documents: 1326
[(1,
  '0.037*"孙中山" + 0.019*"中华民国" + 0.018*"伟大" + 0.016*"民国" + 0.014*"中国" + '
  '0.014*"出版" + 0.012*"北京" + 0.012*"逝世" + 0.012*"委员" + 0.011*"香港" + 0.011*"主任" '
  '+ 0.010*"总统" + 0.010*"袁世凯" + 0.009*"天津" + 0.009*"生于" + 0.008*"创立" + '
  '0.008*"原名" + 0.008*"光绪" + 0.007*"国民党" + 0.007*"成立"'),
 (10,
  '0.031*"上海" + 0.025*"光绪" + 0.022*"光绪帝" + 0.016*"中国" + 0.015*"日本" + '
  '0.014*"生于" + 0.013*"成为" + 0.012*"北京" + 0.012*"清朝" + 0.012*"皇帝" + '
  '0.009*"国民党" + 0.009*"反对" + 0.008*"势力" + 0.008*"历史" + 0.008*"亲王" + '
  '0.007*"享年" + 0.007*"袁世凯" + 0.007*"同志" + 0.007*"爱新觉罗" + 0.007*"组织"'),
 (13,
  '0.036*"中国" + 0.020*"日本" + 0.019*"教授" + 0.018*"研究" + 0.018*"文化" + 0.017*"鲁迅" '
  '+ 0.014*"著名" + 0.012*"发展" + 0.012*"哲学" + 0.012*"思想" + 0.012*"浙江" + '
  '0.011*"生于" + 0.010*"先生" + 0.010*"笔名" + 0.010*"早年" + 0.009*"现代" + 0.009*"领域" '
  '+ 0.008*"近代" + 0.008*"影响" + 0.008*"毕业"'),
 (9,
  '0.029*"中国" + 0.018*"民国" + 0.016*"上海" + 0.014*"参加" + 0.013*"日本" + 0.010*"生

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

In [14]:
save_path = "/home/featurize/data/topics/xiandai/"
df_xiandai = seg_corpus(xiandai, save_path)
main(save_path, df_xiandai)
# df_xiandai

  and should_run_async(code)


Number of unique tokens: 393
Number of documents: 1023
[(0,
  '0.044*"演员" + 0.026*"电视剧" + 0.024*"京剧" + 0.021*"香港" + 0.019*"相声" + '
  '0.019*"中国" + 0.016*"获得" + 0.015*"先生" + 0.015*"李小龙" + 0.014*"出演" + '
  '0.012*"北京" + 0.012*"著名" + 0.011*"艺术" + 0.011*"父亲" + 0.010*"话剧" + 0.010*"电影" '
  '+ 0.010*"参加" + 0.010*"戏剧" + 0.010*"美国" + 0.010*"出生"'),
 (1,
  '0.022*"台湾" + 0.020*"导演" + 0.020*"北京" + 0.018*"原名" + 0.017*"著名" + 0.016*"毕业" '
  '+ 0.016*"演员" + 0.015*"电影" + 0.014*"执导" + 0.014*"中国" + 0.012*"浙江省" + '
  '0.012*"主演" + 0.012*"浙江" + 0.012*"参加" + 0.011*"工作" + 0.011*"电视剧" + '
  '0.011*"享年" + 0.010*"全国" + 0.010*"香港" + 0.010*"美国"'),
 (2,
  '0.053*"电影" + 0.041*"最佳" + 0.034*"获得" + 0.024*"演员" + 0.020*"主演" + 0.018*"香港" '
  '+ 0.018*"中国" + 0.015*"导演" + 0.014*"出演" + 0.014*"台湾" + 0.014*"电视剧" + '
  '0.012*"执导" + 0.011*"参演" + 0.010*"电影节" + 0.010*"享年" + 0.009*"日出" + '
  '0.009*"编剧" + 0.009*"金马奖" + 0.009*"喜剧片" + 0.009*"提名"'),
 (3,
  '0.045*"电影" + 0.030*"中国" + 0.026*"香港" + 0.025*"获得" + 0.019*"执导" + 0.017*"台湾" '

In [15]:
save_path = "/home/featurize/data/topics/dangdai/"
df_dangdai = seg_corpus(dangdai, save_path)
main(save_path, df_dangdai)
# df_dangdai  

  and should_run_async(code)


Number of unique tokens: 2588
Number of documents: 6730
[(15,
  '0.028*"电视剧" + 0.025*"出演" + 0.023*"最佳" + 0.015*"执导" + 0.015*"饰演" + '
  '0.014*"男演员" + 0.011*"毕业" + 0.010*"导演" + 0.010*"同年" + 0.009*"内地" + '
  '0.009*"参演" + 0.008*"爱情" + 0.008*"演员" + 0.008*"担任" + 0.007*"古装" + 0.007*"香港" '
  '+ 0.007*"电视" + 0.007*"首部" + 0.007*"提名" + 0.006*"电影节"'),
 (9,
  '0.032*"参演" + 0.028*"出演" + 0.024*"电视剧" + 0.019*"饰演" + 0.014*"参加" + '
  '0.012*"女演员" + 0.011*"最佳" + 0.011*"毕业" + 0.011*"古装" + 0.010*"内地" + '
  '0.010*"同年" + 0.008*"香港" + 0.008*"演员" + 0.007*"影视" + 0.006*"时装" + 0.006*"年度" '
  '+ 0.005*"北京" + 0.005*"湖南卫视" + 0.005*"TVB" + 0.005*"歌手"'),
 (4,
  '0.041*"出演" + 0.027*"古装" + 0.026*"饰演" + 0.015*"爱情" + 0.015*"都市" + '
  '0.014*"电视剧" + 0.013*"同年" + 0.011*"毕业" + 0.011*"播出" + 0.011*"情感" + '
  '0.010*"励志" + 0.009*"内地" + 0.009*"正式" + 0.009*"个人" + 0.008*"男演员" + '
  '0.008*"参加" + 0.008*"进入" + 0.008*"演艺圈" + 0.008*"演员" + 0.007*"参演"'),
 (6,
  '0.031*"出演" + 0.014*"同年" + 0.014*"最佳" + 0.012*"香港" + 0.011*"发行" + '
  '0.

In [16]:
save_path = "/home/featurize/data/topics/gudai/"
df_gudai = seg_corpus(gudai, save_path)
main(save_path, df_gudai)
# df_gudai

  and should_run_async(code)


Number of unique tokens: 4573
Number of documents: 9550
[(0,
  '0.011*"皇帝" + 0.011*"元年" + 0.008*"在位" + 0.008*"谥号" + 0.006*"即位" + 0.006*"皇后" '
  '+ 0.006*"二年" + 0.005*"去世" + 0.005*"三年" + 0.005*"刺史" + 0.005*"将军" + '
  '0.004*"公主" + 0.004*"庙号" + 0.004*"东汉" + 0.004*"概述" + 0.004*"来源" + 0.004*"次年" '
  '+ 0.004*"时期" + 0.003*"太子" + 0.003*"生于"'),
 (1,
  '0.009*"公主" + 0.007*"司马" + 0.007*"将军" + 0.007*"时期" + 0.007*"去世" + 0.006*"皇后" '
  '+ 0.006*"二年" + 0.005*"曹操" + 0.005*"册封" + 0.005*"元年" + 0.005*"谥号" + '
  '0.005*"蜀汉" + 0.004*"即位" + 0.004*"大臣" + 0.004*"唐朝" + 0.004*"成为" + 0.004*"贞观" '
  '+ 0.004*"大将军" + 0.004*"刺史" + 0.003*"儿子"'),
 (2,
  '0.010*"中国" + 0.009*"公元前" + 0.006*"皇帝" + 0.005*"时期" + 0.004*"司马懿" + '
  '0.004*"人物" + 0.004*"去世" + 0.004*"曹操" + 0.004*"将军" + 0.004*"谥号" + 0.004*"即位" '
  '+ 0.004*"皇后" + 0.004*"元年" + 0.003*"公主" + 0.003*"在位" + 0.003*"生于" + '
  '0.003*"太子" + 0.003*"弟子" + 0.003*"部落" + 0.003*"成为"'),
 (3,
  '0.010*"皇帝" + 0.008*"公主" + 0.008*"元年" + 0.007*"谥号" + 0.007*"在位" + 0.005*"去世" '
  '