In [None]:
import io

import gensim
import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyhanlp import *
from tqdm import tqdm


In [None]:
# Load the source data and report its shape and column names
df = pd.read_csv('./all_bk.csv')
for label, value in (('行，列:', df.shape), ('字段:', df.columns)):
    print(label, value)


In [None]:
# Stop-word list: one word per line in ./stopwords.txt, plus a few extra
# entries that are meant to be tuned while inspecting the tokenization output.
# The encoding is stated explicitly so the result does not depend on the
# platform default; change it if the file is not UTF-8.
with open('./stopwords.txt', encoding='utf-8') as f:
    stop_words = f.read().splitlines()
stop_word = [' ', '', r'&#', r'x0D']  # supplementary stop words, adjust as needed
# extend, not append: append would insert the whole list as ONE element,
# so none of the supplementary entries could ever match a token.
stop_words.extend(stop_word)


In [None]:
# Use HanLP's information-entropy based phrase extraction to find candidate new
# words (intended to be saved as newword.txt; this cell only prints them).
# Phrase extraction is only effective on long text, so every title is joined
# into one string first.
# In the author's experiments this worked better than HanLP's two segmentation methods.
#
# iloc selects the 4th column purely by position, missing titles are dropped
# (a NaN would otherwise break the string join) and join() builds the text in
# one pass instead of repeated string concatenation.
titles = df.iloc[:, 3].dropna().astype(str)  # title text
txt_str = ''.join(titles)
for new_word in HanLP.extractPhrase(txt_str, 100):
    print(new_word)


In [None]:
# Tokenize every document. Tokenization quality is critical: keep inspecting the
# result and adjust the stop-word list and the new-word dictionary accordingly.

# Register custom words ONCE, before tokenizing (this is loop-invariant setup;
# the word list is meant to be tuned).
for custom_word in ('反恐法', '三股势力', '斯里兰卡', '去极端化', '伊斯兰国'):
    jieba.add_word(custom_word)
# New-word dictionary; domain-specific dictionaries can be loaded the same way
jieba.load_userdict('./newwords.txt')

# Set membership is O(1). Non-string entries are skipped so an unhashable
# element in stop_words cannot break the set construction.
stop_word_set = {w for w in stop_words if isinstance(w, str)}

docs_words = []
for title in df.iloc[:, 3]:  # 4th column by position; change to tokenize another field
    # Precise mode; str() keeps one (possibly empty) token list per row even
    # for missing values, so docs_words stays aligned with df.
    tokens = (tok.strip() for tok in jieba.lcut(str(title), cut_all=False))
    docs_words.append([
        tok for tok in tokens
        if len(tok) >= 2                                    # drop single characters (tunable)
        and tok not in stop_word_set                        # drop stop words
        and all('\u4e00' <= ch <= '\u9fa5' for ch in tok)   # keep tokens made only of CJK characters (tunable)
    ])


In [None]:
# Detect bigrams and trigrams; in the author's experiments this step improved
# tokenization quality noticeably.
# Train the phrase detectors (a higher threshold produces fewer phrases).
phrase_threshold = 100
bigram = gensim.models.Phrases(docs_words, min_count=5, threshold=phrase_threshold)
trigram = gensim.models.Phrases(bigram[docs_words], threshold=phrase_threshold)

# Phraser wrappers: a faster way to merge a sentence into bigrams/trigrams
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrases_model) for phrases_model in (bigram, trigram)
)


In [None]:
# Final tokenization result: apply the bigram phraser, then the trigram phraser,
# to each document's token list.
new_docs_words = [trigram_mod[bigram_mod[doc_words]] for doc_words in docs_words]


In [None]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel, LdaModel
from gensim import models
%matplotlib inline


In [None]:
dictionary = Dictionary(new_docs_words)
# Bag-of-words: every token gets a unique id and is counted per document.
# The corpus is built from the SAME phrase-merged tokens as the dictionary;
# using the pre-merge docs_words here would silently drop every merged
# bigram/trigram from the corpus.
corpus = [dictionary.doc2bow(text) for text in new_docs_words]
# TF-IDF: re-weight counts so frequent but uninformative words matter less
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
print(tfidf_model)
print(corpus_tfidf)


In [None]:
# Determine the optimal number of topics
def compute_coherence_values(id2word, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for LDA Mallet models over a range of topic counts.

    One model is trained for every value in ``range(start, limit, step)``.

    Parameters:
    ----------
    id2word : Gensim dictionary (used both for training and for coherence)
    corpus : Gensim corpus
    texts : List of tokenized input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Notes:
    -----
    Reads the module-level ``mallet_path``, which must be set before calling.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Another model type can be substituted here
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        # Score with the dictionary that was passed in, not a same-named global
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


In [None]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
# A single (start, limit, step) drives both the training sweep and the x axis,
# so the number of x values always equals the number of coherence scores.
start, limit, step = 2, 40, 2
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(id2word=dictionary, corpus=corpus, texts=new_docs_words, start=start, limit=limit, step=step)
# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))


In [1]:
# Option 1: gensim LdaModel trained on the TF-IDF weighted corpus.
# Check the meaning of each parameter; set num_topics to the optimal topic count.
# random_state pins the random initialisation so a re-run reproduces the topics.
lda_model = models.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20, dtype=np.float64,
                            iterations=6000, chunksize=2000, passes=1, random_state=42)
lda_model.show_topics(num_topics=20, num_words=10, log=False, formatted=True)


In [2]:
# Option 2: LDA through the Mallet wrapper.
# Set num_topics to the optimal topic count.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=32, id2word=dictionary)
# Save the model (the path string needs straight quotes on both sides)
ldamallet.save('./mallet_LDAmodle')

In [None]:
# Visualization option 1: applies to modelling option 1 (the gensim LdaModel)
import pyLDAvis
import pyLDAvis.gensim
# lda_model is the option-1 model; the previously referenced name `model2`
# is not defined anywhere in the visible notebook.
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# Save the visualization
pyLDAvis.save_html(vis_data, './lda_vis.html')
# display() is kept as the last expression so its result is rendered as the cell output
pyLDAvis.display(vis_data)



In [None]:
# Label each document with its dominant topic
def format_topics_sentences(ldamodel, corpus, texts):
    """
    Build a table holding the dominant topic of every document.

    Parameters:
    ----------
    ldamodel : Topic model; ``ldamodel[corpus]`` must yield, per document, a
        sequence of ``(topic_id, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_id)`` a sequence of ``(word, weight)`` pairs
    corpus : Corpus passed to ``ldamodel[...]``
    texts : One entry per document, appended as the last column

    Returns:
    -------
    DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution' and
    'Topic_Keywords', followed by an unnamed column (label 0) holding ``texts``.
    A document without any topic gets a NaN / empty row so that the rows stay
    aligned with ``texts``.
    """
    # Rows are collected in a list and turned into a frame once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            records.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = pair with the highest proportion (first one on ties)
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Adjust the arguments here (model, corpus, texts)
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=new_docs_words)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Save the result
df_dominant_topic.to_csv('./mallet_wz_topic_docs.csv')

# Show (kept as the last expression so the preview is rendered as the cell output)
df_dominant_topic.head(10)


In [None]:
# Visualize each topic's share of the documents
topics_size = df_dominant_topic['Document_No'].groupby(df_dominant_topic['Dominant_Topic']).count()
topics_size = topics_size.sort_values()
topics_per = topics_size/df_dominant_topic.shape[0]
# Fonts are looked up in list order until one is found on the machine
plt.rcParams['font.family'] = ['Arial Unicode MS','Microsoft YaHei','SimHei','sans-serif']
plt.rcParams['axes.unicode_minus'] = False  # avoid a garbled minus sign with the CJK fonts
# Figure size
fig, ax = plt.subplots(figsize=(15, 10))
topics_per.plot.barh(ax=ax)
# Decorate AFTER plotting: pandas' barh sets the y label to the index name and
# rebuilds the tick labels, which would overwrite anything set beforehand.
ax.set_title('主题占比', fontsize=16, fontweight='bold')
ax.set_ylabel('主题ID', fontsize=16)
ax.set_xlabel('比例', fontsize=16)
ax.tick_params(axis='y', labelsize=13)  # tick label font size
plt.show()


In [None]:
# Visualize the number of documents under each topic
# Fonts are looked up in list order until one is found on the machine
plt.rcParams['font.family'] = ['Arial Unicode MS','Microsoft YaHei','SimHei','sans-serif']
plt.rcParams['axes.unicode_minus'] = False  # avoid a garbled minus sign with the CJK fonts
# Figure size
fig, ax = plt.subplots(figsize=(15, 10))
topics_size.plot.barh(ax=ax)
# Decorate AFTER plotting: pandas' barh sets the y label to the index name and
# rebuilds the tick labels, which would overwrite anything set beforehand.
ax.set_title('主题下属文章数', fontsize=16, fontweight='bold')
ax.set_ylabel('主题ID', fontsize=16)
ax.set_xlabel('文章数(篇)', fontsize=16)
ax.tick_params(axis='y', labelsize=13)  # tick label font size
# Annotate every bar with its count. barh draws the k-th bar at y = k, so the
# positions come from enumerate() instead of a hard-coded topic count.
# The +10 horizontal offset is in data units (documents); tune it to the data scale.
for bar_pos, doc_count in enumerate(topics_size):
    ax.text(doc_count + 10, bar_pos, '%.0f' % doc_count, ha='center', va='center', fontsize=13)
plt.show()


In [None]:
# Find the most representative document of each topic:
# the single row with the highest Perc_Contribution inside every Dominant_Topic group.
# idxmax picks that row per group in one pass (no DataFrame growing in a loop);
# groups come out ordered by topic id.
best_row_per_topic = df_topic_sents_keywords.groupby('Dominant_Topic')['Perc_Contribution'].idxmax()
sent_topics_sorteddf_mallet = df_topic_sents_keywords.loc[best_row_per_topic].reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Save the result
sent_topics_sorteddf_mallet.to_csv('./mallet_wz_pretopic_docs.csv')

# Show (kept as the last expression so the preview is rendered as the cell output)
sent_topics_sorteddf_mallet.head(32)


In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.XKCD_COLORS.items()]
# more colors: 'mcolors.XKCD_COLORS' mcolors.CSS4_COLORS mcolors.TABLEAU_COLORS

topics = ldamallet.show_topics(num_topics=32,formatted=False)  # adjust to the model's topic count

fig, axes = plt.subplots(16, 2, figsize=(30,100), sharex=True, sharey=True)  # adjust the grid to the topic count

flat_axes = axes.flatten()
# zip() stops at the shorter of (panels, topics), so fewer topics than panels
# no longer raises an IndexError.
for panel_idx, (ax, (topic_id, word_weights)) in enumerate(zip(flat_axes, topics)):
    panel_color = cols[panel_idx % len(cols)]
    # The colour is bound through a keyword default, so the colour function
    # does not depend on whatever a global loop variable holds when it is called.
    cloud = WordCloud(stopwords=stop_words,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, _color=panel_color, **kwargs: _color,
                      prefer_horizontal=1.0,
                      font_path='./SourceHanSansCN-Regular.otf')
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    # Title uses the topic id reported by the model, not the panel position
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=30))
    ax.axis('off')

# Hide any panel that did not receive a topic
for unused_ax in flat_axes[len(topics):]:
    unused_ax.axis('off')

fig.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.show()


In [None]:
from collections import Counter
topics = ldamallet.show_topics(num_topics=32,formatted=False)
# Corpus-wide frequency of every token in the final tokenization
counter = Counter(w for w_list in new_docs_words for w in w_list)

# One row per (topic, keyword): the keyword's weight inside the topic and its
# overall count in the corpus.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

topics_words = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# The plotting cell that follows refers to this frame as `df2`; keep that name
# bound so it continues to work.
df2 = topics_words
# Save the topic-keyword data
topics_words.to_csv('./wz_topic_kw.csv')


In [None]:
# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(16,2, figsize=(16,60), sharey=True, dpi=160)  # adjust the grid to the topic count
cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # must provide at least one colour per panel
# Fonts are looked up in list order until one is found on the machine
plt.rcParams['font.family'] = ['Arial Unicode MS','Microsoft YaHei','SimHei','sans-serif']
plt.rcParams['axes.unicode_minus'] = False  # avoid a garbled minus sign with the CJK fonts
# Optional background colour: plt.rcParams['axes.facecolor'] = 'grey'
for i, ax in enumerate(axes.flatten()):
    # `topics_words` is the topic-keyword frame built in the preceding cell;
    # select this panel's rows once and reuse them for both bar series and the labels.
    topic_rows = topics_words.loc[topics_words.topic_id==i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, 0.20); ax.set_ylim(0, 45000)  # adjust the y ranges to the data
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()
