In [None]:
import jieba,os,re
from gensim import corpora, models, similarities

"""创建停用词列表"""
def stopwordslist():
    stopwords = [line.strip() for line in open('./stopwords.txt',encoding='UTF-8').readlines()]
    return stopwords

"""对句子进行中文分词"""
def seg_depart(sentence):
    sentence_depart = jieba.cut(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords:
            outstr += word
            outstr += " "
    # outstr：'黄蜂 湖人 首发 科比 带伤 战 保罗 加索尔 ...'       
    return outstr

"""如果文档还没分词，就进行分词"""
if not os.path.exists('./cnews.train_jieba.txt'):
    # 给出文档路径
    filename = "./cnews.train.txt"
    outfilename = "./cnews.train_jieba.txt"
    inputs = open(filename, 'r', encoding='UTF-8')
    outputs = open(outfilename, 'w', encoding='UTF-8')

    # 把非汉字的字符全部去掉
    # 将输出结果写入ouputs.txt中
    for line in inputs:
        line = line.split('\t')[1]
        line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
        line_seg = seg_depart(line.strip())
        outputs.write(line_seg.strip() + '\n')
    
    outputs.close()
    inputs.close()
    print("删除停用词和分词成功！！！")

In [2]:
fr = open('./cnews.train_jieba.txt', 'r',encoding='utf-8')
train = []
for line in fr.readlines():
    line = [word.strip() for word in line.split(' ')]
    train.append(line)
    # train: [['黄蜂', '湖人', '首发', '科比', '带伤', '战',...],[...],...]
    
"""构建词频矩阵，训练LDA模型"""
dictionary = corpora.Dictionary(train)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...]
# corpus是把每条新闻ID化后的结果，每个元素是新闻中的每个词语，在字典中的ID和频率
corpus = [dictionary.doc2bow(text) for text in train]

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
topic_list = lda.print_topics(10)
print("10个主题的单词分布为：\n")
for topic in topic_list:
    print(topic)

10个主题的单词分布为：

(0, '0.010*"功能" + 0.009*"基金" + 0.008*"中" + 0.008*"拍摄" + 0.008*"采用" + 0.007*"元" + 0.006*"万" + 0.006*"支持" + 0.005*"镜头" + 0.005*"英寸"')
(1, '0.073*"基金" + 0.013*"公司" + 0.010*"中" + 0.008*"分红" + 0.008*"投资" + 0.007*"考试" + 0.006*"股票" + 0.006*"经理" + 0.005*"市场" + 0.005*"赎回"')
(2, '0.012*"市场" + 0.010*"涨幅" + 0.009*"投资" + 0.007*"基金" + 0.006*"行业" + 0.006*"公司" + 0.005*"投资者" + 0.005*"经济" + 0.004*"上涨" + 0.004*"增长"')
(3, '0.008*"设计" + 0.006*"产品" + 0.006*"采用" + 0.006*"传感器" + 0.006*"视频" + 0.005*"拍摄" + 0.005*"三星" + 0.005*"功能" + 0.005*"倍" + 0.005*"万"')
(4, '0.060*"基金" + 0.013*"市场" + 0.009*"投资" + 0.008*"中" + 0.008*"股票" + 0.006*"公司" + 0.005*"新" + 0.005*"中国" + 0.005*"行业" + 0.005*"亿元"')
(5, '0.009*"玩家" + 0.008*"中" + 0.005*"相机" + 0.005*"游戏" + 0.004*"手机" + 0.004*"说" + 0.003*"活动" + 0.003*"时间" + 0.003*"句子" + 0.003*"答案"')
(6, '0.014*"公司" + 0.009*"投资" + 0.008*"市场" + 0.007*"私募" + 0.006*"股票" + 0.005*"基金" + 0.005*"行业" + 0.004*"中" + 0.004*"新能源" + 0.004*"长城"')
(7, '0.023*"分红" + 0.011*"机身" + 0.008*"红利" + 0.005

In [3]:
dictionary[3]

'主场'

In [4]:
"""抽取新闻的主题"""
# 用来测试的三条新闻，分别为体验、娱乐和科技新闻    
file_test = "./cnews.test.txt"
news_test = open(file_test, 'r', encoding='UTF-8')
    
test = []
# 处理成正确的输入格式       
for line in news_test:
    line = line.split('\t')[1]
    line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
    line_seg = seg_depart(line.strip())
    line_seg = [word.strip() for word in line_seg.split(' ')]
    test.append(line_seg)    
    
# 新闻ID化    
corpus_test = [dictionary.doc2bow(text) for text in test]
# 得到每条新闻的主题分布
topics_test = lda.get_document_topics(corpus_test)  
labels = ['体育','娱乐','科技']
for i in range(3):
    print('这条'+labels[i]+'新闻的主题分布为：\n')
    print(topics_test[i],'\n')

fr.close()
news_test.close()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ASUS\AppData\Local\Temp\jieba.cache
Loading model cost 0.450 seconds.
Prefix dict has been built successfully.


这条体育新闻的主题分布为：

[(3, 0.0159496), (4, 0.046352763), (5, 0.2440357), (6, 0.1527652), (8, 0.3432481), (9, 0.19527471)] 

这条娱乐新闻的主题分布为：

[(0, 0.16890468), (3, 0.14646012), (5, 0.36114714), (6, 0.09123584), (8, 0.2295812)] 

这条科技新闻的主题分布为：

[(0, 0.43570173), (3, 0.24439986), (4, 0.30053645), (8, 0.016059207)] 



In [6]:
import pyLDAvis.gensim_models

In [8]:
data = pyLDAvis.gensim_models.prepare(lda, corpus=corpus,dictionary=dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [9]:
pyLDAvis.display(data=data)