In [1]:
#!/usr/bin/python
# -*- coding:utf-8 -*-

import jieba,os,re
from gensim import corpora, models, similarities

"""创建停用词列表"""
def stopwordslist():
    stopwords = [line.strip() for line in open('./stopwords.txt',encoding='UTF-8').readlines()]
    return stopwords

"""对句子进行中文分词"""
def seg_depart(sentence):
    sentence_depart = jieba.cut(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords:
            outstr += word
            outstr += " "
    # outstr：'黄蜂 湖人 首发 科比 带伤 战 保罗 加索尔 ...'       
    return outstr

"""如果文档还没分词，就进行分词"""
if not os.path.exists('./cnews.train_jieba.txt'):
    # 给出文档路径
    filename = "./cnews.train.txt"
    outfilename = "./cnews.train_jieba.txt"
    inputs = open(filename, 'r', encoding='UTF-8')
    outputs = open(outfilename, 'w', encoding='UTF-8')

    # 把非汉字的字符全部去掉
    # 将输出结果写入ouputs.txt中
    for line in inputs:
        line = line.split('\t')[1]
        line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
        line_seg = seg_depart(line.strip())
        outputs.write(line_seg.strip() + '\n')
    
    outputs.close()
    inputs.close()
    print("删除停用词和分词成功！！！")

In [2]:
"""准备好训练语料，整理成gensim需要的输入格式"""
fr = open('./cnews.train_jieba.txt', 'r',encoding='utf-8')
train = []
for line in fr.readlines():
    line = [word.strip() for word in line.split(' ')]
    train.append(line)
    # train: [['黄蜂', '湖人', '首发', '科比', '带伤', '战',...],[...],...]
    
"""构建词频矩阵，训练LDA模型"""
dictionary = corpora.Dictionary(train)
# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...]
# corpus是把每条新闻ID化后的结果，每个元素是新闻中的每个词语，在字典中的ID和频率
corpus = [dictionary.doc2bow(text) for text in train]

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
topic_list = lda.print_topics(10)
print("10个主题的单词分布为：\n")
for topic in topic_list:
    print(topic)

10个主题的单词分布为：

(0, '0.006*"万" + 0.006*"中" + 0.005*"元" + 0.004*"高速" + 0.003*"句子" + 0.003*"市场" + 0.003*"公司" + 0.003*"约" + 0.002*"说" + 0.002*"时间"')
(1, '0.010*"中" + 0.005*"做" + 0.005*"英语" + 0.004*"单词" + 0.004*"说" + 0.003*"镜头" + 0.003*"文章" + 0.003*"公司" + 0.003*"时间" + 0.003*"搭载"')
(2, '0.017*"申购" + 0.015*"净值" + 0.014*"分红" + 0.010*"索尼" + 0.009*"公司" + 0.007*"中" + 0.005*"减持" + 0.004*"产品" + 0.004*"市场" + 0.004*"行业"')
(3, '0.013*"考试" + 0.010*"基金" + 0.006*"中" + 0.005*"公司" + 0.005*"英寸" + 0.005*"万" + 0.005*"采用" + 0.005*"视频" + 0.004*"中国" + 0.004*"债券"')
(4, '0.089*"基金" + 0.015*"投资" + 0.014*"市场" + 0.011*"股票" + 0.010*"公司" + 0.009*"中" + 0.008*"行业" + 0.007*"经理" + 0.006*"新" + 0.006*"经济"')
(5, '0.011*"索尼" + 0.008*"快门" + 0.008*"万" + 0.006*"拍摄" + 0.006*"镜头" + 0.005*"高清" + 0.005*"小巧" + 0.004*"中" + 0.003*"短片" + 0.003*"性能"')
(6, '0.012*"功能" + 0.009*"拍摄" + 0.008*"中" + 0.008*"采用" + 0.008*"玩家" + 0.007*"机身" + 0.007*"支持" + 0.006*"相机" + 0.006*"拥有" + 0.005*"游戏"')
(7, '0.024*"分红" + 0.014*"基金" + 0.012*"赎回" + 0.011*"公司" + 

In [3]:
"""抽取新闻的主题"""
# 用来测试的三条新闻，分别为体验、娱乐和科技新闻    
file_test = "./cnews.test.txt"
news_test = open(file_test, 'r', encoding='UTF-8')
    
test = []
# 处理成正确的输入格式       
for line in news_test:
    line = line.split('\t')[1]
    line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
    line_seg = seg_depart(line.strip())
    line_seg = [word.strip() for word in line_seg.split(' ')]
    test.append(line_seg)    
    
# 新闻ID化    
corpus_test = [dictionary.doc2bow(text) for text in test]
# 得到每条新闻的主题分布
topics_test = lda.get_document_topics(corpus_test)  
labels = ['体育','娱乐','科技']
for i in range(3):
    print('这条'+labels[i]+'新闻的主题分布为：\n')
    print(topics_test[i],'\n')

fr.close()
news_test.close()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SIYI\AppData\Local\Temp\jieba.cache
Loading model cost 1.222 seconds.
Prefix dict has been built succesfully.


这条体育新闻的主题分布为：

[(1, 0.36818886), (3, 0.06355113), (5, 0.055832624), (9, 0.5021481)] 

这条娱乐新闻的主题分布为：

[(1, 0.17199118), (3, 0.2358956), (6, 0.281208), (9, 0.3077155)] 

这条科技新闻的主题分布为：

[(3, 0.51802427), (4, 0.19868204), (6, 0.27476338)] 

