# Loading Data

In [4]:
import json
import jieba
from gensim.models import LdaModel
import pandas as pd
from gensim.corpora import Dictionary

train = r"C:\Users\Desjajja\OneDrive - email.ncu.edu.cn\课程\统计计算\期末\iflytek_public\train.json" # 训练集路径


with open(train, encoding='utf-8') as f:
    sentences = [] # 逐行读取json文件，分词，存入列表

    for line in f.readlines():

        d = json.loads(line)

        label = d['label']

        label_des = d['label_des']

        sentence = d['sentence']

        sentence = sentence.strip()

        sentence = ",".join(sentence.split())

        sentences.append(jieba.lcut(sentence))

    f.close()

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Desjajja\AppData\Local\Temp\jieba.cache
Loading model cost 1.739 seconds.
Prefix dict has been built successfully.


# Training LDA

In [8]:
dictionary = Dictionary(sentences)
dictionary.filter_n_most_frequent(200)
corpus = [dictionary.doc2bow(text) for text in sentences ]

In [9]:
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)

In [10]:
topic_list = lda.print_topics(10)
print("10个主题的单词分布为：\n")
for topic in topic_list:
    print(topic)

10个主题的单词分布为：

(0, '0.010*"商品" + 0.008*"优惠券" + 0.007*"快递" + 0.007*"折扣" + 0.007*"购物" + 0.006*"优惠" + 0.006*"充值" + 0.006*"多开" + 0.005*"旅行" + 0.005*"旅游"')
(1, '0.019*"小说" + 0.018*"阅读" + 0.007*"公交" + 0.005*"★" + 0.004*"热门" + 0.004*"僵尸" + 0.003*"流量" + 0.003*"播放" + 0.003*"路线" + 0.003*"正版"')
(2, '0.010*"跑" + 0.008*"胡子" + 0.005*"捕鱼" + 0.005*"预测" + 0.004*"麻将" + 0.004*"约会" + 0.004*"流量" + 0.004*"单机" + 0.004*"交友" + 0.003*"送"')
(3, '0.008*"手游" + 0.005*"开启" + 0.005*"技能" + 0.004*"直播" + 0.004*"聊天" + 0.004*"音乐" + 0.004*"战斗" + 0.004*"交友" + 0.003*"萌宠" + 0.003*"来电"')
(4, '0.006*"红包" + 0.005*"企业" + 0.005*"文件" + 0.005*"支付" + 0.004*"动态" + 0.004*"壁纸" + 0.004*"通知" + 0.004*"客户端" + 0.004*"云" + 0.003*"设备"')
(5, '0.032*"宝宝" + 0.017*";" + 0.014*"&" + 0.010*"巴士" + 0.009*"音乐" + 0.005*"故事" + 0.005*"妈妈" + 0.005*"儿歌" + 0.005*"育儿" + 0.005*"儿童"')
(6, '0.012*"直播" + 0.007*"电视" + 0.006*"壁纸" + 0.005*"动态" + 0.005*"公主" + 0.005*"美丽" + 0.005*"她" + 0.004*"图片" + 0.004*"高清" + 0.004*"它"')
(7, '0.017*"贷款" + 0.016*"借款" + 0.008*"借钱" + 0.0

# Test

In [17]:
demo = r"C:\Users\Desjajja\OneDrive - email.ncu.edu.cn\课程\统计计算\期末\iflytek_public\dev.json"

with open(demo, encoding='utf-8') as f:
    demo_sentences = [] # 逐行读取json文件，分词，存入列表

    demo_labels = []

    for line in f.readlines():

        d = json.loads(line)

        label = d['label']

        label_des = d['label_des']

        sentence = d['sentence']

        sentence = sentence.strip()

        sentence = ",".join(sentence.split())

        demo_sentences.append(jieba.lcut(sentence))

        demo_labels.append(label)

f.close()


In [19]:
# 测试10条
for i in range(10):
    print("label id:", demo_labels[i])
    print("\n")

    doc_bow = dictionary.doc2bow(demo_sentences[i])    
    rst = lda[doc_bow]
    for idx, lky in rst
    print("lda result:", rst)
    print("\n")




label id: 110


lda result: [(0, 0.4713706), (4, 0.42289895), (5, 0.09384658)]


label id: 70


lda result: [(0, 0.014280724), (1, 0.01428148), (2, 0.014281982), (3, 0.01428059), (4, 0.014283068), (5, 0.014281249), (6, 0.014282262), (7, 0.014283687), (8, 0.8714636), (9, 0.014281378)]


label id: 10


lda result: [(0, 0.20578696), (2, 0.15337059), (4, 0.4799358), (5, 0.06033147), (8, 0.08129551)]


label id: 18


lda result: [(1, 0.10361827), (2, 0.05527945), (3, 0.7109083), (4, 0.049515486), (5, 0.010134702), (8, 0.012831275), (9, 0.05533035)]


label id: 17


lda result: [(0, 0.020029237), (1, 0.020024557), (2, 0.02002185), (3, 0.020023756), (4, 0.020019487), (5, 0.020024091), (6, 0.8197941), (7, 0.020018762), (8, 0.020019889), (9, 0.020024281)]


label id: 34


lda result: [(0, 0.116023876), (4, 0.6855978), (5, 0.083707646), (7, 0.1047039)]


label id: 71


lda result: [(3, 0.0557655), (5, 0.9378907)]


label id: 104


lda result: [(0, 0.065075), (1, 0.068911105), (3, 0.095410444), (