In [1]:
import numpy as np
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
from sklearn import metrics
from sklearn.cluster import KMeans
from pprint import pprint
import time
import jieba
import jieba.analyse
import pandas as pd
from sklearn.manifold import TSNE
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def load_stopword(stopword_path='F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\scu_stopwords.txt'):
    """Load the stop-word list used to filter extracted keywords.

    Parameters
    ----------
    stopword_path : str, optional
        Path of a UTF-8 text file with one stop word per line. Defaults to
        the ``scu_stopwords.txt`` path this notebook was written against.

    Returns
    -------
    list of str
        The stripped lines of the file, followed by the notebook-specific
        extra stop words defined below.
    """
    # Other stop-word files that were used with this notebook:
    #   'F:\\编程练习\\Jupyter notebook\\文本文件夹\\博客爬虫分析文章\\停用词表.txt'
    #   'F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\停用词库.txt'

    # `with` closes the handle even when reading raises.
    with open(stopword_path, 'r', encoding='UTF-8') as f_stop:
        sw = [line.strip() for line in f_stop]

    # Extra words appended on top of the generic stop-word file.
    stopword_extend = ['\n','湖北','武汉','病毒','中国',
                       '兰州','甘肃','我国','北京','微博','正文',
                       '收起','肖战','消费','复工','复产','全文','消费者','疫情','网页','链接','美国','浙江','杭州','新冠','市场']
    sw.extend(stopword_extend)
    return sw


def jieba_text(data_path):
    """Extract TextRank keywords for every row of an Excel sheet.

    Parameters
    ----------
    data_path : str
        Path of an Excel file readable by ``pd.read_excel``. It must have a
        '内容' column holding the raw text of each document.

    Returns
    -------
    list of list of str
        One keyword list per row, in row order. Each list holds at most 20
        TextRank keywords (restricted to the listed POS tags) that are
        longer than one character and are not stop words. Rows whose
        content is not a string yield an empty list, so list positions
        stay aligned with the sheet rows.
    """
    # A set makes each stop-word test O(1) instead of a scan of the whole list.
    stopwords = set(load_stopword())
    df = pd.read_excel(data_path)
    text_seg_list = []
    for fileContent in df['内容']:
        if not isinstance(fileContent, str):
            # Empty cells come back from pandas as NaN (a float), which
            # TextRank cannot segment; keep a placeholder for the row.
            text_seg_list.append([])
            continue
        segs = jieba.analyse.textrank(fileContent, topK=20, withWeight=False,
                                      allowPOS=('ns','n','nr','nt','vn'))
        text_seg_list.append(
            [seg for seg in segs if len(seg) > 1 and seg not in stopwords]
        )
    return text_seg_list

    
# Export data to a CSV file
def dataToCsv(file,df):
    """Write ``df`` to ``file`` as CSV without the index, then print the path.

    Parameters
    ----------
    file : str
        Destination path of the CSV file.
    df : object with a ``to_csv`` method
        The data to export (called as ``df.to_csv(file, index=False)``).
    """
    df.to_csv(file, index=False)
    message = 'csv文件已生成在：{}'.format(file)
    print(message)

In [12]:
def LDA(text_seg_list,topic_num,num_show_term=10):
    """Fit an LDA topic model and list the top terms of every topic.

    The documents are converted to bag-of-words vectors, re-weighted with
    TF-IDF, and used to train a gensim ``LdaModel``. The fitted model is
    then handed to pyLDAvis for visualisation.

    Parameters
    ----------
    text_seg_list : list of list of str
        Tokenised documents.
    topic_num : int
        Number of LDA topics to fit.
    num_show_term : int, optional
        Number of top terms reported per topic (default 10).

    Returns
    -------
    lda_topic : dict of str -> list
        Maps each rank ``'1'`` .. ``str(num_show_term)`` to a list with one
        entry per topic: the term holding that rank in the topic, or
        ``None`` when the topic has fewer terms than the rank. Every list
        therefore has length ``topic_num`` and the dict can be passed
        straight to ``pd.DataFrame``.
    lda_topic_prob : list of numpy.ndarray
        One float array of length ``num_show_term`` per topic with the
        weights of the ranked terms; missing ranks are NaN.
    Perplexity : float
        The value returned by ``lda.log_perplexity`` on the training corpus.
    """
    # Build the token <-> id dictionary.
    dictionary = corpora.Dictionary(text_seg_list)

    # Convert each document to (token id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in text_seg_list]

    # Re-weight the counts with TF-IDF.
    # NOTE(review): LDA is trained on TF-IDF weights rather than raw counts —
    # confirm this is intended.
    corpus_tfidf = models.TfidfModel(corpus)[corpus]

    # Train the model.
    lda = models.LdaModel(corpus_tfidf, num_topics=topic_num, id2word=dictionary)
    Perplexity = lda.log_perplexity(corpus_tfidf)

    lda_topic = {str(rank): [] for rank in range(1, num_show_term + 1)}
    lda_topic_prob = []
    for topic_id in range(topic_num):
        top_terms = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
        probs = np.full(num_show_term, np.nan)
        for rank in range(1, num_show_term + 1):
            if rank <= len(top_terms):
                term_id, prob = top_terms[rank - 1]
                # dictionary[...] fills the lazy id->token map on demand;
                # reading dictionary.id2token directly can hit an empty dict.
                lda_topic[str(rank)].append(dictionary[int(term_id)])
                probs[rank - 1] = prob
            else:
                # Pad so every rank list has exactly one entry per topic.
                lda_topic[str(rank)].append(None)
        lda_topic_prob.append(probs)

    # Visualise the fitted model.
    # NOTE(review): pyLDAvis.show appears to serve the page from a local web
    # server; confirm whether this call blocks the cell until interrupted.
    vis_data = pyLDAvis.gensim.prepare(lda,corpus_tfidf,dictionary)
    pyLDAvis.show(vis_data,open_browser = False)

    return lda_topic,lda_topic_prob,Perplexity

In [4]:
# Doc2Vec embedding and K-means clustering
# Short alias for gensim's TaggedDocument class; get_train builds its
# training records with it.
TaggededDocument = gensim.models.doc2vec.TaggedDocument
 
    
def get_train(text_seg_list):
    """Wrap every token list in a ``TaggededDocument`` tagged with its position.

    Parameters
    ----------
    text_seg_list : iterable of list of str
        Tokenised documents.

    Returns
    -------
    list
        One ``TaggededDocument(words, tags=[i])`` per input document, where
        ``i`` is the document's 0-based position in ``text_seg_list``.
    """
    return [
        TaggededDocument(words, tags=[doc_id])
        for doc_id, words in enumerate(text_seg_list)
    ]
 
    
def train(x_train, size=200, epoch_num=1):
    """Train a Doc2Vec model on the tagged documents.

    Parameters
    ----------
    x_train : list
        Tagged documents as produced by ``get_train``.
    size : int, optional
        Dimensionality of the document vectors (default 200).
    epoch_num : int, optional
        Currently ignored: training always runs for 100 epochs. The
        parameter is kept so existing calls keep working.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    # `vector_size` replaces the deprecated `size` keyword of Doc2Vec.
    # The corpus is deliberately NOT passed to the constructor: doing so
    # makes the constructor train once already, and the explicit train()
    # call below would then be a second, redundant training run.
    model_dm = Doc2Vec(min_count=1, window=3, vector_size=size, sample=1e-3, negative=5, workers=4)
    model_dm.build_vocab(x_train)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    return model_dm
 
    
def cluster(x_train,model_dm,n_clu):
    """Infer a vector for every document and group the vectors with K-means.

    Parameters
    ----------
    x_train : iterable
        Records that unpack into two values; the first one (the words) is
        passed to ``model_dm.infer_vector``.
    model_dm : object
        Model providing ``infer_vector``.
    n_clu : int
        Number of K-means clusters.

    Returns
    -------
    tuple
        ``(labels, infered_vectors_list)``: the cluster label predicted for
        each document, and the list of inferred vectors in document order.
    """
    infered_vectors_list = [
        model_dm.infer_vector(words) for words, _tags in x_train
    ]

    kmean_model = KMeans(n_clusters=n_clu)
    kmean_model.fit(infered_vectors_list)
    return kmean_model.predict(infered_vectors_list), infered_vectors_list

In [5]:
# Paths of the three input spreadsheets (one .xls and two .xlsx files).
# NOTE(review): these are absolute, machine-specific Windows paths; the
# notebook only runs where these files exist at these locations.
data_path_1 = 'F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\复产内容信息2020-05-10.xls'
data_path_2 = 'F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\复工内容信息2020-05-10.xlsx'
data_path_3 = 'F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\消费内容信息2020-05-10.xlsx'

In [6]:
# Build the training set: extract keywords from the third spreadsheet,
# wrap them as tagged documents, and train the Doc2Vec model on them.
text_seg_list = jieba_text(data_path_3)
x_train = get_train(text_seg_list)
model_dm = train(x_train)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MU_XIA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.589 seconds.
Prefix dict has been built successfully.


In [None]:
# Silhouette-coefficient evaluation
# Infer one Doc2Vec vector per tagged document, in document order.
infered_vectors_list = [
    model_dm.infer_vector(words) for words, _tags in x_train
]

def get_cluster_n(infered_vectors_list,model_dm,n_clu):
    """Fit K-means with ``n_clu`` clusters and report the silhouette score.

    Parameters
    ----------
    infered_vectors_list : list
        Document vectors to cluster.
    model_dm : object
        Not used by this function; kept so existing calls keep working.
    n_clu : int
        Number of K-means clusters.

    Returns
    -------
    float
        The silhouette score (cosine metric) of the fitted clustering. The
        score is also printed.
    """
    kmean_model = KMeans(n_clusters=n_clu)
    kmean_model.fit(infered_vectors_list)
    # The labels assigned during fit() are all the score needs, so no
    # separate predict() pass over the same data is made.
    score = metrics.silhouette_score(infered_vectors_list, kmean_model.labels_, metric='cosine')
    print(score)
    return score

# Print the silhouette coefficient for every cluster count from 3 to 19
# (the upper bound 20 of range() is exclusive); a count with a comparatively
# high coefficient is then used as the number of cluster centres.
for i in range(3,20):
    get_cluster_n(infered_vectors_list,model_dm,i)

In [7]:
# Run K-means with the chosen number of cluster centres
labels,infered_vectors_list = cluster(x_train,model_dm,3)   # the last argument is the number of cluster centres
# One row per document: its keyword list and the cluster it was assigned to.
clu_df = pd.DataFrame({'内容':text_seg_list}).assign(labels=labels)

In [8]:
# Store each cluster in its own CSV file
save_path = 'F:\\学习用夹\\大三下学期\\综合课程设计\\实验数据\\聚类结果\\'   # folder that receives the clustering results
clu_gp = clu_df.groupby('labels')

cluster_lists = []
for clu_id in clu_gp.groups.keys():
    members = clu_gp.get_group(clu_id)['内容']
    # Inspect the saved CSV files to judge the clustering quality.
    dataToCsv(save_path + str(clu_id) + '.csv', members)
    # cluster_lists collects one list of documents per cluster; the LDA step reads from it.
    cluster_lists.append(members.values.tolist())

csv文件已生成在：F:\学习用夹\大三下学期\综合课程设计\实验数据\聚类结果\0.csv
csv文件已生成在：F:\学习用夹\大三下学期\综合课程设计\实验数据\聚类结果\1.csv
csv文件已生成在：F:\学习用夹\大三下学期\综合课程设计\实验数据\聚类结果\2.csv




In [30]:
# Word-frequency analysis after clustering; the most frequent words help judge cluster quality
def word_count(cluster_list, top_n=10):
    """Print the most frequent words of one cluster.

    Parameters
    ----------
    cluster_list : iterable of iterable of str
        The documents of one cluster, each given as a sequence of words.
    top_n : int, optional
        Maximum number of words to print (default 10). Fewer lines are
        printed when the cluster has fewer distinct words.

    Each printed line holds the word left-aligned in 10 columns followed by
    its count right-aligned in 5 columns, most frequent word first.
    """
    counts = {}
    for w_list in cluster_list:
        for word in w_list:
            counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing (rather than indexing 0..9) tolerates fewer than top_n words.
    for word, count in ranked[:top_n]:
        print("{0:<10}{1:>5}".format(word, count))

# Print the most frequent words of the first cluster (index 0).
word_count(cluster_lists[0])

产品          762
行业          615
公司          614
科技          590
汽车          583
新闻          539
投资          528
服务          527
生产          496
社会          473


In [13]:
# Run the LDA analysis on the documents of one cluster
lda_topic,lda_topic_prob,Perplexity = LDA(cluster_lists[2],3)    # the second argument is the number of LDA topics
# pd.DataFrame raises ValueError ("arrays must all be same length") when the
# lists stored in lda_topic do not all have the same length.
df_topic = pd.DataFrame(lda_topic)
# One row per topic; the ten columns are the ranks '1'..'10' of the top terms.
df_topic_prob = pd.DataFrame(lda_topic_prob,columns=['1','2','3','4','5','6','7','8','9','10'])
print(df_topic)
print(df_topic_prob)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


ValueError: arrays must all be same length