In [27]:
import os
import time
import numpy as np
import pandas as pd
import jieba
from jieba import analyse
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer #词集转换成向量
from sklearn.naive_bayes import MultinomialNB #朴素贝叶斯多分类
from sklearn.metrics import classification_report
import gensim #自然语言处理库
from gensim import corpora,models,similarities


def main():
    data = read_file()
    #data_1= content.split(data)
    contents_clean,all_words = separate_words(data)
    df_train = pd.DataFrame({"contents_clean":contents_clean,"label":data["category"]})
    df_train = shuffle(df_train)
    #切分数据集
    x_train,x_test,y_train,y_test = train_test_split(df_train["contents_clean"].values,df_train["label"].values,test_size=0.5)
    #训练
    words_train = format_transform(x_train) 
    vectorizer = TfidfVectorizer(analyzer='word', max_features=4000,ngram_range=(1, 3),lowercase = False)
    vectorizer.fit(words_train)#将清洗过的文章分词转化成朴素贝叶斯的矩阵形式[[],[],[],[]...]
    classifier = MultinomialNB()
    classifier.fit(vectorizer.transform(words_train), y_train)


    #测试
    words_test = format_transform(x_test)
    score = classifier.score(vectorizer.transform(words_test), y_test)
    print("----------------------------------分类结果报告-----------------------------------------")
    print("分类准确率:" + str(score))
    # 预测结果
    y_predict=classifier.predict(vectorizer.transform(words_test))
    print(classification_report(y_test,y_predict))
    #用保存的all_word统计一下词频
    words_count = df_all_words.groupby(by=["all_words"])["all_words"].agg({"count":np.size}) #groupby就是按词分类
    words_count = words_count.reset_index().sort_values(by=["count"],ascending=False) #降序
    print(words_count.head())





#读取数据源
def read_file():
    data = pd.read_excel('/Users/apple/Desktop/toutiao.xlsx',names=['category','title','digest','content'])
    data=data.dropna()    #直接丢弃包括NAN的整条数据
    df = data.groupby('category').count()#展示数据规模
    print(df)
    return data

#jieba分词并去停用词
def separate_words(data):
    content=data.content.values.tolist()#将content文本内容转换为list格式
    #读入停用词表
    stopwords = pd.read_csv("/Users/apple/Desktop/stopwords.txt",index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8') #list
    stopwords = stopwords.stopword.values.tolist()
    print("正在分词,请耐心等候......")
    contents_clean = []   #存储分完词之后结果
    all_words = []
 
    for line in content:
        current_segment = jieba.lcut(line) #jieba分词
        current_segment = [x.strip() for x in current_segment if x.strip()!=''] #去掉分词后出现的大量空字符串
        if len(current_segment) > 1 and current_segment != "\r\n":
            line_clean = []
            for word in current_segment:
                if word in stopwords:
                    continue
                line_clean.append(word)
                all_words.append(str(word))
            contents_clean.append(line_clean)        
    print('------------分词完成-----------')
    return contents_clean, all_words


#开始训练
def format_transform(x): #x是数据集（训练集或者测试集）
    words =[]
    for line_index in range(len(x)):
        try:
            words.append(" ".join(x[line_index]))
        except:
            print("数据格式有问题")
    return words








if __name__=='__main__':
    
    main()

          title  digest  content
category                        
100        5965    5965     5965
101        9782    9782     9782
102       33672   33672    33672
103       25350   25350    25350
104       22127   22127    22127
106       10337   10337    10337
107       27471   27471    27471
108       17942   17942    17942
109       24315   24315    24315
110       15865   15865    15865
112       14773   14773    14773
113       15803   15803    15803
115       16081   16081    16081
116       20513   20513    20513
正在分词,请耐心等候......
------------分词完成-----------
----------------------------------分类结果报告-----------------------------------------
分类准确率:0.823097278419668
              precision    recall  f1-score   support

         100       0.80      0.67      0.73      2955
         101       0.75      0.50      0.60      4917
         102       0.76      0.90      0.83     16769
         103       0.93      0.89      0.91     12793
         104       0.81      0.77      0.79     10

NameError: name 'df_all_words' is not defined