In [2]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
from gensim import models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
data=pd.read_json("News_Category_Dataset_v2.json", lines=True)

In [4]:
data['text']=data.headline+" "+data.short_description

In [5]:
stemmer = SnowballStemmer('english')
words = stopwords.words('english')
data['cleaned_text'] = data['text'].apply(lambda x:" ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]"," ",x).split() if i not in words]))

In [6]:
model = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
#文本向量
def compute_doc_vec_single(clean_text):
    vec = np.zeros((model.vector_size,), dtype=np.float32)
    n = 0
    tokenized_clean_text=nltk.word_tokenize(clean_text)
    for word in tokenized_clean_text:
        if word in model:
            vec += model[word]
            n += 1
    if(n==0):
        return (model["Hello"]*0)
    else:
        return (vec/n)


def compute_doc_vec(clean_text):
    return np.row_stack([compute_doc_vec_single(x) for x in clean_text])
x = compute_doc_vec(data['cleaned_text'])

In [42]:
y_encoder=LabelEncoder()
y=y_encoder.fit_transform(data['category'])
print(y)

train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y)
# print(train_idx)
# print(test_idx)
train_x = x[train_idx, :]
train_y = y[train_idx]
test_x = x[test_idx, :]
test_y = y[test_idx] 
from sklearn.linear_model import LogisticRegression
# 常用参数说明
# penalty：惩罚项，str类型，可选参数为l1和l2，默认为l2
# dual：对偶或原始方法，bool类型，默认为False。
# tol：停止求解的标准，float类型，默认为1e-4。
# c：正则化系数λ的倒数，float类型，默认为1.0。
# fit_intercept：是否存在截距或偏差，bool类型，默认为True。
# intercept_scaling：仅在正则化项为”liblinear”，且fit_intercept设置为True时有用。float类型，默认为1。
# class_weight：用于标示分类模型中各种类型的权重
# random_state：随机数种子，int类型，可选参数，默认为无，仅在正则化优化算法为sag,liblinear时有用。
# solver：优化算法选择参数，只有五个可选参数，即newton-cg,lbfgs,liblinear,sag,saga。默认为liblinear。
# max_iter：算法收敛最大迭代次数，int类型，默认为10。
# multi_class：分类方式选择参数，str类型，可选参数为ovr和multinomial，默认为ovr。
# verbose：日志冗长度，int类型。默认为0。
# warm_start：热启动参数，bool类型。默认为False。
# n_jobs：并行数。int类型，默认为1
model1 = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=3000, n_jobs=-1)
model1.fit(train_x, train_y)

[ 6 10 10 ... 28 28 28]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=3000,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support 

# 在测试集上计算模型的表现
test_y_pred = model1.predict(test_x)

# 输出混淆矩阵
pd.DataFrame(confusion_matrix(test_y, test_y_pred), columns=y_encoder.classes_, index=y_encoder.classes_)

Unnamed: 0,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,CULTURE & ARTS,DIVORCE,EDUCATION,...,TASTE,TECH,THE WORLDPOST,TRAVEL,WEDDINGS,WEIRD NEWS,WELLNESS,WOMEN,WORLD NEWS,WORLDPOST
ARTS,55,10,5,5,0,2,0,8,1,2,...,0,2,0,25,1,2,31,3,1,1
ARTS & CULTURE,21,22,8,0,0,9,1,3,0,0,...,0,2,2,20,0,3,14,15,0,0
BLACK VOICES,2,1,221,12,6,5,50,0,6,5,...,2,1,0,12,4,2,41,7,0,2
BUSINESS,0,3,17,435,0,6,11,0,4,3,...,4,27,5,42,0,1,145,16,3,6
COLLEGE,0,1,7,11,33,2,9,0,0,5,...,0,0,2,7,1,1,40,1,0,1
COMEDY,2,4,5,20,0,194,2,0,3,0,...,3,13,4,20,3,24,54,5,0,2
CRIME,0,0,20,6,2,0,355,0,3,1,...,0,3,9,10,0,26,19,0,1,0
CULTURE & ARTS,22,4,4,3,0,1,0,39,0,0,...,0,4,0,20,0,1,13,1,0,1
DIVORCE,0,1,0,10,0,9,2,0,193,1,...,0,9,1,10,17,2,136,12,1,1
EDUCATION,0,0,2,7,4,0,1,0,0,41,...,0,0,0,3,0,0,33,2,0,0


In [10]:
# 各类型新闻指标
def eval_model(y_true, y_pred, labels):

    # 每个新闻类型的Precision, Recall, f1, support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
    # 所有类总和的平均Precision, Recall, f1, support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({u'Label': labels,u'Precision': p,u'Recall': r,u'F1': f1,u'Support': s})
    res2 = pd.DataFrame({u'Label': [u'所有类总体'],u'Precision': [tot_p],u'Recall': [tot_r],u'F1': [tot_f1],u'Support': [tot_s]})
    res2.index = [1000]
    res = pd.concat([res1, res2])
    return res[[u'Label', u'Precision', u'Recall', u'F1', u'Support']] 

eval_model(test_y, test_y_pred, y_encoder.classes_) #输出训练结果 

Unnamed: 0,Label,Precision,Recall,F1,Support
0,ARTS,0.323529,0.182119,0.233051,302
1,ARTS & CULTURE,0.271605,0.08209,0.126074,268
2,BLACK VOICES,0.444668,0.243929,0.315039,906
3,BUSINESS,0.399816,0.36647,0.382418,1187
4,COLLEGE,0.383721,0.144105,0.209524,229
5,COMEDY,0.346429,0.18744,0.24326,1035
6,CRIME,0.525926,0.521292,0.523599,681
7,CULTURE & ARTS,0.534247,0.18932,0.27957,206
8,DIVORCE,0.471883,0.281752,0.352834,685
9,EDUCATION,0.369369,0.20398,0.262821,201


In [27]:
import os
output_dir = u'output'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [30]:
# 保存模型到文件

import dill
import pickle
model1_file = os.path.join(output_dir, u'model.pkl')
with open(model_file, 'wb') as outfile:
    pickle.dump({'y_encoder': y_encoder, 'lr': model1}, outfile)

In [31]:
#对某篇新闻预测
class Predictor(object):
    
    def __init__(self, w2v_model_file, lr_model_file):
        self.model = models.KeyedVectors.load_word2vec_format(w2v_model_file, binary=True)
        with open(lr_model_file, 'rb') as infile:
            self.model1 = pickle.load(infile)
    
    def predict(self, articles):
        x = self._compute_doc_vec(articles)
        y = self.model1['lr'].predict(x)
        y_label = self.model1['y_encoder'].inverse_transform(y)
        return y_label
    
    def _compute_doc_vec(self, clean_text):
        return np.row_stack([compute_doc_vec_single(x) for x in clean_text])

    def _compute_doc_vec_single(self, clean_text):
        vec = np.zeros((model.vector_size,), dtype=np.float32)
        n = 0
        tokenized_clean_text=nltk.word_tokenize(clean_text)
        for word in tokenized_clean_text:
            if word in model:
                vec += model[word]
                n += 1
        if(n==0):
            return (model["Hello"]*0)
        else:
            return (vec/n)
# 加载pretrained google news vector
predictor = Predictor('GoogleNews-vectors-negative300.bin', model_file)

# 单篇新闻分类
new_y_pred = predictor.predict(data['cleaned_text'][:1])

# 对比预测
pd.DataFrame({u'预测新闻类别': new_y_pred, u'实际新闻类别': data[u'category'][:1]})

Unnamed: 0,预测新闻类别,实际新闻类别
0,ENTERTAINMENT,CRIME


In [13]:
#交叉检验cross validation 
from sklearn.model_selection import cross_val_score
# clf=LogisticRegression()
score=cross_val_score(LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=3000), x,y, cv=10)
print(np.mean(score))

0.5136087435038728


In [82]:
import keras as K
init = K.initializers.glorot_uniform(seed=1)
simple_adam = K.optimizers.Adam()
#default:keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False)
model2 = K.models.Sequential()
model2.add(K.layers.Dense(units=235, input_dim=300, kernel_initializer=init, activation='relu'))
# model2.add(K.layers.Dense(units=235, kernel_initializer=init, activation='relu'))
model2.add(K.layers.Dense(units=41, kernel_initializer=init, activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer=simple_adam, metrics=['accuracy'])

In [83]:
#训练模型
from sklearn.preprocessing import LabelBinarizer
b_size = 16
y1_encoder=LabelBinarizer()
y1=y1_encoder.fit_transform(data['category'])
# train1_idx, test1_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y)
# # print(train_idx)
# # print(test_idx)
# train1_x=x[train_idx, :]
train1_y = y1[train_idx]
# test1_x = x[test_idx, :]
test1_y = y1[test_idx] 
max_epochs = 80
print("Starting training ")
h = model2.fit(train_x, train1_y, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")


Starting training 
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Training finished 



In [46]:
print(range(len(y)))

range(0, 200853)


In [84]:
eval = model2.evaluate(test_x, test1_y, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" % (eval[0], eval[1]*100 ) )


Evaluation on test data: loss = 2.131848 accuracy = 50.00% 

