题目：（源文件下载不了，无法运行）

约2万篇原始新闻文本分属20个类别，按时间先后顺序划分为训练集和测试集，任务是对这些文本进行分类。

## 1.导入库

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer   # TF-idf 文本向量化包
from sklearn.linear_model import RidgeClassifier              # Ridge回归
from sklearn.neighbors import KNeighborsClassifier            # K临近回归
from sklearn.svm import SVC                                   # SVM
from sklearn.ensemble import RandomForestClassifier           # 随机森林
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl

## 2.准备好各种学习模型及参数

In [2]:
# Benchmark helper: grid-search one classifier, then time its fit and predict.
def test_clf(clf):
    """Tune ``clf`` with a 5-fold grid search and report timing and test error.

    The data is read from the module-level ``x_train``, ``y_train``,
    ``x_test`` and ``y_test``.

    The hyper-parameter grid is chosen by probing ``clf`` for known
    attributes.  ``alpha`` is the fallback grid; ``n_neighbors``, ``C``
    (together with ``gamma``) and ``max_depth`` are checked in that order
    and the last attribute found decides the grid.

    Returns a tuple ``(t_train, t_test, error_rate, name)`` where
    ``t_train`` is the total ``fit`` time divided by ``5 * m`` (``m`` being
    the number of grid points), ``t_test`` is the ``predict`` time in
    seconds, ``error_rate`` is ``1 - accuracy`` on the test set, and
    ``name`` is a shortened display name for the classifier.
    """
    print(u'分类器：', clf)

    # Candidate grids, probed in order; a later match overrides an earlier one.
    probed_grids = (
        ('n_neighbors', {'n_neighbors': np.arange(1, 15)}),
        ('C', {'C': np.logspace(1, 3, 3), 'gamma': np.logspace(-3, 0, 3)}),
        ('max_depth', {'max_depth': np.arange(4, 10)}),
    )
    param_grid = {'alpha': np.logspace(-3, 2, 10)}
    for attribute, grid in probed_grids:
        if hasattr(clf, attribute):
            param_grid = grid

    # m = number of parameter combinations the search will evaluate.
    m = 1
    for candidates in param_grid.values():
        m *= candidates.size

    model = GridSearchCV(clf, param_grid=param_grid, cv=5)

    # Time the whole grid search, then normalise by folds * combinations.
    fit_began = time()
    model.fit(x_train, y_train)
    fit_elapsed = time() - fit_began
    t_train = fit_elapsed / (5*m)
    print(u'5折交叉验证的训练时间为：%.3f秒/(5*%d)=%.3f秒' % (fit_elapsed, m, t_train))
    print(u'最优超参数为：', model.best_params_)

    # Time prediction on the held-out test set.
    predict_began = time()
    y_hat = model.predict(x_test)
    t_test = time() - predict_began
    print(u'测试时间：%.3f秒' % t_test)

    acc = metrics.accuracy_score(y_test, y_hat)
    print(u'测试集准确率：%.2f%%' % (100 * acc))

    # Cosmetic only: build a short label for the plot by cutting the class
    # name at "Classifier" and showing SVC as SVM.
    name = str(clf).split('(')[0]
    prefix, found, _ = name.partition('Classifier')
    if found:
        name = prefix
    if name == 'SVC':
        name = 'SVM'
    return t_train, t_test, 1-acc, name

## 3.对数据进行预处理

下载数据并分为训练和测试集

In [None]:
# NOTE: the source data could not be downloaded when this notebook was written.
print(u'开始下载/加载数据...')

t_start = time()

# Nothing is stripped from the posts; the alternative is
# remove = ('headers', 'footers', 'quotes').
remove = ()
# Only four groups are used; set categories = None to classify all of them
# (mind the memory use in that case).
categories = (
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
)

# Load the 'train' and 'test' subsets with identical settings.
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=0,
    remove=remove,
)
data_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=0,
    remove=remove,
)

t_end = time()

print(u'下载/加载数据完成，耗时%.3f秒' % (t_end - t_start))
print(u'数据类型：', type(data_train))
print(u'训练集包含的文本数目：', len(data_train.data))
print(u'测试集包含的文本数目：', len(data_test.data))
print(u'训练集和测试集使用的%d个类别的名称：' % len(categories))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


开始下载/加载数据...


In [None]:
# Replace the hand-written tuple with the category names stored on the
# loaded training set, and show them.
categories = data_train.target_names
print(categories)

# Targets of the two subsets; y_train values are later used as indices
# into `categories`.
y_train, y_test = data_train.target, data_test.target

In [None]:
print(u' -- 前10个文本 -- ')
# Slice rather than index 0..9 so that a training set with fewer than ten
# documents prints what is available instead of raising IndexError.
for i, (text, label) in enumerate(zip(data_train.data[:10], y_train[:10]), start=1):
    print(u'文本%d(属于类别 - %s)：' % (i, categories[label]))
    print(text)

对划分好的训练集和测试集文本进行向量化，这里使用的是 TF-IDF 方法。

In [None]:
# Turn the raw documents into TF-IDF feature vectors.
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)

# Fit the vocabulary on the training documents only, then reuse it for the
# test documents so both matrices share the same feature columns.
x_train = vectorizer.fit_transform(data_train.data)  # sparse matrix
x_test = vectorizer.transform(data_test.data)

print(u'训练集样本个数：%d，特征个数：%d' % x_train.shape)
print(u'停止词:\n')
print(vectorizer.get_stop_words())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out(), dtype=str)
else:
    feature_names = np.asarray(vectorizer.get_feature_names())

## 4.目标实现

In [None]:
# Classifiers to compare; each one is passed to test_clf in turn.
# NOTE(review): the trailing figures look like results of an earlier run —
# presumably total CV fit time (per-fit time), test time and test accuracy;
# confirm before relying on them.
clfs = (MultinomialNB(),                # 0.87(0.017), 0.002, 90.39%
        BernoulliNB(),                  # 1.592(0.032), 0.010, 88.54%
        KNeighborsClassifier(),         # 19.737(0.282), 0.208, 86.03%
        RidgeClassifier(),              # 25.6(0.512), 0.003, 89.73%
        RandomForestClassifier(n_estimators=200),   # 59.319(1.977), 0.248, 77.01%
        SVC()                           # 236.59(5.258), 1.574, 90.10%
        )

使用之前写好的函数进行机器学习

In [None]:
# Run every classifier through test_clf and keep the returned
# (t_train, t_test, error_rate, name) tuples in order.
result = [test_clf(clf) for clf in clfs]

In [None]:
# Each record mixes floats with a string name.  A plain np.array() would
# coerce every field to a string, so keep the records as objects and cast the
# numeric columns back to float explicitly.
result = np.array(result, dtype=object)
time_train, time_test, err, names = result.T
time_train = time_train.astype(float)
time_test = time_test.astype(float)
err = err.astype(float)
# One x position per classifier.
x = np.arange(len(time_train))

## 5.画图

In [None]:
# Use a font with CJK glyphs so the Chinese labels render, and keep the minus
# sign displayable with that font.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 7), facecolor='w')
ax = plt.axes()

# Bar heights must be numeric; coerce in case the values arrive as strings.
err = np.asarray(err, dtype=float)
time_train = np.asarray(time_train, dtype=float)
time_test = np.asarray(time_test, dtype=float)

# Error rate on the left y axis.
b1 = ax.bar(x, err, width=0.25, color='#77E0A0', align='center')

# Timings on a second y axis sharing the same x axis.
ax_t = ax.twinx()
b2 = ax_t.bar(x+0.25, time_train, width=0.25, color='#FFA0A0', align='center')
b3 = ax_t.bar(x+0.5, time_test, width=0.25, color='#FF8080', align='center')
# The three centre-aligned bars sit at x, x+0.25 and x+0.5, so the group
# centre (and the label) belongs at x+0.25.
plt.xticks(x+0.25, names, fontsize=10)
leg = plt.legend([b1[0], b2[0], b3[0]], (u'错误率', u'训练时间', u'测试时间'), loc='upper left', shadow=True)

# for lt in leg.get_texts():
#     lt.set_fontsize(14)

plt.title(u'新闻组文本数据不同分类器间的比较', fontsize=18)
plt.xlabel(u'分类器名称')
plt.grid(True)
# pad must be passed by keyword; newer matplotlib rejects the positional form.
plt.tight_layout(pad=2)
plt.show()