In [None]:
# Mount Google Drive into the Colab filesystem so the data files stored there
# can be read with ordinary file paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report

In [None]:
# Both data files are tab-separated and live in the same Drive folder.
DATA_DIR = '/content/drive/MyDrive/news classification'
train_df = pd.read_csv(DATA_DIR + '/train_set.csv', sep='\t')
test_df = pd.read_csv(DATA_DIR + '/test_a.csv', sep='\t')

In [None]:

# Word-level TF-IDF over 1- to 3-grams, capped at the 10,000 highest-ranked
# features, with sublinear (log-scaled) term frequency.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=10000)

# Fit the vocabulary / IDF weights on train and test text together.
# (The original line was missing the closing `]` of the list passed to
# pd.concat, which made the whole cell a SyntaxError.)
# NOTE(review): fitting on the test text is transductive; only the raw text is
# used here, no labels — confirm this is acceptable for the task.
all_text = pd.concat([train_df['text'], test_df['text']])
tfidf.fit(all_text)

# Project each split into the shared TF-IDF feature space.
train_word_features = tfidf.transform(train_df['text'])
test_word_features = tfidf.transform(test_df['text'])

In [None]:
# Reuse the TF-IDF matrices computed when the vectorizer was fitted instead of
# running tfidf.transform over both corpora a second time (the result is the
# same, and the transform is the expensive step).
X_train = train_word_features
y_train = train_df['label']
X_test = test_word_features

In [None]:
# 5-fold cross-validation of a LinearSVC. Each fold's model also predicts the
# test set; the final test label is the majority vote over the folds.
KF = KFold(n_splits=5, random_state=7, shuffle=True)
clf = LinearSVC()

# One array of test-set predictions per fold.
# The previous version seeded this with an all-zero column and kept it, so every
# test row received a phantom extra vote for label 0 in the bincount below,
# biasing ties (and near-ties) toward class 0. Collect only real predictions.
fold_test_preds = []

for KF_index, (train_index, valid_index) in enumerate(KF.split(X_train)):
    print('第', KF_index+1, '折交叉验证开始...')
    # Split features and labels for this fold. KFold yields positional indices,
    # so the labels are selected with .iloc rather than by index label.
    x_train_, x_valid_ = X_train[train_index], X_train[valid_index]
    y_train_, y_valid_ = y_train.iloc[train_index], y_train.iloc[valid_index]
    # Fit the model on this fold's training part (fit() retrains from scratch).
    clf.fit(x_train_, y_train_)
    # Evaluate on the held-out part of the fold.
    val_pred = clf.predict(x_valid_)
    print("第"+str(KF_index+1)+"次accuracy_score为：",accuracy_score(y_valid_, val_pred))
    print('混淆矩阵输出:\n',confusion_matrix(y_valid_, val_pred))
    print('分类报告:\n', classification_report(y_valid_, val_pred))
    # Keep this fold's predictions for the test set.
    fold_test_preds.append(clf.predict(X_test))

# Shape: (number of test rows, number of folds) — one column per fold.
test_pred = np.column_stack(fold_test_preds)

# Majority vote per test row: the most frequently predicted label wins
# (np.argmax returns the smallest label on a tie).
# NOTE(review): np.bincount requires non-negative integer labels — confirm the
# 'label' column satisfies this.
preds = np.array([np.argmax(np.bincount(row_votes)) for row_votes in test_pred])

第 1 折交叉验证开始...
第1次accuracy_score为： 0.9476
混淆矩阵输出:
 [[7401  102   11   45   68   93   20   15   24   59    3    2    0    0]
 [ 100 7088    2    7   81    6    1  125    5    1    4    0    1    0]
 [   8    4 6213   19    2   11    3    0    4    0    1    4    2    0]
 [  40    7   22 4318   25   33    5    1   13    5    1   12    0    0]
 [  51   51   14   17 2785   48   19    8    3    1    1    6    1    1]
 [  62    4   10   33   76 2214   33    6    5    0    6    0    7    0]
 [  18    5    6    4   29   43 1843    4    7    2    1    1    0    1]
 [  22  161    0    5   10    7    3 1476    6    0    2    1    1    0]
 [  31   12    3   19    1    6    3    3 1476    1    0    8    0    1]
 [  79    1    1    6    2    7    1    0    1 1046    0    0    0    0]
 [   2    5    0    3    3    5    0    1    5    0  958    2    0    0]
 [   8    1    0   16    5    3    0    0   11    0    0  589    0    3]
 [   0    0   18    0    0    8    0    0    0    0    0    0  325    0]


In [None]:
# Load the sample-submission file, overwrite its 'label' column with the
# majority-vote predictions, and save the result next to it (without the
# DataFrame index).
output_dir = '/content/drive/MyDrive/news classification'
submission = pd.read_csv(output_dir + '/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv(output_dir + '/LinearSVC_submission2.csv', index=False)

https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html