In [35]:
import warnings
warnings.filterwarnings("ignore")
import os
import time
import random
import jieba
import sklearn
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn import metrics
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import matplotlib.pyplot as plt
from numpy import vstack


In [36]:
# Load the training texts: one sample per line. Each line is stripped,
# split on tabs, and the resulting fields are re-joined with single spaces.
with open('./train/train_data.txt','r') as fr:
    train_data_list = [" ".join(row.strip().split('\t')) for row in fr]

In [37]:
# Load the training labels: one entry per line. `train_label` keeps the
# tab-split fields of every line; `train_label_list` holds the same fields
# re-joined with single spaces, i.e. one string per line.
with open('./train/train_labels.txt','r') as fr:
    train_label = [stripped.split('\t') for stripped in map(str.strip, fr)]
train_label_list = list(map(" ".join, train_label))

In [38]:
def SplitDataSet(data_list,class_list,test_size=0.2,seed=None):
    """
    Shuffle paired samples/labels and split them into training and test sets.

    Parameters:
        data_list  - sequence of samples
        class_list - sequence of labels, paired positionally with data_list
        test_size  - fraction of the pairs placed in the test set; the test
                     set receives int(len(pairs) * test_size) items
        seed       - optional seed for a private random generator so the
                     split is reproducible; when None the module-level
                     `random` state is used
    Returns:
        (train_data_list, test_data_list, train_class_list, test_class_list),
        each a plain list. Either partition may be empty.
    """
    # Keep every sample attached to its label while shuffling.
    paired = list(zip(data_list, class_list))
    if seed is None:
        random.shuffle(paired)
    else:
        random.Random(seed).shuffle(paired)

    # The first `cut` shuffled pairs form the test set, the rest the training set.
    cut = int(len(paired) * test_size)
    test_pairs = paired[:cut]
    train_pairs = paired[cut:]

    # Build the four lists with comprehensions rather than `zip(*pairs)`:
    # unpacking `zip(*[])` into two names raises ValueError, which used to
    # crash the function whenever one partition came out empty.
    train_data = [sample for sample, _ in train_pairs]
    train_labels = [label for _, label in train_pairs]
    test_data = [sample for sample, _ in test_pairs]
    test_labels = [label for _, label in test_pairs]

    return train_data, test_data, train_labels, test_labels


In [39]:
# Hold out 20% of the loaded samples as a test set.
# Note: this rebinds `train_data_list` to the training portion only.
(train_data_list,
 test_data_list,
 train_class_list,
 test_class_list) = SplitDataSet(
    data_list=train_data_list,
    class_list=train_label_list,
    test_size=0.2,
)

In [40]:
def TextClassifier(train_feature_list, test_feature_list, 
                    train_class_list, test_class_list,
                    classifier=None, params=None):
    """
    Tune a classifier on a fixed train/validation split and score it.

    A grid search evaluates every parameter combination by fitting on the
    training rows and scoring macro-averaged F1 on the validation rows. The
    best combination is then refitted on the training rows only and scored
    on the validation rows.

    Parameters:
        train_feature_list - vectorised training features (2-D list or array)
        test_feature_list  - vectorised validation features (2-D list or array)
        train_class_list   - training labels; not modified
        test_class_list    - validation labels; not modified
        classifier         - unfitted scikit-learn estimator to tune;
                             defaults to MultinomialNB()
        params             - parameter grid passed to GridSearchCV; defaults
                             to 10,000 evenly spaced `alpha` values in
                             [0.0001, 1], which is slow on large inputs
    Returns:
        test_accuracy - macro-averaged F1 score on the validation set
                        (the name is historical; it is not plain accuracy)
    Raises:
        ValueError - if either split is empty or features/labels differ in length
    """
    # Function-scope import so this block does not depend on edits to the
    # notebook's import cell.
    from sklearn.base import clone

    X_train = np.asarray(train_feature_list)
    X_val = np.asarray(test_feature_list)
    # Work on separate arrays: the previous implementation called
    # `train_class_list.extend(test_class_list)`, silently appending the
    # validation labels to the caller's training labels on every call.
    Y_train = np.asarray(train_class_list)
    Y_val = np.asarray(test_class_list)

    len_X_train = len(X_train)
    len_X_val = len(X_val)
    if len_X_train == 0 or len_X_val == 0:
        raise ValueError("both the training and the validation set must be non-empty")
    if len_X_train != len(Y_train) or len_X_val != len(Y_val):
        raise ValueError("features and labels must have the same number of rows")

    X = np.vstack([X_train, X_val])
    Y = np.concatenate([Y_train, Y_val])

    # Mark the training-validation split: -1 keeps a row out of every test
    # fold, 0 assigns it to the single validation fold.
    train_i = np.full(len_X_train, -1, dtype=int)
    valid_i = np.zeros(len_X_val, dtype=int)
    ps = PredefinedSplit(np.concatenate((train_i, valid_i)))

    if classifier is None:
        classifier = MultinomialNB()
    if params is None:
        params = {'alpha':np.linspace(0.0001,1,10000)}

    # refit=False: the search is only used to pick parameters. The default
    # refit would train one extra model on train+validation rows that was
    # never used. Training-set scores are not requested because they are
    # never read.
    param_search = GridSearchCV(classifier,
                                params,
                                scoring=metrics.make_scorer(metrics.f1_score, average='macro'),
                                cv=ps,
                                refit=False)
    param_search.fit(X, Y)
    best_params = param_search.best_params_

    # Refit the winning configuration on the training rows only, using a
    # clone so the caller's estimator instance is left untouched.
    clf = clone(classifier).set_params(**best_params)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_val)
    test_accuracy = metrics.f1_score(Y_val, Y_pred, average='macro')

    return test_accuracy


In [41]:

# Bag-of-words counts followed by TF-IDF weighting.
# NOTE(review): CountVectorizer's default token pattern ignores
# single-character tokens - confirm that is intended for this corpus.
vectorizer = CountVectorizer() 

tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
# Fit the vocabulary and IDF weights on the training texts only, then apply
# the same fitted transforms to the held-out texts.
tfidf = tfidf_transformer.fit_transform(vectorizer.fit_transform(train_data_list))
tf_idf_vector=tfidf_transformer.transform(vectorizer.transform(test_data_list))

# Pass the dense ndarrays directly instead of `.toarray().tolist()`: nested
# Python lists of floats cost far more memory and time and give the same
# result. The label lists are passed as copies so that the notebook-level
# lists stay unchanged and this cell can be re-run safely.
m = TextClassifier(tfidf.toarray(), tf_idf_vector.toarray(),
                   list(train_class_list), list(test_class_list))
print(m)

KeyboardInterrupt: 