In [1]:
%autosave 0

Autosave disabled


In [2]:
import numpy as np
import pandas as pd
import jieba

In [3]:
# Labelled training data (the 'content' and 'subject' columns are used below).
train = pd.read_csv(
    'train_01.csv',
    sep=',',
    encoding='utf-8',
    engine='python',
)

# Stopword list: tab-separated file read into a single 'stopword' column,
# with quoting=3 (csv.QUOTE_NONE) so quote characters are kept as literal text.
stopwords = pd.read_csv(
    'stopwords.txt',
    index_col=False,
    quoting=3,
    sep='\t',
    names=['stopword'],
    encoding='utf-8',
    engine='python',
)['stopword'].values

In [4]:
def text_process(content, subject, sentences):
    """Segment each document with jieba and append ``(text, label)`` pairs.

    Parameters
    ----------
    content : iterable of str
        Raw documents to segment.
    subject : iterable
        Labels aligned positionally with ``content``.
    sentences : list
        Output list, mutated in place: one ``(joined_tokens, label)`` tuple
        is appended per document.

    Notes
    -----
    Tokens that consist only of digits, are blank, are a single character
    long, or appear in the module-level ``stopwords`` are dropped. The
    remaining tokens are joined with single spaces.
    """
    # Hash lookup instead of scanning the stopword array once per token.
    stopword_set = set(stopwords)
    # Pair documents and labels by position so a Series whose index is not
    # 0..n-1 (e.g. after filtering rows) still lines up correctly.
    for line, label in zip(content, subject):
        tokens = [
            tok for tok in jieba.lcut(line)
            if not str(tok).isdigit()
            and tok.strip()
            and len(tok) > 1
            and tok not in stopword_set
        ]
        sentences.append((" ".join(tokens), label))

In [5]:
# Collect (segmented text, label) pairs, then split them into two parallel tuples.
data = list()
text_process(content=train['content'], subject=train['subject'], sentences=data)
x, y = zip(*data)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\qufang\AppData\Local\Temp\jieba.cache
Loading model cost 1.812 seconds.
Prefix dict has been built succesfully.


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out part of the data for evaluation; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words features over the space-joined tokens, capped at 4000 vocabulary entries.
vec = CountVectorizer(analyzer='word',
                     max_features=4000,
                     decode_error='replace')
# Fit the vocabulary on the training split only and vectorize it.
vec_train = vec.fit_transform(x_train)

In [10]:
from sklearn.naive_bayes import MultinomialNB

In [11]:
# Multinomial Naive Bayes baseline.
classifier_nb = MultinomialNB()
classifier_nb.fit(vec_train, y_train)
# Vectorize the test split once and reuse it for prediction and scoring.
vec_test = vec.transform(x_test)
y_pred_nb = classifier_nb.predict(vec_test)
print(classifier_nb.score(vec_test, y_test))

0.6224307417336908


In [21]:
import pickle

In [22]:
# Persist the fitted vocabulary mapping (not the vectorizer object itself)
# and the Naive Bayes model. `with` closes each file even if pickling raises.
with open('./model_saved/vec_CountVectorizer.pkl', 'wb') as f:
    pickle.dump(vec.vocabulary_, f)
with open('./model_saved/NB_classifier.pkl', 'wb') as f:
    pickle.dump(classifier_nb, f)

In [12]:
from sklearn.svm import SVC

In [13]:
# Linear-kernel support vector classifier.
classifier_svm = SVC(kernel='linear')
classifier_svm.fit(vec_train, y_train)
# Vectorize the test split once and reuse it for prediction and scoring.
vec_test = vec.transform(x_test)
y_pred_svm = classifier_svm.predict(vec_test)
print(classifier_svm.score(vec_test, y_test))

0.6411974977658623


In [27]:
# Persist the fitted SVM; `with` closes the file even if pickling raises.
with open('./model_saved/SVM_classifier.pkl', 'wb') as f:
    pickle.dump(classifier_svm, f)

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Logistic regression with library-default settings.
classifier_lr = LogisticRegression()
classifier_lr.fit(vec_train, y_train)
# Vectorize the test split once and reuse it for prediction and scoring.
vec_test = vec.transform(x_test)
y_pred_lr = classifier_lr.predict(vec_test)
print(classifier_lr.score(vec_test, y_test))



0.660857908847185


In [31]:
# Persist the fitted logistic regression; `with` closes the file even if pickling raises.
with open('./model_saved/LR_classifier.pkl', 'wb') as f:
    pickle.dump(classifier_lr, f)

In [16]:
# Per-model test predictions alongside the true labels, keyed by short model name.
a = dict(
    nb=y_pred_nb,
    svm=y_pred_svm,
    lr=y_pred_lr,
    true=y_test,
)

In [17]:
# One row per test sample; columns 'nb', 'svm', 'lr' (predictions) and 'true' (labels).
result = pd.DataFrame(a)

In [21]:
# Write the prediction table to disk without the row index.
# NOTE(review): 'nv' in the filename looks like a typo for 'nb' — confirm before renaming.
result.to_csv('result_nv_svm_lr.csv', encoding='utf-8', index=False)

In [18]:
# Number of rows in the prediction table.
len(result)

2238

In [25]:
# First Naive Bayes prediction (row label 0, column 'nb').
result.loc[0, 'nb']

'动力'

In [49]:
from operator import itemgetter

In [65]:
# Majority vote across the three classifiers for every test row.
result_vote = []
for votes in zip(result['nb'], result['svm'], result['lr']):
    tally = {}
    for label in votes:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first label that reaches the highest count, so a
    # three-way disagreement resolves to the 'nb' prediction (dicts keep
    # insertion order), which is the same tie-break a stable descending sort gives.
    result_vote.append(max(tally.items(), key=itemgetter(1))[0])


In [66]:
# Preview the first ten voted labels.
result_vote[0:10]

['动力', '操控', '操控', '动力', '价格', '油耗', '动力', '动力', '油耗', '价格']

In [69]:
# True label of the first test sample, for comparison with the voted label.
y_test[0]

'动力'

In [70]:
# Number of test rows where the voted label equals the true label.
s = sum(1 for voted, actual in zip(result_vote, y_test) if voted == actual)

In [71]:
# Fraction of test rows where the voted label matched the true label.
s / len(result)

0.6621983914209115

In [72]:
# Test-set scores of the three models; the test split is vectorized once
# instead of once per model.
vec_test = vec.transform(x_test)
s1, s2, s3 = (
    clf.score(vec_test, y_test)
    for clf in (classifier_nb, classifier_svm, classifier_lr)
)

In [73]:
# Scores in the order: Naive Bayes, SVM, logistic regression.
print(s1, s2, s3)

0.6224307417336908 0.6411974977658623 0.660857908847185


In [75]:
# Scale the three scores so that they sum to 1.
total_score = s1 + s2 + s3
s01, s02, s03 = s1 / total_score, s2 / total_score, s3 / total_score

In [76]:
# Normalized scores in the order: Naive Bayes, SVM, logistic regression.
print(s01, s02, s03)

0.3234269793359647 0.33317854655212437 0.3433944741119108


In [None]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
import numpy as np
# For classification problems, ClassifierMixin can be used instead.


class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacking ensemble: a meta-model trained on out-of-fold base predictions.

    Parameters
    ----------
    base_models : sequence of estimators
        First-level models. Each one is cloned and refitted once per fold.
    meta_model : estimator
        Second-level model, fitted on the out-of-fold predictions.
    n_folds : int, default 5
        Number of K-fold splits used to produce out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit clones of the base models per fold, then fit the meta-model.

        ``X`` must expose ``.shape[0]`` and accept integer-array row
        indexing (``X[indices]``). ``y`` is converted to a NumPy array so
        that lists, tuples and pandas Series are indexed positionally.

        Returns
        -------
        self
        """
        y = np.asarray(y)
        # One list of fitted fold-clones per base model.
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Train every base model with cross-validation and collect its
        # predictions on the held-out part of each fold; these form the
        # training set of the meta-model (one column per base model).
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                # Clone first, then store: the clone must exist before it
                # can be recorded, and each fold needs its own instance.
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Train the meta-model on the out-of-fold predictions.
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on averaged base-model predictions.

        For each base model, the predictions of its fold-clones stored by
        ``fit`` are averaged into one meta-feature column; the fitted
        meta-model then predicts from those columns.
        """
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_])
        return self.meta_model_.predict(meta_features)