### Stacking模型

#### 定义基础模型

In [5]:
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


def get_models():
    """Build the first-layer (base) classifiers of the stacking ensemble.

    Returns:
        list: Two unfitted estimators, in this order:
            ``MultinomialNB(alpha=0.04)`` and ``LogisticRegression(C=2)``.
    """
    # Further candidates (e.g. SVC(probability=True), KNeighborsClassifier,
    # RandomForestClassifier, GradientBoostingClassifier) can be appended to
    # this list; the stacking code calls predict_proba on every base model.
    return [MultinomialNB(alpha=0.04), LogisticRegression(C=2)]

In [6]:
# Instantiate the first-layer classifiers; the bare name on the last line
# makes the notebook display them.
base_learners = get_models()
base_learners

[MultinomialNB(alpha=0.04, class_prior=None, fit_prior=True),
 LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False)]

#### 定义权重分配模型（第二层架构）

In [7]:
# Second-layer (meta) learner. probability=True is required because the
# stacking predict() also calls predict_proba on this model.
# Alternative that was left disabled: LogisticRegression(C=2)
meta_learner = SVC(C=50, probability=True)

#### 加载数据并切分为训练集和测试集，供基础模型和第二层模型使用

In [8]:
import pandas as pd
# Load the news dataset; judging by the file name it is already
# word-segmented with stop words removed (not verified here).
df_news = pd.read_excel('../../data/分词并去停用词的新闻数据.xlsx')

In [9]:
# Features: article text. Targets: category name encoded as an integer id.
X = df_news['content']

# Category name -> integer class id.
dic = {'体育':0, '军事':1, '国际':2, '娱乐':3, '时尚':4, '汽车':5, '科技':6, '财经':7}
y = df_news['label'].apply(lambda category: dic[category])

# The vectorizer loaded further down was presumably produced like this:
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vect = TfidfVectorizer(max_features=30000, lowercase=False).fit(X)
#   X_tfidf = vect.transform(X)

# Hold out 20% of the documents as a test set. Every part gets a fresh
# 0..n-1 index because the stacking code indexes y with positional fold indices.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=0)
)

# Load the fitted TF-IDF model and vectorize both splits.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a vect.pkl that comes from a trusted source.
with open('models_save/vect.pkl', 'rb') as f:
    vect = pickle.load(f)
X_train_tfidf, X_test_tfidf = vect.transform(X_train), vect.transform(X_test)

In [336]:
# Sanity check: (number of training documents, number of TF-IDF features).
X_train_tfidf.shape

(7639, 30000)

In [337]:
# Sanity check: (number of test documents, number of TF-IDF features).
X_test_tfidf.shape

(1910, 30000)

In [1]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
import numpy as np
#对于分类问题可以使用 ClassifierMixin


class StackingModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Two-layer stacking classifier built on out-of-fold class probabilities.

    For every base model, ``n_folds`` clones are trained with K-fold cross
    validation. Their out-of-fold ``predict_proba`` outputs, concatenated over
    all base models, form the training set of ``meta_model``.

    Parameters:
        base_models: list of unfitted classifiers that implement
            ``fit`` and ``predict_proba``.
        meta_model: classifier implementing ``fit``, ``predict`` and
            ``predict_proba``. It is fitted in place (not cloned).
        n_folds: number of cross-validation folds (default 5).

    Attributes set by ``fit``:
        classes_: sorted array of the distinct labels seen in ``y``.
        base_models_: one list per base model holding its fitted fold clones.

    NOTE: ``predict`` returns a ``(labels, probabilities)`` tuple rather than
    a plain label array, so helpers that expect the usual scikit-learn
    ``predict`` contract (presumably including the inherited ``score``) will
    not work on it as-is.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def _class_columns(self, block, fitted_model):
        """Return the meta-feature columns filled by ``fitted_model`` in block ``block``."""
        # A fold model only knows the classes present in its training fold, so
        # its probability columns are placed by label instead of by position.
        first_column = block * len(self.classes_)
        return first_column + np.searchsorted(self.classes_, fitted_model.classes_)

    def _build_meta_features(self, X):
        """Average each base model's fold clones into second-layer features for ``X``."""
        n_classes = len(self.classes_)
        meta_features = np.zeros((X.shape[0], len(self.base_models_) * n_classes))
        for i, fold_models in enumerate(self.base_models_):
            for fitted_model in fold_models:
                columns = self._class_columns(i, fitted_model)
                meta_features[:, columns] += fitted_model.predict_proba(X)
            # Divide by the number of clones actually trained, which stays
            # correct even if n_folds is modified after fitting.
            meta_features[:, i * n_classes:(i + 1) * n_classes] /= len(fold_models)
        return meta_features

    def fit(self, X, y):
        """Train the fold clones of every base model, then the meta model.

        Parameters:
            X: feature matrix supporting ``X[index_array]`` row selection
                (e.g. a numpy array or scipy CSR matrix).
            y: array-like of labels, one per row of ``X``.

        Returns:
            self
        """
        # Positional indexing below must not depend on a pandas index.
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        self.base_models_ = [[] for _ in self.base_models]

        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # One block of n_classes probability columns per base model.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models) * n_classes))

        for i, model in enumerate(self.base_models):
            print("正在训练第{}个model".format(i+1))
            for j, (train_index, holdout_index) in enumerate(kfold.split(X, y), start=1):
                instance = clone(model)
                instance.fit(X[train_index], y[train_index])
                self.base_models_[i].append(instance)
                columns = self._class_columns(i, instance)
                out_of_fold_predictions[np.ix_(holdout_index, columns)] = (
                    instance.predict_proba(X[holdout_index])
                )
                print("Fold {} done".format(j))
        print('fit meta_model!')
        # The out-of-fold probabilities are the training set of the meta model.
        self.meta_model.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict labels and class probabilities for ``X``.

        Returns:
            tuple: ``(labels, probabilities)`` as produced by the meta model's
            ``predict`` and ``predict_proba`` on the averaged base-model
            probabilities.
        """
        meta_features = self._build_meta_features(X)
        return self.meta_model.predict(meta_features), self.meta_model.predict_proba(meta_features)

In [10]:
# Assemble the stacking ensemble; n_folds keeps its default of 5.
stack = StackingModels(base_learners, meta_learner)

In [11]:
# Train the fold clones of every base model and then the meta learner;
# progress is printed per model and per fold.
stack.fit(X_train_tfidf, y_train)

正在训练第1个model
Fold 1 done
Fold 2 done
Fold 3 done
Fold 4 done
Fold 5 done
正在训练第2个model




Fold 1 done
Fold 2 done
Fold 3 done
Fold 4 done
Fold 5 done
fit meta_model!




StackingModels(base_models=[MultinomialNB(alpha=0.04, class_prior=None, fit_prior=True), LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)],
        meta_model=SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
        n_folds=5)

In [12]:
# predict() returns (labels, probabilities); only the labels are used here.
y_pred, _ = stack.predict(X_test_tfidf)

In [13]:
# Test-set accuracy: share of documents whose predicted id equals the true id.
(y_pred == y_test).mean()

0.8507853403141361

In [14]:
from sklearn.metrics import classification_report

# Inverse of `dic`: integer class id -> category name.
labels = {v:k for k,v in dic.items()}

# Decode into new variables so that y_test / y_pred keep their integer ids.
# Rebinding them would make this cell fail with KeyError on a second run and
# would break the accuracy cell if it were re-executed afterwards.
y_test_names = y_test.apply(lambda class_id: labels[class_id])
y_pred_names = [labels[class_id] for class_id in y_pred.tolist()]
print(classification_report(y_test_names, y_pred_names))

              precision    recall  f1-score   support

          体育       0.99      1.00      0.99       209
          军事       0.74      0.85      0.79       199
          国际       0.82      0.68      0.74       218
          娱乐       0.88      0.89      0.88       189
          时尚       0.93      0.93      0.93       314
          汽车       0.96      0.96      0.96       252
          科技       0.71      0.82      0.76       260
          财经       0.80      0.68      0.73       269

   micro avg       0.85      0.85      0.85      1910
   macro avg       0.85      0.85      0.85      1910
weighted avg       0.85      0.85      0.85      1910



In [355]:
import pickle

# Persist the fitted ensemble. The context manager closes the file even if
# pickling fails; a bare open() left the handle open.
with open("./models_save/Stacking.pkl", 'wb') as model_file:
    pickle.dump(stack, model_file)