In [11]:
import warnings
warnings.filterwarnings('ignore')
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import StratifiedKFold, KFold

In [9]:
# 以python自带的鸢尾花数据集为例
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
#切分一部分数据作为测试集
x_train,x_test,y_train,y_text = train_test_split(X, y, test_size=0.2, random_state=914)

In [18]:
x_train.shape

(120, 2)

In [19]:
y_text.shape

(30,)

In [173]:
class BasicModel(object):
    """Parent class of basic models"""
    def train(self, x_train, y_train, x_val, y_val):
        """return a trained model and eval metric o validation data"""
        pass
    
    def predict(self, model, x_test):
        """return the predicted result"""
        pass
    
    def get_oof(self, x_train, y_train, x_test, n_folds = 5):
        """K-fold stacking"""
        num_train, num_test = x_train.shape[0], x_test.shape[0]
        oof_train = np.zeros((num_train,3)) 
        oof_test = []
        oof_test_all_fold = np.zeros((num_test, n_folds))
        aucs = []
        KF = KFold(n_splits = n_folds, random_state=2017)
        for i, (train_index, val_index) in enumerate(KF.split(x_train)):
            print('{0} fold, train {1}, val {2}'.format(i, len(train_index), len(val_index)))
            x_tra, y_tra = x_train[train_index], y_train[train_index]
            x_val, y_val = x_train[val_index], y_train[val_index]
            model, auc = self.train(x_tra, y_tra, x_val, y_val)
            oof_train[val_index] = self.predict(model, x_val)
            oof_test.append(self.predict(model, x_test))
        oof_test = np.mean(np.hstack(oof_test), axis=1)
#         print('all aucs {0}, average {1}'.format(aucs, np.mean(aucs)))
        return oof_train, oof_test

In [174]:
import lightgbm as lgb
class LGBClassifier(BasicModel):
    def __init__(self):
        self.num_boost_round = 2000
        self.early_stopping_rounds = 15
        self.params = {
            'task': 'train',
            'boosting_type': 'gbdt',
#             'objective': 'binary',
            'objective': 'multiclass',
            'metric': 'multi_error',
            'num_leaves': 80,
            'learning_rate': 0.05,
            # 'scale_pos_weight': 1.5,
            'feature_fraction': 0.5,
            'bagging_fraction': 1,
            'bagging_freq': 5,
            'max_bin': 300,
            'is_unbalance': True,
            'lambda_l2': 5.0,
            'verbose' : -1,
            'num_class':3
            }
        
    def train(self, x_train, y_train, x_val, y_val):
        print('train with lgb model')
        lgbtrain = lgb.Dataset(x_train, y_train)
        lgbval = lgb.Dataset(x_val, y_val)
        model = lgb.train(self.params, 
                          lgbtrain,
                          valid_sets = lgbval,
                          verbose_eval = self.num_boost_round,
                          num_boost_round = self.num_boost_round,
                          early_stopping_rounds = self.early_stopping_rounds)
        return model, model.best_score['valid_0']
    
    def predict(self, model, x_test):
        print('test with lgb model')
        a=model.predict(x_test, num_iteration=model.best_iteration)
#         print(a.shape)
        return a

In [175]:
# get output of first layer models and construct as input for the second layer          
lgb_classifier = LGBClassifier()
lgb_oof_train, lgb_oof_test = lgb_classifier.get_oof(x_train, y_train, x_test)
print(lgb_oof_train.shape, lgb_oof_test.shape) 

0 fold, train 96, val 24
train with lgb model
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[8]	valid_0's multi_error: 0.416667
test with lgb model
test with lgb model
1 fold, train 96, val 24
train with lgb model
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[5]	valid_0's multi_error: 0.458333
test with lgb model
test with lgb model
2 fold, train 96, val 24
train with lgb model
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[4]	valid_0's multi_error: 0.583333
test with lgb model
test with lgb model
3 fold, train 96, val 24
train with lgb model
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[3]	valid_0's multi_error: 0.541667
test with lgb model
test with lgb model
4 fold, train 96, val 24
train with lgb model
Training until validation scores don't improve for 15 rounds
Early stopping, best 

In [177]:
# create two models for first-layer stacking: xgb and lgb
import xgboost as xgb
class XGBClassifier(BasicModel):
    def __init__(self):
        """set parameters"""
        self.num_rounds=1000
        self.early_stopping_rounds = 15
        self.params = {
#             'objective': 'binary:logistic',
            'objective': 'multi:softprob',
            'eta': 0.1,
            'max_depth': 8,
#             'eval_metric': 'auc',
            'metric': 'multi_error',
            'seed': 0,
            'silent' : 1,
            'num_class':3
            
         }
        
    def train(self, x_train, y_train, x_val, y_val):
        print('train with xgb model')
        xgbtrain = xgb.DMatrix(x_train, y_train)
        xgbval = xgb.DMatrix(x_val, y_val)
        watchlist = [(xgbtrain,'train'), (xgbval, 'val')]
        model = xgb.train(self.params, 
                          xgbtrain, 
                          self.num_rounds,
                          watchlist,
                          early_stopping_rounds = self.early_stopping_rounds)
        return model, float(model.eval(xgbval).split()[1].split(':')[1])

    def predict(self, model, x_test):
        print('test with xgb model')
        xgbtest = xgb.DMatrix(x_test)
        return model.predict(xgbtest)

In [178]:
xgb_classifier = XGBClassifier()
xgb_oof_train, xgb_oof_test = xgb_classifier.get_oof(x_train, y_train, x_test)
print(xgb_oof_train.shape, xgb_oof_test.shape)

0 fold, train 96, val 24
train with xgb model
Parameters: { metric, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-merror:0.04167	val-merror:0.08333
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 15 rounds.
[1]	train-merror:0.04167	val-merror:0.08333
[2]	train-merror:0.04167	val-merror:0.08333
[3]	train-merror:0.04167	val-merror:0.08333
[4]	train-merror:0.04167	val-merror:0.08333
[5]	train-merror:0.04167	val-merror:0.08333
[6]	train-merror:0.04167	val-merror:0.08333
[7]	train-merror:0.04167	val-merror:0.08333
[8]	train-merror:0.04167	val-merror:0.08333
[9]	train-merror:0.04167	val-merror:0.08333
[10]	train-merror:0.04167	val-merror:0.08333
[11]	train-merror:0.04167	val-merr

In [207]:
input_train = [xgb_oof_train, lgb_oof_train] 
input_test = np.array([xgb_oof_test, lgb_oof_test]).T
input_test.shape

(30, 2)

In [209]:
stacked_train = np.concatenate(input_train, axis=1)
stacked_test = input_test

print(stacked_train.shape, stacked_test.shape)

(120, 6) (30, 2)


In [210]:
# use LR as the model of the second layer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [214]:
from sklearn import svm

In [None]:
# split for validation
n = int(stacked_train.shape[0] * 0.8)
x_tra, y_tra = stacked_train[:n], y_train[:n]
x_val, y_val = stacked_train[n:], y_train[n:]

In [220]:
# 输出标签
clf = svm.SVC(decision_function_shape='ovo',probability=False)
clf.fit(x_tra,y_tra)
y_pred=clf.predict(x_val)

In [217]:
# 输出概率
clf = svm.SVC(decision_function_shape='ovo',probability=True)
clf.fit(x_tra,y_tra)
y_pred=clf.predict_proba(x_val)

array([[0.94063807, 0.03245605, 0.02690587],
       [0.94063807, 0.03245605, 0.02690587],
       [0.94063807, 0.03245605, 0.02690587],
       [0.01554226, 0.53502404, 0.44943371],
       [0.01126673, 0.91413412, 0.07459915],
       [0.01131326, 0.91407162, 0.07461512],
       [0.01191187, 0.09403685, 0.89405128],
       [0.01170821, 0.09416351, 0.89412828],
       [0.93953054, 0.03317928, 0.02729018],
       [0.93906664, 0.03347105, 0.02746231],
       [0.01124835, 0.91409274, 0.07465891],
       [0.93953054, 0.03317928, 0.02729018],
       [0.01170821, 0.09416351, 0.89412828],
       [0.01191187, 0.09403685, 0.89405128],
       [0.01084207, 0.9026358 , 0.08652213],
       [0.01176141, 0.094129  , 0.89410959],
       [0.93953054, 0.03317928, 0.02729018],
       [0.01176141, 0.094129  , 0.89410959],
       [0.93906664, 0.03347105, 0.02746231],
       [0.94063807, 0.03245605, 0.02690587],
       [0.94063807, 0.03245605, 0.02690587],
       [0.01091375, 0.90341611, 0.08567014],
       [0.

In [223]:
y_pred,y_val

(array([0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 0, 2, 2, 1, 2, 0, 2, 0, 0, 0, 1,
        1, 2]),
 array([0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 0, 2, 2, 1, 2, 0, 2, 0, 0, 0, 1,
        2, 2]))

In [235]:
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import precision_score, confusion_matrix

In [236]:
sw = compute_sample_weight(class_weight='balanced',y=y_val)
cm =confusion_matrix(y_val, y_pred, sample_weight=sw)
cm

array([[8., 0., 0.],
       [0., 8., 0.],
       [0., 1., 7.]])

In [211]:
# 并没有用逻辑回归做鸢尾花的多分类
# 逻辑回归
model = LinearRegression()
model.fit(x_tra,y_tra)

y_pred = model.predict(x_val)
print(metrics.roc_auc_score(y_val, y_pred))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

stacking:

1. stacking的原理介绍：https://wulc.me/2018/01/21/stacking%20%E7%9A%84%E5%9F%BA%E6%9C%AC%E6%80%9D%E6%83%B3%E5%8F%8A%E4%BB%A3%E7%A0%81%E5%AE%9E%E7%8E%B0/
2. stacking项目：https://github.com/WuLC/MachineLearningAlgorithm/blob/master/python/Stacking.py

svc:https://blog.csdn.net/BabyBirdToFly/article/details/72886879

xgb,lgb：https://www.cnblogs.com/nxf-rabbit75/p/9748345.html

xgb，lgb参数介绍：https://www.jianshu.com/p/1100e333fcab

逻辑回归多分类：https://blog.csdn.net/weixin_39541558/article/details/80621692

鸢尾花多分类：https://blog.csdn.net/golden1314521/article/details/46564227

macro,micro评价指标：https://zhuanlan.zhihu.com/p/59862986