# 模型融合 (Model Yuugou)

Yuugou？？？哈哈哈最近刷日剧有点上头

### Bagging
    - Reduces variance and increases accuracy
    - Robust against outliers or noisy data
    - Often used with Decision Trees
### Boosting 
    - Also reduces varience and increases accuracy
    - NOT robust against outliers or noisy data
    - Flexible - can be used with any loss function
### Stacking 
    - Used to ensemble a diverse group of strong learners
    - Involves training a second-level ML algorithm called a "metalearner" 
      to learn theoptimal combination of the base learners

In [23]:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics

## Example: Regression Problem (Stacking)

In [24]:
def Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true, 
                    test_pre1, test_pre2, test_pre3, 
                    model_L2= linear_model.LinearRegression()):
    
    model_L2.fit(pd.concat([pd.Series(train_reg1), 
                            pd.Series(train_reg2), 
                            pd.Series(train_reg3)], axis = 1).values, 
                 y_train_true)
    
    Stacking_result = model_L2.predict(pd.concat([pd.Series(test_pre1), 
                                                  pd.Series(test_pre2), 
                                                  pd.Series(test_pre3)], axis = 1).values)
    
    return Stacking_result

In [25]:
## 生成数据

train_reg1 = [3.2, 8.2, 9.1, 5.2]
train_reg2 = [2.9, 8.1, 9.0, 4.9]
train_reg3 = [3.1, 7.9, 9.2, 5.0]

y_train_true = [3, 8, 9, 5] 

test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

y_test_true = [1, 3, 2, 6] 

In [26]:
model_L2= linear_model.LinearRegression()

Stacking_pre = Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                               test_pre1, test_pre2, test_pre3, model_L2)

print('Stacking_pre MAE:', metrics.mean_absolute_error(y_test_true, Stacking_pre))

Stacking_pre MAE: 0.042134831460675204


In [27]:
# 加权平均

def Weighted_method(test_pre1, test_pre2, test_pre3, w=[1/3,1/3,1/3]):
    
    Weighted_result = w[0]*pd.Series(test_pre1) + w[1]*pd.Series(test_pre2) + w[2]*pd.Series(test_pre3)
    
    return Weighted_result

In [28]:
w = [0.3,0.4,0.3] # 权重

Weighted_pre = Weighted_method(test_pre1, test_pre2, test_pre3, w)

print('Weighted_pre MAE:',metrics.mean_absolute_error(y_test_true, Weighted_pre))
print('Stacking_pre MAE:', metrics.mean_absolute_error(y_test_true, Stacking_pre))

Weighted_pre MAE: 0.05750000000000027
Stacking_pre MAE: 0.042134831460675204


## Example: Classification Problem

In [29]:
from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

### Voting Mechanism 

- **Majority Voting**: 

The final output class label is the one that receives more than half of the votes.

- **Plurality Voting**: 

It takes the class label which receives the largest number of votesas the final winner.

- **Weighting Voting**: 

It gives more power to the stronger classifiers in voting.

- **Soft Voting**: 

For individual classifiers which produce class probability outputs

(from *Ensemble Methods: Foundations and Algorithms*)

In [30]:
iris = datasets.load_iris()

x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

clf1 = XGBClassifier(learning_rate = 0.1, n_estimators = 150, 
                     max_depth = 3, min_child_weight = 2, subsample = 0.7,
                     colsample_bytree = 0.6, objective = 'binary:logistic')

clf2 = RandomForestClassifier(n_estimators=50, max_depth=1, min_samples_split=4,
                              min_samples_leaf = 63, oob_score = True)

clf3 = SVC(C = 0.1, gamma = 'auto')

In [31]:
# 硬投票

eclf_hard = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2), ('svc', clf3)], 
                             voting = 'hard')

for clf, label in zip([clf1, clf2, clf3, eclf_hard], ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.95 (+/- 0.03) [Ensemble]


In [32]:
# 软投票

clf3 = SVC(C = 0.1, probability = True, gamma = 'auto')

eclf_soft = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2), ('svc', clf3)], 
                             voting = 'soft', weights = [2, 1, 1])

for clf, label in zip([clf1, clf2, clf3, eclf_soft], ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.96 (+/- 0.02) [Ensemble]


### Stacking

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
import pandas as pd

data_0 = iris.data
data = data_0[:100,:]

target_0 = iris.target
target = target_0[:100]

X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.3, random_state=2020)
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

In [34]:
clfs = [LogisticRegression(solver = 'lbfgs'),
        RandomForestClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'entropy'),
        GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 5)]


In [35]:
n_splits = 5
skf = StratifiedKFold(n_splits)
skf = skf.split(X, y)

# skf是一个generator
for j, clf in enumerate(clfs):

    dataset_blend_test_j = np.zeros((X_predict.shape[0], 5))
    for i, (train, test) in enumerate(skf):

        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]

    # 对于测试集，直接用这k个模型的预测值均值作为新的特征。
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

val auc Score: 1.000000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000


In [36]:
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Val auc Score of Stacking: %f" % (roc_auc_score(y_predict, y_submission)))

Val auc Score of Stacking: 1.000000


### Blending