In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

### 简单加权平均，结果直接融合

In [2]:
## 生成一些简单的样本数据，test_prei 代表第i个模型的预测值
# Predictions of the three base models for the same four samples.
test_pre1, test_pre2, test_pre3 = (
    [1.2, 3.2, 2.1, 6.2],
    [0.9, 3.1, 2.0, 5.9],
    [1.1, 2.9, 2.2, 6.0],
)

# Ground-truth targets that the predictions above are scored against.
y_test_true = [1, 3, 2, 6]

In [3]:
## 三个基学习器，w为权重，这里默认为粗略的平均值
def Weighted_method(test_pre1, test_pre2, test_pre3, w=(1/3, 1/3, 1/3)):
    """Blend three prediction vectors with a weighted sum.

    Parameters
    ----------
    test_pre1, test_pre2, test_pre3 : list-like
        Predictions of the three base models. Each one is wrapped in a
        ``pd.Series`` before being combined.
    w : sequence of three numbers, optional
        Weights applied to the models in argument order. The default gives
        every model the same weight (a plain average). The weights are used
        as given; they are not normalised here.

    Returns
    -------
    pd.Series
        Element-wise weighted sum of the three prediction vectors.
    """
    # The default is an immutable tuple rather than a list so that the shared
    # default object can never be modified between calls.
    first = w[0] * pd.Series(test_pre1)
    second = w[1] * pd.Series(test_pre2)
    third = w[2] * pd.Series(test_pre3)
    Weighted_result = first + second + third
    return Weighted_result

In [6]:
# MAE of each base model on its own, in model order.
tuple(mean_absolute_error(y_test_true, pred) for pred in (test_pre1, test_pre2, test_pre3))

(0.1750000000000001, 0.07499999999999993, 0.10000000000000009)

In [7]:
# 定义权重
# One weight per base model, in the same order as the prediction arguments.
w = [0.3, 0.4, 0.3]
weighted_pre = Weighted_method(test_pre1, test_pre2, test_pre3, w=w)
weighted_pre

0    1.05
1    3.07
2    2.09
3    6.02
dtype: float64

In [9]:
# View the blended predictions as a plain NumPy array.
weighted_pre.to_numpy()

array([1.05, 3.07, 2.09, 6.02])

### 加权融合后 MAE 变得更小了（0.0575，低于三个单模型中最好的 0.075）

In [10]:
# MAE of the weighted blend against the ground truth.
mean_absolute_error(y_true=y_test_true, y_pred=weighted_pre.values)

0.05750000000000027

### mean平均法

In [11]:
# With the default axis the three Series are stacked end to end,
# so the index 0-3 repeats once per model.
pd.concat([pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)])

0    1.2
1    3.2
2    2.1
3    6.2
0    0.9
1    3.1
2    2.0
3    5.9
0    1.1
1    2.9
2    2.2
3    6.0
dtype: float64

In [17]:
# axis=1 places the Series side by side: one column per model, one row per sample.
pd.concat([pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)], axis=1)

Unnamed: 0,0,1,2
0,1.2,0.9,1.1
1,3.2,3.1,2.9
2,2.1,2.0,2.2
3,6.2,5.9,6.0


In [13]:
# Averaging down the rows gives one mean per model (column),
# which is not the per-sample blend.
pd.concat([pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)], axis=1).mean(axis=0)

0    3.175
1    2.975
2    3.050
dtype: float64

In [19]:
## 定义结果的平均函数
def Mean_method(test_pre1, test_pre2, test_pre3):
    """Blend three prediction vectors by taking their per-sample mean.

    Each input is wrapped in a ``pd.Series``; the Series are placed side by
    side as columns and averaged across each row.

    Returns
    -------
    pd.Series
        One averaged prediction per row.
    """
    columns = [pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)]
    side_by_side = pd.concat(columns, axis=1)
    return side_by_side.mean(axis=1)

In [20]:
# Score the plain-average blend against the ground truth.
Mean_pre = Mean_method(test_pre1, test_pre2, test_pre3)
mean_absolute_error(y_true=y_test_true, y_pred=Mean_pre)

0.06666666666666693

In [23]:
# Side-by-side view of the three prediction vectors (one column per model).
pd.concat([pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)], axis=1)

Unnamed: 0,0,1,2
0,1.2,0.9,1.1
1,3.2,3.1,2.9
2,2.1,2.0,2.2
3,6.2,5.9,6.0


In [24]:
# Median across the three models for every sample (row).
pd.concat([pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)], axis=1).median(axis="columns")

0    1.1
1    3.1
2    2.1
3    6.0
dtype: float64

In [21]:
## 定义结果的中位数函数（取中位数）
def Median_method(test_pre1, test_pre2, test_pre3):
    """Blend three prediction vectors by taking their per-sample median.

    Each input is wrapped in a ``pd.Series``; the Series are placed side by
    side as columns and the median is taken across each row.

    Returns
    -------
    pd.Series
        One median prediction per row.
    """
    columns = [pd.Series(pred) for pred in (test_pre1, test_pre2, test_pre3)]
    side_by_side = pd.concat(columns, axis=1)
    return side_by_side.median(axis=1)

In [22]:
# Score the median blend against the ground truth.
Median_pre = Median_method(test_pre1, test_pre2, test_pre3)
mean_absolute_error(y_true=y_test_true, y_pred=Median_pre)

0.07500000000000007

### Stacking融合（回归）

In [28]:
# Base-model predictions on the training samples. They are defined here so
# this cell also runs on a fresh kernel; previously the names were only
# assigned in a later cell, so running the notebook top to bottom raised
# NameError at this point.
train_reg1 = [3.2, 8.2, 9.1, 5.2]
train_reg2 = [2.9, 8.1, 9.0, 4.9]
train_reg3 = [3.1, 7.9, 9.2, 5.0]

# Feature matrix for the second-level model: one row per sample, one column per base model.
pd.concat([pd.Series(train_reg1), pd.Series(train_reg2), pd.Series(train_reg3)], axis=1).values

array([[3.2, 2.9, 3.1],
       [8.2, 8.1, 7.9],
       [9.1, 9. , 9.2],
       [5.2, 4.9, 5. ]])

In [38]:
from sklearn import linear_model
# 使用线性回归作stacking层
def Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                    test_pre1, test_pre2, test_pre3, model_L2=None):
    """Stack three base models with a second-level regressor.

    The three training prediction vectors become the columns of the feature
    matrix the second-level model is fitted on; the three test prediction
    vectors are arranged the same way and passed to ``predict``.

    Parameters
    ----------
    train_reg1, train_reg2, train_reg3 : list-like
        Base-model predictions for the training samples.
    y_train_true : list-like
        Targets the second-level model is fitted against.
    test_pre1, test_pre2, test_pre3 : list-like
        Base-model predictions for the samples to be scored.
    model_L2 : estimator with ``fit``/``predict``, optional
        Second-level model. When omitted, a new
        ``linear_model.LinearRegression()`` is created for this call.

    Returns
    -------
    Whatever ``model_L2.predict`` returns for the stacked test predictions.
    """
    # Create the default estimator per call. An estimator instance in the
    # signature would be built once at definition time and then shared (and
    # refitted) by every call that relies on the default.
    if model_L2 is None:
        model_L2 = linear_model.LinearRegression()

    train_features = pd.concat(
        [pd.Series(train_reg1), pd.Series(train_reg2), pd.Series(train_reg3)], axis=1
    ).values
    test_features = pd.concat(
        [pd.Series(test_pre1), pd.Series(test_pre2), pd.Series(test_pre3)], axis=1
    ).values

    model_L2.fit(train_features, y_train_true)
    Stacking_result = model_L2.predict(test_features)

    # Print the fitted linear parameters. Skipped for models that do not
    # expose them, so non-linear second-level models can be used as well.
    if hasattr(model_L2, "coef_") and hasattr(model_L2, "intercept_"):
        print(model_L2.coef_, model_L2.intercept_)
    return Stacking_result

In [35]:
## 生成一些简单的样本数据，test_prei 代表第i个模型的预测值
# Base-model predictions for the training samples.
train_reg1, train_reg2, train_reg3 = (
    [3.2, 8.2, 9.1, 5.2],
    [2.9, 8.1, 9.0, 4.9],
    [3.1, 7.9, 9.2, 5.0],
)
# Ground-truth targets for the training samples.
y_train_true = [3, 8, 9, 5]

# Base-model predictions for the test samples.
test_pre1, test_pre2, test_pre3 = (
    [1.2, 3.2, 2.1, 6.2],
    [0.9, 3.1, 2.0, 5.9],
    [1.1, 2.9, 2.2, 6.0],
)

# Ground-truth targets for the test samples.
y_test_true = [1, 3, 2, 6]

In [39]:
# MAE of each base model on its own, as the baseline the stacked model is compared with.
tuple(mean_absolute_error(y_test_true, pred) for pred in (test_pre1, test_pre2, test_pre3))

(0.1750000000000001, 0.07499999999999993, 0.10000000000000009)

In [40]:
# Linear regression as the second-level model; it is passed in explicitly.
model_L2 = linear_model.LinearRegression()
Stacking_pre = Stacking_method(
    train_reg1, train_reg2, train_reg3, y_train_true,
    test_pre1, test_pre2, test_pre3,
    model_L2=model_L2,
)
# MAE of the stacked predictions on the test targets.
mean_absolute_error(y_true=y_test_true, y_pred=Stacking_pre)

[ 0.84269663 -0.05617978  0.2247191 ] -0.2303370786516865


0.04213483146067476

## 分类模型的融合

In [41]:
# 导入工具包
from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

### 1）Voting投票机制（软投票与硬投票）

#### 硬投票：每个模型对预测类别各投一票，这里不区分模型的重要程度（不设权重），得票最多的类别即为最终预测的类别

In [49]:
# 用iris数据集进行分类
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Show the type of the container returned by the loader.
type(iris)

sklearn.utils.Bunch

In [47]:
# Feature values of the dataset.
x = iris.data
# Show the type of the feature container.
type(x)

numpy.ndarray

In [50]:
# Class labels of the dataset.
y=iris.target
# Show the type of the label container.
type(y)

numpy.ndarray

In [51]:
# Hold out 30% of the samples. The fixed seed makes the split identical on
# every run, so results that depend on it can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

In [57]:
# 用3种方法训练模型
# Gradient-boosted trees. The target used in this notebook (iris) has more
# than two classes, so the multiclass probability objective is requested
# instead of 'binary:logistic', which is meant for two-class problems.
clf1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=3,
    min_child_weight=2,
    subsample=0.7,
    colsample_bytree=0.6,
    objective='multi:softprob',
)

# Random forest with a fixed seed.
clf2 = RandomForestClassifier(n_estimators=1000, random_state=10)

# Support vector classifier with C=0.1.
clf3 = SVC(C=0.1)

In [58]:
# 硬投票
# Hard-voting ensemble of the three base classifiers.
eclf = VotingClassifier(
    estimators=[('xgb', clf1), ('rf', clf2), ('svc', clf3)],
    voting='hard',
)
# Score every base model and the ensemble with 5-fold cross-validation;
# the fitting happens inside cross_val_score.
for label, clf in [('XGBBoosting', clf1), ('Random Forest', clf2), ('SVM', clf3), ('Ensemble', eclf)]:
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.97 (+/- 0.02) [Random Forest]
Accuracy: 0.92 (+/- 0.03) [SVM]
Accuracy: 0.96 (+/- 0.02) [Ensemble]


### 软投票：与硬投票直接统计各模型预测的类别不同，软投票对各模型输出的类别概率做（加权）平均，取平均概率最大的类别作为最终预测；可以通过 weights 为不同模型设置不同权重，进而区别模型不同的重要度。

In [60]:
# SVC要做一些修改，不然报错
# Rebuild the SVC with probability=True; the original note says the
# soft-voting setup raises an error without this change.
clf3 = SVC(C=0.1, probability=True)

# Soft-voting ensemble; the first model (xgb) gets twice the weight of the others.
eclf = VotingClassifier(
    estimators=[('xgb', clf1), ('rf', clf2), ('svc', clf3)],
    voting='soft',
    weights=[2, 1, 1],
)
# NOTE(review): cross_val_score below appears to fit its own copies of each
# estimator, so this fit presumably only affects clf1 itself - confirm
# whether anything later relies on clf1 being fitted.
clf1.fit(x_train, y_train)

# Score every base model and the ensemble with 5-fold cross-validation.
for label, clf in [('XGBBoosting', clf1), ('Random Forest', clf2), ('SVM', clf3), ('Ensemble', eclf)]:
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.97 (+/- 0.02) [Random Forest]
Accuracy: 0.92 (+/- 0.03) [SVM]
Accuracy: 0.97 (+/- 0.02) [Ensemble]
