# 集成学习(Ensemble Learning)
**姓名:** 独宇涵 **邮箱:** 231880151@smail.nju.edu.cn

**实验环境:** python 3.12.8

## 复现材料
实验过程中需要我们手动实现三种常见的集成学习算法（**Stacking**、**Bagging**、**AdaBoost**），下面是三种算法的代码实现
### Stacking
**输入:** 训练集 $D = \{(x_1, y_1), (x_2, y_2), \ldots, (x_m, y_m)\};$

初级学习算法 $\mathfrak{L}_1, \mathfrak{L}_2, \ldots, \mathfrak{L}_T;$

次级学习算法 $\mathfrak{L}.$

**过程：**
1. $\text{for } t = 1, 2, \ldots, T \text{ do}$
2. $h_t = \mathfrak{L}_t(D);$
3. $\text{end for}$
4. $D' = \emptyset;$
5. $\text{for } i = 1, 2, \ldots, m \text{ do}$
6. $\quad \text{for } t = 1, 2, \ldots, T \text{ do}$
7. $\quad z_{it} = h_t(x_i);$
8. $\quad \text{end for}$
9. $D' = D' \cup ((z_{i1}, z_{i2}, \ldots, z_{iT}), y_i);$
10. $\text{end for}$
11. $h' = \mathfrak{L}(D');$

**输出：**$H(x) = h'(h_1(x), h_2(x), \ldots, h_T(x))$

In [1]:
import numpy as np
from sklearn.model_selection import KFold

np.random.seed(2025)
def get_stacking(clf,x_train,y_train,x_test,n_folds = 10):
    # 核心是使用交叉验证算法得到次级训练集
    train_num, test_num = x_train.shape[0], x_test.shape[0]
    second_level_train_set = np.zeros((train_num,))
    second_level_test_set = np.zeros((test_num,))
    test_nfolds_sets = np.zeros((test_num,n_folds))
    kf = KFold(n_splits = n_folds) #将数据进行折叠

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tra, y_tra = x_train[train_index], y_train[train_index]
        x_tst, y_tst = x_train[test_index], y_train[test_index]

        clf.fit(x_tra, y_tra)

        second_level_train_set[test_index] = clf.predict(x_tst)
        test_nfolds_sets[:,i] = clf.predict(x_test)

    second_level_test_set[:] = test_nfolds_sets.mean(axis = 1)
    return second_level_train_set, second_level_test_set

In [2]:
#构造初级和次级学习器
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, StackingClassifier)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier()
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()
svc_model = SVC()

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
train_x,test_x,train_y,test_y = train_test_split(iris.data, iris.target, test_size = 0.2)
train_sets = []
test_sets = []
base_classifiers = [rf_model, adb_model, gdbc_model, et_model, svc_model]

for clf in base_classifiers:
    train_set, test_set = get_stacking(clf, train_x, train_y, test_x)
    train_sets.append(train_set)
    test_sets.append(test_set)

meta_train = np.concatenate([result_set.reshape(-1,1) for result_set in train_sets], axis = 1)
meta_test = np.concatenate([result_set.reshape(-1,1) for result_set in test_sets], axis = 1)

# 使用决策树作为次级分类器
from sklearn.tree import DecisionTreeClassifier
meta_clf = DecisionTreeClassifier()
meta_clf.fit(meta_train,train_y)
predict_y = meta_clf.predict(meta_test)

accuracy = accuracy_score(test_y, predict_y)
print(f"Stacking Classifier Accuracy:{accuracy:.4f}")

Stacking Classifier Accuracy:0.9333


#### 封装为自定义类
上面实现了简单的一个运用Stacking集成学习方法的分类器，现在将上述算法封装为自定义的类，方便后续调用函数

In [9]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin,clone

class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models,meta_model, n_folds = 5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y): #克隆原来的model，并且实现fit功能
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits = self.n_folds, shuffle = True, random_state = 156)

        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X,y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
            
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([np.column_stack([model.predict(X) for model in base_models]) for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)
        
stacking_avg = StackingAverageModels(base_models = base_classifiers, meta_model = meta_clf)
stacking_avg.fit(train_x, train_y)
pred_y = stacking_avg.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"Stacking Classifier Accuracy:{accuracy:.4f}")

ValueError: X has 25 features, but DecisionTreeClassifier is expecting 5 features as input.