# 集成学习(Ensemble Learning)
**姓名:** 独宇涵 **邮箱:** 231880151@smail.nju.edu.cn

**实验环境:** python 3.12.8

## 复现材料
实验过程中需要我们手动实现三种常见的集成学习算法（**Stacking**、**Bagging**、**AdaBoost**），下面是三种算法的代码实现
### Stacking
**输入:** 训练集 $D = \{(x_1, y_1), (x_2, y_2), \ldots, (x_m, y_m)\};$

初级学习算法 $\mathfrak{L}_1, \mathfrak{L}_2, \ldots, \mathfrak{L}_T;$

次级学习算法 $\mathfrak{L}.$

**过程：**
1. $\text{for } t = 1, 2, \ldots, T \text{ do}$
2. $\quad h_t = \mathfrak{L}_t(D);$
3. $\text{end for}$
4. $D' = \emptyset;$
5. $\text{for } i = 1, 2, \ldots, m \text{ do}$
6. $\quad \text{for } t = 1, 2, \ldots, T \text{ do}$
7. $\quad \quad z_{it} = h_t(x_i);$
8. $\quad \text{end for}$
9. $\quad D' = D' \cup ((z_{i1}, z_{i2}, \ldots, z_{iT}), y_i);$
10. $\text{end for}$
11. $h' = \mathfrak{L}(D');$

**输出：**$H(x) = h'(h_1(x), h_2(x), \ldots, h_T(x))$

In [10]:
import numpy as np
from sklearn.model_selection import KFold

# Seed NumPy's global random number generator so that later draws from it are repeatable.
np.random.seed(2025)
def get_stacking(clf, x_train, y_train, x_test, n_folds=10):
    """Build one column of second-level (meta) features for a base learner.

    The training set is split into ``n_folds`` consecutive folds. For every
    fold, ``clf`` is fitted on the remaining folds and then predicts

    * the held-out fold, which fills the matching rows of the training
      meta-feature, so each training sample is predicted by a model that
      never saw it, and
    * the whole test set, which fills one column of a per-fold matrix.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so after the call it holds the model trained
            in the last fold.
        x_train: Training samples, one row per sample (array-like).
        y_train: Training targets (array-like). Predictions are written
            into float arrays, so the targets must be numeric.
        x_test: Test samples, one row per sample (array-like).
        n_folds: Number of cross-validation folds.

    Returns:
        A tuple ``(second_level_train_set, second_level_test_set)`` of 1-D
        float arrays with one entry per training / test sample.

    Note:
        The test feature is the mean of the ``n_folds`` predictions, so for
        class labels it can be fractional, while the training feature always
        holds raw predictions.
    """
    # Plain arrays guarantee that the positional fold indices below work,
    # whatever container the caller passed in.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    train_num, test_num = x_train.shape[0], x_test.shape[0]
    second_level_train_set = np.zeros((train_num,))
    test_nfolds_sets = np.zeros((test_num, n_folds))
    kf = KFold(n_splits=n_folds)

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.fit(x_train[train_index], y_train[train_index])

        # Out-of-fold predictions for the training meta-feature.
        second_level_train_set[test_index] = clf.predict(x_train[test_index])
        # This fold's model also votes on every test sample.
        test_nfolds_sets[:, i] = clf.predict(x_test)

    return second_level_train_set, test_nfolds_sets.mean(axis=1)

In [11]:
# Build the first-level (base) learners and the second-level (meta) learner.
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, StackingClassifier,BaggingClassifier)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Five base learners, each created with its default hyper-parameters.
rf_model = RandomForestClassifier()
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()
svc_model = SVC()

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Iris data with 20% held out for testing; no random_state is passed to the split.
iris = load_iris()
train_x,test_x,train_y,test_y = train_test_split(iris.data, iris.target, test_size = 0.2)
train_sets = []
test_sets = []
base_classifiers = [rf_model, adb_model, gdbc_model, et_model, svc_model]

# Collect one meta-feature vector per base learner, for the train and the test split.
for clf in base_classifiers:
    train_set, test_set = get_stacking(clf, train_x, train_y, test_x)
    train_sets.append(train_set)
    test_sets.append(test_set)

# Put the per-learner vectors side by side: one row per sample, one column per base learner.
meta_train = np.concatenate([result_set.reshape(-1,1) for result_set in train_sets], axis = 1)
meta_test = np.concatenate([result_set.reshape(-1,1) for result_set in test_sets], axis = 1)

# Use a decision tree as the second-level classifier.
from sklearn.tree import DecisionTreeClassifier
meta_clf = DecisionTreeClassifier()
meta_clf.fit(meta_train,train_y)
predict_y = meta_clf.predict(meta_test)

# Score the stacked model on the held-out test labels.
accuracy = accuracy_score(test_y, predict_y)
print(f"Stacking Classifier Accuracy:{accuracy:.4f}")

Stacking Classifier Accuracy:0.9333


#### 封装为自定义类
上面实现了一个简单的基于 Stacking 集成学习方法的分类器，现在将上述算法封装为自定义类，方便后续调用。

In [12]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin,clone

class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacking ensemble whose meta-model is trained on out-of-fold predictions.

    Every base model is cloned and fitted once per cross-validation fold. The
    predictions on each held-out fold become that model's column of
    meta-features for training; at prediction time the per-fold clones of a
    base model are averaged to produce the same column for new data.

    Args:
        base_models: Sequence of unfitted estimators used as first-level learners.
        meta_model: Unfitted estimator trained on the meta-features.
        n_folds: Number of folds used to produce out-of-fold predictions.

    NOTE(review): the class inherits RegressorMixin although this file uses it
    with classifiers, so an inherited ``score`` would report R^2 rather than
    accuracy -- confirm whether that is intended.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model.

        ``X`` and ``y`` are indexed with integer index arrays, so they are
        expected to be NumPy arrays. Returns ``self``.
        """
        # One list of fitted fold models per base model.
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        n_samples = X.shape[0]
        oof_predictions = np.zeros((n_samples, len(self.base_models)))
        for column, prototype in enumerate(self.base_models):
            fold_models = self.base_models_[column]
            for fit_rows, held_out_rows in splitter.split(X, y):
                fold_model = clone(prototype)
                fold_models.append(fold_model)
                fold_model.fit(X[fit_rows], y[fit_rows])
                # Each sample is predicted by the clone that did not train on it.
                oof_predictions[held_out_rows, column] = fold_model.predict(X[held_out_rows])

        self.meta_model_.fit(oof_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on fold-averaged base-model outputs."""
        averaged_columns = []
        for fold_models in self.base_models_:
            per_fold = np.column_stack([fold_model.predict(X) for fold_model in fold_models])
            averaged_columns.append(per_fold.mean(axis=1))
        meta_features = np.column_stack(averaged_columns)
        return self.meta_model_.predict(meta_features)
        
# Fit the class-based stacking model on the same split and report its test accuracy.
stacking_avg = StackingAverageModels(base_models = base_classifiers, meta_model = meta_clf)
stacking_avg.fit(train_x, train_y)
pred_y = stacking_avg.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"Stacking Classifier Accuracy:{accuracy:.4f}")

Stacking Classifier Accuracy:0.9333


### Bagging
**输入：**

训练集 $D = \{(x_1, y_1), (x_2, y_2), \ldots, (x_m, y_m)\}$;

基学习算法 $\mathcal{L}$;
    
训练轮数 $T$.

**过程：**

1: for $t = 1, 2, \ldots, T$ do

2: $\quad h_t = \mathcal{L}(D, \mathcal{D}_{bs})$

3: end for

**输出：**$H(x) = \arg\max_{y \in \mathcal{Y}} \sum_{t=1}^{T} \mathbb{I}(h_t(x) = y)$

In [13]:
class MyBaggingClassifier:
    """Bagging ensemble: bootstrap-trained learners combined by majority vote.

    Args:
        base_learner: Unfitted estimator exposing ``fit(X, y)`` and
            ``predict(X)``; it is cloned once per ensemble member.
        n_learners: Number of ensemble members.
    """

    def __init__(self, base_learner, n_learners):
        self.learners = [clone(base_learner) for _ in range(n_learners)]

    def fit(self, X, y):
        """Fit every learner on its own bootstrap sample of ``(X, y)``.

        Each sample has as many rows as ``X`` and is drawn with replacement
        from NumPy's global random generator. Returns ``self``.
        """
        # Plain arrays so the bootstrap index arrays can be used directly.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        for learner in self.learners:
            examples = np.random.choice(np.arange(n_samples), n_samples, replace=True)
            learner.fit(X[examples], y[examples])
        return self

    def predict(self, X):
        """Return the majority-vote label of the ensemble for each row of ``X``.

        Raises:
            ValueError: If the ensemble has no learners to vote.
        """
        if not self.learners:
            raise ValueError("MyBaggingClassifier has no learners to predict with")
        preds = [learner.predict(X) for learner in self.learners]
        return self._aggregate(np.array(preds))

    def _aggregate(self, predictions):
        """Majority vote over a ``(n_learners, n_samples)`` prediction array.

        Votes are counted over the distinct labels that actually occur, so
        negative or non-integer labels are handled and the result keeps the
        dtype of ``predictions``. Ties go to the smallest label.
        """
        predictions = np.asarray(predictions)
        labels, codes = np.unique(predictions, return_inverse=True)
        # The shape of the inverse differs between NumPy versions; normalise it.
        codes = codes.reshape(predictions.shape)
        label_ids = np.arange(len(labels))[:, np.newaxis, np.newaxis]
        # votes[k, j] = number of learners that predicted labels[k] for sample j.
        votes = (codes[np.newaxis, :, :] == label_ids).sum(axis=1)
        # argmax returns the first maximum, i.e. the smallest label on a tie.
        return labels[votes.argmax(axis=0)]

### AdaBoost
**输入：**
    训练集 $D = \{(x_1, y_1), (x_2, y_2), \ldots, (x_m, y_m)\}$；
    基学习算法 $\mathcal{L}$；
    训练轮数 $T$。

**过程：**
1. $\mathcal{D}_1(x) = 1/m$。
2. for $t = 1, 2, \ldots, T$ do
3. $\quad h_t = \mathcal{L}(D, \mathcal{D}_t)$;
4. $\quad \epsilon_t = P_{x \sim \mathcal{D}_t}(h_t(x) \neq f(x))$;
5. $\quad \text{if } \epsilon_t > 0.5 \text{ then break}$
6. $\quad \alpha_t = \frac{1}{2} \ln \left( \frac{1 - \epsilon_t}{\epsilon_t} \right)$;
7. $\quad \mathcal{D}_{t+1}(x) = \frac{\mathcal{D}_t(x)}{Z_t} \times \left\{
\begin{array}{ll}
\exp(-\alpha_t), & \text{if } h_t(x) = f(x) \\
\exp(\alpha_t), & \text{if } h_t(x) \neq f(x)
\end{array}
\right.$

$\quad \quad \quad \quad \quad \quad \quad  = \mathcal{D}_t(x) \exp(-\alpha_t f(x) h_t(x)) / Z_t$

8. end for

**输出：**$H(x) = \text{sign} \left( \sum_{t=1}^{T} \alpha_t h_t(x) \right)$

In [14]:
class MyAdaBoostClassifier:
    """Discrete AdaBoost for binary labels encoded as -1 / +1.

    Learners are trained one after another on re-weighted samples: samples a
    learner gets wrong gain weight for the next round. The final prediction is
    the sign of the alpha-weighted sum of the learners' predictions.

    Args:
        base_learner: Unfitted estimator whose ``fit`` accepts a
            ``sample_weight`` keyword and whose ``predict`` returns -1 / +1;
            it is cloned once per boosting round.
        n_learners: Maximum number of boosting rounds.

    Attributes:
        learning_rate: Factor applied to every learner weight (fixed at 1.0).
        weak_classifiers: ``(learner, alpha)`` pairs kept by the last ``fit``.
    """

    # Lower bound on the weighted error so the learner weight stays finite.
    _EPS = 1e-10

    def __init__(self, base_learner, n_learners):
        self.learners = [clone(base_learner) for _ in range(n_learners)]
        self.learning_rate = 1.0
        self.weak_classifiers = []

    def fit(self, X, y):
        """Run the boosting rounds on ``(X, y)`` and return ``self``.

        Boosting stops early when a learner's weighted error exceeds 0.5 or
        when a learner classifies every sample correctly.

        Raises:
            ValueError: If ``y`` is empty or contains labels other than -1 and +1.
        """
        y = np.asarray(y)
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("MyAdaBoostClassifier cannot be fitted on an empty training set")
        # The weight update below relies on y * h(x) being +1 (hit) or -1 (miss).
        if not np.all(np.isin(y, (-1, 1))):
            raise ValueError("MyAdaBoostClassifier expects labels encoded as -1 and +1")

        sample_weights = np.full(n_samples, 1.0 / n_samples)
        # Start from scratch so that refitting does not keep earlier rounds.
        self.weak_classifiers = []

        for clf in self.learners:
            clf.fit(X, y, sample_weight=sample_weights)
            y_pred = clf.predict(X)
            incorrect = np.sum(sample_weights * (y != y_pred))
            error_rate = incorrect / np.sum(sample_weights)
            if error_rate > 0.5:
                # Worse than chance on the current distribution: stop boosting.
                break

            is_perfect = error_rate < self._EPS
            bounded_error = max(error_rate, self._EPS)
            alpha = self.learning_rate * 0.5 * np.log((1.0 - bounded_error) / bounded_error)
            self.weak_classifiers.append((clf, alpha))
            if is_perfect:
                # Every sample would be scaled by the same factor, so the
                # distribution cannot change any more.
                break

            sample_weights *= np.exp(-alpha * y * y_pred)
            sample_weights /= np.sum(sample_weights)
        return self

    def predict(self, X):
        """Return the sign of the alpha-weighted vote for each row of ``X``.

        The result is 0 where the votes cancel exactly or no learner was kept.
        """
        votes = np.zeros((len(X),))
        for clf, alpha in self.weak_classifiers:
            votes += alpha * clf.predict(X)
        return np.sign(votes)

### 与sklearn的实现进行对比
将自主实现的三种集成学习算法与sklearn库中已有的方法进行性能对比

In [15]:
# Hand-written stacking model: fit on the training split, score on the test split.
stacking_avg = StackingAverageModels(base_models=base_classifiers, meta_model=meta_clf)
stacking_avg.fit(train_x, train_y)
pred_y = stacking_avg.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"My Stacking Classifier Accuracy: {accuracy:.4f}")

# sklearn's StackingClassifier takes (name, estimator) pairs instead of bare
# estimators. The pairs get their own name so that base_classifiers stays a
# plain list of estimators and this cell can be run again.
named_base_classifiers = [
    ('rf', rf_model),
    ('adb', adb_model),
    ('gdbc', gdbc_model),
    ('et', et_model),
    ('svc', svc_model)
]
stacking_clf = StackingClassifier(estimators=named_base_classifiers, final_estimator=meta_clf, cv=5)
stacking_clf.fit(train_x, train_y)
pred_y = stacking_clf.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"Sklearn Stacking Classifier Accuracy: {accuracy:.4f}")


My Stacking Classifier Accuracy: 0.9333
Sklearn Stacking Classifier Accuracy: 0.9000


In [18]:
# Hand-written bagging: 100 decision trees, each fitted on a bootstrap sample
# as large as the training set and using all features.
base_classifier = DecisionTreeClassifier()
my_bagging_clf = MyBaggingClassifier(base_learner=base_classifier,n_learners=100)
my_bagging_clf.fit(train_x, train_y)
pred_y = my_bagging_clf.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"My Bagging Classifier Accuracy: {accuracy:.4f}")

# sklearn reference. Note that it is configured differently from the model above:
# max_samples=0.5 and max_features=0.5 are passed, plus a fixed random_state.
bagging_clf = BaggingClassifier(base_classifier,n_estimators=100,max_samples=0.5,max_features=0.5,random_state=42)
bagging_clf.fit(train_x, train_y)
pred_y = bagging_clf.predict(test_x)
accuracy = accuracy_score(test_y, pred_y)
print(f"Sklearn Bagging Classifier Accuracy: {accuracy:.4f}")

My Bagging Classifier Accuracy: 0.9333
Sklearn Bagging Classifier Accuracy: 0.9333


In [19]:
# Synthetic binary problem used for the AdaBoost comparison.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2,n_redundant=0, random_state=42)
# Recode label 0 as -1: MyAdaBoostClassifier multiplies y by the prediction in its
# weight update and returns np.sign of the vote, so it needs -1 / +1 labels.
y = np.where(y == 0,-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
# Depth-1 decision tree (a decision stump) passed in as the weak learner.
dt_clf = DecisionTreeClassifier(max_depth=1)

# Hand-written AdaBoost with 50 rounds.
my_adaboost_clf = MyAdaBoostClassifier(dt_clf, n_learners=50)
my_adaboost_clf.fit(X_train, y_train)
pred_y = my_adaboost_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred_y)
print(f"My AdaBoost Classifier Accuracy: {accuracy:.4f}")

# sklearn reference with the same number of estimators.
adaboost_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_clf.fit(X_train, y_train)
pred_y = adaboost_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred_y)
print(f"Sklearn AdaBoost Classifier Accuracy: {accuracy:.4f}")

My AdaBoost Classifier Accuracy: 0.8800
Sklearn AdaBoost Classifier Accuracy: 0.8800
