## AdaBoost 算法实现


In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [5]:
def loadDataset():
    X = np.array([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0]
    ])
    y = np.array([1.0, 1.0, -1.0, -1.0, 1.0]).reshape(5, 1)
    return X, y

In [6]:
def stumpClassify(X, dimen, threshVal, threshIneq):
    '''
    根据维度的阈值做一个简单的分类
        - dataset: 样本集
        - dimen : 数据维度
        - threshVal: 阈值
        - threshIneq: 比较操作符
    '''
    m, _ = X.shape
    y_hat = np.ones((m, 1))
    if threshIneq == 'lt':
        y_hat[X[:, dimen] <= threshVal] = -1.0
    else:
        y_hat[X[:, dimen] > threshVal] = 1.0
    return y_hat

In [7]:
# def exponentialLoss(y, y_hat):
#     '''
#     指数损失函数
#     '''
#     return sum(np.exp(-y * y_hat))

def buildStump(X, y, D):
    '''
    在数据集 X 上根据权重 D 寻找最好的分类器。
    - X 训练样本特征
    - y 训练样本标签
    - D 样本权重
    对于每一维的特征，如何找到最佳的分隔点那？我们可以对特征的所有值
    进行排序，然后依次对两个相邻的值除 2, 检测是否是最佳分隔点。
    在这里我们用别一种方法，即寻找出最大和最小值，对其平均分隔 10 个
    区间，看在区间的边界上是否是最佳分隔。
    '''
    m, n = X.shape
    minErr = np.inf
    bestStump = {}
    bestY = np.zeros((m, 1))
    intervals = 10
    
    for dim in range(n):
        dmin = X[:,dim].min()
        dmax = X[:,dim].max()
        
        span = (dmax - dmin) / intervals;
        # 简单一点寻找 10 次划分即可.
        for i in range(intervals):
            threshVal = dmin + i * span
            for op in ['lt', 'gt']:
                y_hat = stumpClassify(X, dim, threshVal, op)
                
                errArr = np.ones((m, 1))
                errArr[y_hat == y] = 0
                # 注意 D 权重向量的用法。
                err = np.dot(D.T, errArr).flatten()[0]
                
#                 print("split: dim {0}, thresh {1:0.2f}, op:{2}, err:{3:0.2f}%".format(
#                     dim, threshVal, op, err * 100
#                 ))
                if err < minErr:
                    minErr = err
                    bestY = y_hat.copy()
                    bestStump['dim'] = dim
                    bestStump['thresh'] = threshVal
                    bestStump['op'] = op
#     print(bestStump, minErr)
    return bestStump, minErr, bestY

In [8]:
X, y = loadDataset()
m, n = X.shape

# 测试

D = np.ones((m, 1))/m
print(D.shape, D.T)
buildStump(X, y, D)


(5, 1) [[0.2 0.2 0.2 0.2 0.2]]


({'dim': 0, 'thresh': 1.3, 'op': 'lt'}, 0.2, array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [9]:
class AdaBoost():
    def __init__(self, n_estimators = 10):
        '''
        初始化函数：
            - n_estimators 需要多少个弱分类器
        '''
        assert n_estimators > 0
        self.estimators = []
        self.alphas = []
        self.n_estimators = n_estimators
        
    def fit(self, X, y):
        '''
        复用 AdaBoost 算法来生成分类器。
        '''
        m, n = X.shape
        D = np.ones((m, 1)) / m
        epsilon = 0.0001
        
        for i in range(self.n_estimators):
            stump, err, y_hat = buildStump(X, y, D)
#             print(stump, err, y_hat)
            # 防止 err 为 0 造成无法收敛。
            err  += epsilon 
            alpha = 0.5 * np.log((1 - err) / err)
            # new weights and renormalization
            D = D * np.exp(-alpha * y * y_hat)
#             print(alpha,D)
            D = D / np.sum(D) # keep sum(D) = 1
            
            
            self.alphas.append(alpha)
            self.estimators.append(stump)
        return
    
    def predict(self, X):
        m, _ = X.shape
        y_hat = np.zeros((m, 1))
        
        for i in range(self.n_estimators):
            stump = self.estimators[i]
            y_hat += self.alphas[i] * stumpClassify(X, stump['dim'], stump['thresh'], stump['op'])
        return np.sign(y_hat)

In [10]:
clf = AdaBoost(10)
clf.fit(X, y)
y_hat = clf.predict(X)
m, n = X.shape
accuracy = np.sum(y_hat == y) / m * 100.0
print('accuracy: {0:0.2f}%'.format(accuracy))


accuracy: 100.00%


In [11]:
data = np.loadtxt('test-ada.txt')
X = data[:,0:2]
y = data[:,2].reshape(-1, 1)

train_X, train_y = X[0:80], y[0:80]
test_X, test_y = X[80:-1], y[80:-1]

for i in range(8, 16, 4):
    clf = AdaBoost(i)
    clf.fit(train_X, train_y)
    y_hat = clf.predict(test_X)

    accuracy = np.sum(y_hat == test_y) / m * 100.0
    print('{0} estimators, accuracy: {1:0.2f}%'.format(i, accuracy))
    



8 estimators, accuracy: 80.00%
12 estimators, accuracy: 80.00%


In [12]:
# 使用 scikit 来测试下, 那问题来了？如果特征均分刚好可以二分怎么办？ 其前面的误判为 0.
from sklearn.datasets import load_iris
X, y = load_iris(True)

X, y = X[0:100], y[0:100]
y[0:50] = -1

idx = np.random.permutation(100)
X = X[idx]
y = y[idx]

train_X, train_y = X[0:80], y[0:80].reshape(-1, 1)
test_X, test_y = X[80:-1], y[80:-1].reshape(-1, 1)

clf = AdaBoost(2)
clf.fit(train_X, train_y)
y_hat = clf.predict(test_X)

accuracy = np.sum(y_hat == test_y) / 20 * 100.0
print('{0} estimators, accuracy: {1:0.2f}%'.format(i, accuracy))

ModuleNotFoundError: No module named 'sklearn'