# AdaBoost元算法
 * 前面已经介绍了五种不同的分类算法，它们各有优缺点。我们自然可以将不同的分类器组合起来，而这种组合结果则被称为集成方法（ensemble method）或者元算法（meta-algorithm）。使用集成方法时会有多种形式：可以是不同算法的集成，也可以是同一算法在不同设置下的集成，还可以是数据集不同部分分配给不同分类器之后的集成。接下来，我们将介绍基于同一种分类器多个不同实例的两种计算方法。在这些方法当中，数据集也会不断变化，而后应用于不同的实例分类器上。最后，我们会讨论如何利用机器学习问题的通用框架来应用AdaBoost算法。
   * bagging：基于数据随机重抽样的分类器构建方法
   * boosting：通过集中关注被已有分类器错分的那些数据来依次训练新分类器的方法（AdaBoost 是其中最流行的一个版本）

## 

## 基于单层决策树（decision stump，也称决策树桩）构建弱分类器

In [1]:
from numpy import *

In [42]:
def loadSimpData():
    """Return a tiny hand-made data set for experimenting with boosting.

    Returns:
        A pair ``(datMat, classLabels)``: a 5x2 numpy matrix of samples and
        a list with the matching +1.0 / -1.0 label for each row.
    """
    samples = [
        [1., 2.1],
        [2., 1.1],
        [1.3, 1.],
        [1., 1.],
        [2., 1.],
    ]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(samples), labels

In [43]:
# Load the toy data set once; later cells reuse datMat and classLabels.
datMat, classLabels = loadSimpData()

### 单层决策树生成函数

In [45]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row by comparing one feature against a threshold.

    Args:
        dataMatrix: sample matrix, one row per sample.
        dimen: column index of the feature to test.
        threshVal: threshold the feature is compared with.
        threshIneq: 'lt' marks rows with feature <= threshVal as -1.0;
            any other value marks rows with feature > threshVal as -1.0.

    Returns:
        An (m, 1) array holding +1.0 or -1.0 for each of the m rows.
    """
    featureColumn = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negativeMask = featureColumn <= threshVal
    else:
        negativeMask = featureColumn > threshVal
    # Everything starts in the +1 class; the masked rows flip to -1.
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negativeMask] = -1.0
    return predictions
def buildStump(dataArr, classLabels, D, numSteps=10.0):
    """Search a weighted data set for the decision stump with the lowest error.

    Every feature is scanned with evenly spaced thresholds, trying both the
    'lt' and 'gt' inequality, and the stump whose weighted error is smallest
    is kept.

    Args:
        dataArr: samples, one row per sample (anything asmatrix accepts).
        classLabels: sequence of labels, one per sample.
        D: sample weights; expected to be an (m, 1) matrix so that
            D.T * errArr yields the 1x1 weighted error.
        numSteps: number of threshold steps across each feature's range.
            Must be positive. Defaults to 10.0.

    Returns:
        A tuple (bestStump, minError, bestClassEst): bestStump is a dict
        with keys 'dim', 'thresh' and 'ineq'; minError is the 1x1 weighted
        error of that stump; bestClassEst is its (m, 1) prediction array.
    """
    # asmatrix is the same function as the old `mat` alias, but it is still
    # exported by NumPy releases that dropped `mat`.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = float(numSteps)
    bestStump = {}
    bestClassEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # Starting at -1 places the first threshold one step below the
        # feature's minimum.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality, so compute it once.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error: sum of the weights of the misclassified samples.
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

# 官方文档
# def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):#just classify the data
#     retArray = ones((shape(dataMatrix)[0],1))
#     if threshIneq == 'lt':
#         retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
#     else:
#         retArray[dataMatrix[:,dimen] > threshVal] = -1.0
#     return retArray
    

# def buildStump(dataArr,classLabels,D):
#     dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
#     m,n = shape(dataMatrix)
#     numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m,1)))
#     minError = inf #init error sum, to +infinity
#     for i in range(n):#loop over all dimensions
#         rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();
#         stepSize = (rangeMax-rangeMin)/numSteps
#         for j in range(-1,int(numSteps)+1):#loop over all range in current dimension
#             for inequal in ['lt', 'gt']: #go over less than and greater than
#                 threshVal = (rangeMin + float(j) * stepSize)
#                 predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)#call stump classify with i, j, lessThan
#                 errArr = mat(ones((m,1)))
#                 errArr[predictedVals == labelMat] = 0
#                 weightedError = D.T*errArr  #calc total error multiplied by D
#                 #print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
#                 if weightedError < minError:
#                     minError = weightedError
#                     bestClasEst = predictedVals.copy()
#                     bestStump['dim'] = i
#                     bestStump['thresh'] = threshVal
#                     bestStump['ineq'] = inequal
#     return bestStump,minError,bestClasEst

In [9]:
# Start from uniform sample weights and find the best single stump on the
# toy data (the data matrix is bound to `datMat` by the loading cell).
D = asmatrix(ones((5, 1))/5)
buildStump(datMat, classLabels, D)

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

## 完整 AdaBoost 算法的实现

In [46]:
def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    """Train an AdaBoost ensemble whose weak learners are decision stumps.

    Args:
        dataArr: samples, one row per sample.
        classLabels: sequence of labels, one per sample.
        numIt: maximum number of boosting rounds. Training stops earlier
            once the training error of the combined classifier is 0.

    Returns:
        A list of stump dicts as produced by buildStump, each extended with
        an 'alpha' entry holding that stump's weight in the ensemble.

    Progress (weights, predictions, aggregate estimate and error rate) is
    printed on every round.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # The label column is constant across rounds, so build it once.
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('D: %s' % (D.T,))
        # max(error, 1e-16) guards against division by zero when error is 0.
        alpha = float(0.5*log((1.0-error)/max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print('classEst:  %s' % (classEst.T,))
        # Re-weight the samples for the next round: the exponent is -alpha
        # where prediction and label agree and +alpha where they differ.
        expon = multiply(-1*alpha*labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D/D.sum()
        # Accumulate the weighted votes and measure the ensemble's training error.
        aggClassEst += alpha*classEst
        print('aggClassEst:  %s' % (aggClassEst.T,))
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum()/m
        print('total error: %s \n' % (errorRate,))
        if errorRate == 0.0:
            break
    return weakClassArr

###官方的文档
# def adaBoostTrainDS(dataArr,classLabels,numIt=40):
#     weakClassArr = []
#     m = shape(dataArr)[0]
#     D = mat(ones((m,1))/m)   #init D to all equal
#     aggClassEst = mat(zeros((m,1)))
#     for i in range(numIt):
#         bestStump,error,classEst = buildStump(dataArr,classLabels,D)#build Stump
#         print "D:",D.T
#         alpha = float(0.5*log((1.0-error)/max(error,1e-16)))#calc alpha, throw in max(error,eps) to account for error=0
#         bestStump['alpha'] = alpha  
#         weakClassArr.append(bestStump)                  #store Stump Params in Array
#         print "classEst: ",classEst.T
#         expon = multiply(-1*alpha*mat(classLabels).T,classEst) #exponent for D calc, getting messy
#         D = multiply(D,exp(expon))                              #Calc New D for next iteration
#         D = D/D.sum()
#         #calc training error of all classifiers, if this is 0 quit for loop early (use break)
#         aggClassEst += alpha*classEst
#         print "aggClassEst: ",aggClassEst.T
#         aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))
#         errorRate = aggErrors.sum()/m
#         print "total error: ",errorRate
#         if errorRate == 0.0: break
#     return weakClassArr

In [47]:
# Train on the toy data, allowing at most 9 boosting rounds.
classifierArray = adaBoostTrainDS(datMat, classLabels, 9)

D: [[0.2 0.2 0.2 0.2 0.2]]
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
total error: 0.2 

D: [[0.5   0.125 0.125 0.125 0.125]]
classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error: 0.2 

D: [[0.28571429 0.07142857 0.07142857 0.07142857 0.5       ]]
classEst:  [[1. 1. 1. 1. 1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
total error: 0.0 



In [48]:
# Display the list of weak classifiers returned by training.
classifierArray

[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
 {'alpha': 0.8958797346140273, 'dim': 0, 'ineq': 'lt', 'thresh': 0.9}]

## 测试算法：基于 AdaBoost 的分类