## 构建多个函数来建立单层决策树

In [58]:
import numpy as np
import adaboost

### 1、通过阈值比较对数据进行分类
#### 小于等于阈值的为-1，大于阈值的为1

In [59]:
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    retArray=np.ones((np.shape(dataMatrix)[0],1))
    if threshIneq=='lt':
        retArray[dataMatrix[:,dimen] <= threshVal]=-1.0 #数组过滤，最终等号左边输出为满足[]内条件的数组元素
    else:
        retArray[dataMatrix[:,dimen] > threshVal]=1.0
    return retArray

### 2、在加权数据集中循环，并找到具有最低错误率的单层决策树
#### 单层决策树：仅基于单个特征进行决策

In [60]:
def buildStump(dataArr,classLabels,D):
    dataMatrix=np.mat(dataArr)
    labelMat=np.mat(classLabels).T
    m,n=np.shape(dataMatrix) # m=5,n=2
    numSteps=10.0 # 用于在特征的所有可能值上进行遍历
    bestStump={} # 空字典，用于存储给定权重向量D时所得到的最佳单层决策树的相关信息
    bestClassEst=np.mat(np.zeros((m,1)))
    minError=np.inf # inf 无穷大（最大的正数）
    for i in range(n): # 随数据集的每一列，执行以下操作
        rangeMin=dataMatrix[:,i].min()
        rangeMax=dataMatrix[:,i].max()
        stepSize=(rangeMax-rangeMin)/numSteps # 计算步长
        for j in range(-1,int(numSteps)+1): # 对每一步，执行如下操作
            for inequal in ['lt','gt']: # lt:less than（小于），gt：greater and equal（大于等于）
                threshVal=(rangeMin+float(j)*stepSize) # 每一步取得值
                predictedVals=stumpClassify(dataMatrix,i,threshVal,inequal) # 计算预测值
                errArr=np.mat(np.ones((m,1))) 
                errArr[predictedVals == labelMat] = 0 # 如果predictedVals中的值不等于labelMat中的真正类别标签值，则errArr的相应位置为1
                weightedError=D.T*errArr # 加权错误率
                
                print ("split:dim %d,thresh %.2f,thresh ineqal:%s,the weighted error is %.3f" %(i,threshVal,inequal,weightedError))
                
                if weightedError< minError:
                    minError=weightedError
                    bestClassEst=predictedVals.copy()
                    bestStump['dim']=i
                    bestStump['threshVal']=threshVal
                    bestStump['ineq']=inequal
    return bestStump,minError,bestClassEst

In [61]:
D=np.mat(np.ones((5,1))/5) # 权重
datMat,classLabels=adaboost.loadSimpData()
buildStump(datMat,classLabels,D)

split:dim 0,thresh 0.90,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 0.90,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.30,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.30,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.40,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.40,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.50,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.50,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.60,thresh ine

({'dim': 0, 'ineq': 'lt', 'threshVal': 1.3}, matrix([[0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

### 3、基于单层决策树的AdaBoost训练过程

In [82]:
import math

In [85]:
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    weakClassArr=[]
    m=np.shape(dataArr)[0]
    D=np.mat(np.ones((m,1))/m)
    aggClassEst=np.mat(np.zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst=buildStump(dataArr,classLabels,D)
        print ("D",D.T)
        alpha=float(0.5*math.log((1.0-error)/max(error,1e-16))) # 公式
        bestStump['alpha']=alpha
        weakClassArr.append(bestStump)
        print ("classEst:",classEst.T)
        expon=np.multiply(-1*alpha*np.mat(classLabels).T,classEst)
        D=np.multiply(D,np.exp(expon))
        D=D/D.sum()
        aggClassEst+=alpha*classEst
        print ("aggClassEst:",aggClassEst.T)
        aggErrors=np.multiply(np.sign(aggClassEst)!=np.mat(classLabels).T,np.ones((m,1)))
        errorRate=aggErrors.sum()/m
        print ("total error:",errorRate,"\n")
        if errorRate ==0.0:
            break
    return weakClassArr

In [86]:
classifierArray=adaBoostTrainDS(datMat,classLabels,9)

split:dim 0,thresh 0.90,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 0.90,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.30,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.30,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.40,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.40,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.50,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.50,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.60,thresh ine

### 4、测试算法：基于AdaBoost的分类

In [91]:
def adaClassify(datToClass,classifierArr):
    dataMatrix=np.mat(datToClass)
    m=np.shape(dataMatrix)[0]
    aggClassEst=np.mat(np.zeros((m,1)))
    for i in range (len(classifierArr)):
        classEst=stumpClassify(dataMatrix,classifierArr[i]['dim'],classifierArr[i]['threshVal'],classifierArr[i]['ineq'])
        aggClassEst+=classifierArr[i]['alpha']*classEst
        print (aggClassEst)
    return np.sign(aggClassEst)

In [92]:
datArr,labelArr=adaboost.loadSimpData()
classifierArr=adaBoostTrainDS(datArr,labelArr,30)
adaClassify([0,0],classifierArr)

split:dim 0,thresh 0.90,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 0.90,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.00,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.10,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:lt,the weighted error is 0.400
split:dim 0,thresh 1.20,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.30,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.30,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.40,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.40,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.50,thresh ineqal:lt,the weighted error is 0.200
split:dim 0,thresh 1.50,thresh ineqal:gt,the weighted error is 0.400
split:dim 0,thresh 1.60,thresh ine

matrix([[-1.]])