# 在一个难数据集上应用 AdaBoost

**说明:**

将 `horseColicTraining2.txt` 和 `horseColicTest2.txt` 放在当前目录下的 `data/` 子目录中（代码从 `data/horseColicTraining2.txt` 和 `data/horseColicTest2.txt` 读取数据）。

In [53]:
import numpy as np
def get_data(filename):
    """Load a whitespace-delimited numeric dataset from a text file.

    Every non-blank line is split on whitespace; all fields but the last
    are taken as feature values and the last field as the label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(dataMatrix, dataLabel)`` of float NumPy arrays: one row of
        features per non-blank line, and a 1-D array with one label per line.

    Raises:
        ValueError: If a field cannot be converted to float, or rows have
            differing numbers of fields.
    """
    feature_rows = []
    labels = []
    # Read-only mode is enough; 'r+' would fail on files that are not writable.
    with open(filename, 'r') as file:
        for line in file:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            feature_rows.append(fields[:-1])
            labels.append(fields[-1])

    dataMatrix = np.array(feature_rows, dtype=float)
    dataLabel = np.array(labels, dtype=float)
    return (dataMatrix, dataLabel)

def get_stump_classify(dataMatrix,dimen,threshold_val,threshold_type):
    """Classify every sample with a single-feature threshold rule.

    Args:
        dataMatrix: 2-D array of samples (rows) by features (columns).
        dimen: Index of the feature column the rule looks at.
        threshold_val: Cut-off value compared against that column.
        threshold_type: ``'lt'`` labels samples with value <= threshold as
            -1; any other value labels samples with value > threshold as -1.

    Returns:
        An ``(n_sample, 1)`` float array holding +1.0 or -1.0 per sample.
    """
    labels = np.ones((dataMatrix.shape[0], 1))
    column = dataMatrix[:, dimen]

    # Select the side of the threshold that gets the negative label.
    if threshold_type == 'lt':
        negative_side = column <= threshold_val
    else:
        negative_side = column > threshold_val

    labels[negative_side] = -1.0
    return labels

def get_stump(dataMatrix,dataLabel,D):
    """Find the decision stump with the smallest weighted error.

    For each feature, candidate thresholds are laid out in 10 equal steps
    across the feature's value range (one step below the minimum up to the
    maximum), and both rule directions ('lt' and 'gt') are tried.

    Args:
        dataMatrix: 2-D array of samples (rows) by features (columns).
        dataLabel: Labels, one per sample; reshaped to a column internally.
        D: Sample weights; used as ``D.T`` against an ``(n_sample, 1)``
            error column, so an ``(n_sample, 1)`` array is expected.

    Returns:
        A tuple ``(min_error, best_stump, best_predict_label)``: the lowest
        weighted error found, a dict with keys ``'dim'``,
        ``'threshold_val'`` and ``'threshold_type'`` describing the stump,
        and that stump's ``(n_sample, 1)`` predictions.
    """
    n_sample, n_feature = dataMatrix.shape
    targets = dataLabel.reshape((n_sample, 1))
    step_cnt = 10.0

    min_error = np.inf
    best_stump = {}
    best_predict_label = np.ones((n_sample, 1))

    for feature in range(n_feature):
        low = min(dataMatrix[:, feature])
        high = max(dataMatrix[:, feature])
        stride = float((high - low) / step_cnt)

        for step in range(-1, int(step_cnt) + 1):
            # The threshold is the same for both rule directions.
            cut = low + stride * step

            for direction in ('lt', 'gt'):
                guess = get_stump_classify(dataMatrix=dataMatrix,
                                           dimen=feature,
                                           threshold_val=cut,
                                           threshold_type=direction)

                # 1.0 where the stump is wrong, 0.0 where it is right.
                missed = np.where(guess == targets, 0.0, 1.0)
                weighted = np.dot(D.T, missed)[0][0]

                # Strict '<' keeps the first stump found among ties.
                if weighted < min_error:
                    min_error = weighted
                    best_stump = {
                        'dim': feature,
                        'threshold_val': cut,
                        'threshold_type': direction,
                    }
                    best_predict_label = guess.copy()

    return min_error, best_stump, best_predict_label

def AdaBoostDS(dataMatrix,dataLabel,baseModelCnt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump under the current sample weights,
    computes its vote weight ``alpha`` from its weighted error, and
    re-weights the samples so that misclassified ones count more in the
    next round. Training stops early once the aggregated ensemble
    classifies every training sample correctly.

    Args:
        dataMatrix: 2-D array of samples (rows) by features (columns).
        dataLabel: Labels, one per sample. They are compared against
            ``np.sign`` of the aggregated vote, so +1/-1 labels are expected.
        baseModelCnt: Maximum number of boosting rounds (stumps).

    Returns:
        A list of stump dicts as produced by ``get_stump``, each extended
        with an ``'alpha'`` entry holding its vote weight.
    """
    n_sample = dataMatrix.shape[0]
    dataLabel = dataLabel.reshape((n_sample, 1))
    # Start from uniform sample weights.
    D = np.ones((n_sample, 1)) / n_sample
    AdaBoost_Model = []
    agg_predict_label = np.zeros((n_sample, 1))

    for _ in range(baseModelCnt):
        min_error, best_stump, best_predict_label = get_stump(dataMatrix, dataLabel, D)

        # The small epsilon avoids division by zero for a perfect stump.
        alpha = float(0.5 * np.log((1 - min_error) / (min_error + 1e-16)))
        best_stump['alpha'] = alpha
        AdaBoost_Model.append(best_stump)

        # label * prediction is +1 when correct and -1 when wrong, so wrong
        # samples are scaled by exp(alpha) and correct ones by exp(-alpha).
        D = D * np.exp(-1.0 * alpha * dataLabel * best_predict_label)
        D = D / np.sum(D)

        agg_predict_label += alpha * best_predict_label

        # Count the MISclassified samples. The earlier code compared with
        # '==' and therefore measured accuracy, so it never stopped when
        # the training error reached zero.
        misclassified = np.sign(agg_predict_label) != dataLabel
        error_rate = np.sum(misclassified) / n_sample

        if error_rate == 0:
            break

    return AdaBoost_Model

def predict(model,dataMatrix):
    """Classify samples with a trained ensemble of decision stumps.

    Args:
        model: Sequence of stump dicts, each with keys ``'alpha'``,
            ``'dim'``, ``'threshold_val'`` and ``'threshold_type'``.
        dataMatrix: 2-D array of samples (rows) by features (columns).

    Returns:
        ``np.sign`` of the alpha-weighted sum of the stump outputs, with
        length-1 axes squeezed away (one value per sample).
    """
    weighted_votes = np.zeros((dataMatrix.shape[0], 1))

    for stump in model:
        vote_weight = stump['alpha']
        stump_output = get_stump_classify(dataMatrix,
                                          stump['dim'],
                                          stump['threshold_val'],
                                          stump['threshold_type'])
        weighted_votes += vote_weight * stump_output

    return np.squeeze(np.sign(weighted_votes))


### 加载数据

In [51]:

# Load the training split from the data/ directory via get_data.
filename = 'data/horseColicTraining2.txt'
data_train,label_train = get_data(filename)

# Load the test split the same way; `filename` is reused for the second path.
filename = 'data/horseColicTest2.txt'
data_test,label_test = get_data(filename)

# Print the array shapes of both splits as a quick sanity check.
print("训练集：",data_train.shape)
print("训练集标签：",label_train.shape)
print("测试集：",data_test.shape)
print("测试集标签：",label_test.shape)


训练集： (299, 21)
训练集标签： (299,)
测试集： (67, 21)
测试集标签： (67,)


### 模型构建和预测

In [67]:
# Ensemble sizes (maximum boosting rounds) to compare.
baseModelCntList=[1,10,50,100,500,1000,5000,10000]

for baseModelCnt in baseModelCntList:

    # Train a fresh ensemble from scratch for this round budget.
    adaboost_model=AdaBoostDS(data_train,label_train,baseModelCnt)

    # Training error: share of training samples predicted incorrectly.
    pre_label_train=predict(adaboost_model,data_train)
    error_rate_train=1-np.sum(pre_label_train==label_train)/len(label_train)
    error_rate_train=round(error_rate_train,3)

    # Test error. The predictions get their own name so they no longer
    # overwrite the training predictions held in `pre_label_train`.
    pre_label_test=predict(adaboost_model,data_test)
    error_rate_test=1-np.sum(pre_label_test==label_test)/len(label_test)
    error_rate_test=round(error_rate_test,3)

    # NOTE: the 'errer_rate' spelling is kept so the printed lines match
    # previously recorded output.
    print('iters={0},训练集:errer_rate={1},测试集:error_rate={2}'.format(baseModelCnt,error_rate_train,error_rate_test))


iters=1,训练集:errer_rate=0.284,测试集:error_rate=0.269
iters=10,训练集:errer_rate=0.231,测试集:error_rate=0.239
iters=50,训练集:errer_rate=0.187,测试集:error_rate=0.209
iters=100,训练集:errer_rate=0.191,测试集:error_rate=0.224
iters=500,训练集:errer_rate=0.157,测试集:error_rate=0.254
iters=1000,训练集:errer_rate=0.14,测试集:error_rate=0.313
iters=5000,训练集:errer_rate=0.11,测试集:error_rate=0.328
iters=10000,训练集:errer_rate=0.11,测试集:error_rate=0.328
