In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

In [2]:
class Stump():
    """Decision stump: a one-feature, one-threshold classifier for -1/+1 labels.

    Attributes written by ``fit`` (and restored to their defaults at the start
    of every ``fit`` call):
        State: polarity of the split. 0 predicts +1 where the feature is
            strictly below ``Threshold``; 1 predicts +1 where it is strictly
            above. Everything else is predicted as -1.
        MinError: weighted error of the selected split (stays at the default 1
            when no candidate threshold exists).
        BestSplitFea: column index of the selected feature.
        Threshold: cut point of the selected split.

    Args:
        n_steps: number of equal-width intervals each feature's [min, max]
            range is divided into; the interior grid points are the candidate
            thresholds. The default 10 reproduces the original behaviour.

    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """

    def __init__(self, n_steps=10):
        if n_steps < 1:
            raise ValueError("n_steps must be >= 1")
        self.n_steps = n_steps
        self._reset()

    def _reset(self):
        """Restore the split attributes to their untrained defaults."""
        self.State = 1
        self.MinError = 1
        self.BestSplitFea = 0
        self.Threshold = 0

    def Error(self, y, y_pred, weight):
        """Return the weighted error: the sum of ``weight[i]`` over samples
        where ``y[i] != y_pred[i]``."""
        not_correct_num = np.where(y != y_pred, 1, 0)
        return np.dot(weight, not_correct_num)

    def fit(self, X, y, weight):
        """Search every feature/threshold/polarity and keep the split with the
        lowest weighted error.

        Args:
            X: 2-D array, one row per sample and one column per feature.
            y: labels, compared element-wise with the -1/+1 predictions.
            weight: per-sample weights used by ``Error``.
        """
        # Start from a clean slate so a second fit never keeps a stale split
        # whose old MinError happens to be lower than anything in the new data.
        self._reset()
        # Track the running best separately from the MinError default so that
        # weights which do not sum to one (errors >= 1) still select a split.
        best_error = np.inf
        for i in range(X.shape[1]):
            fea_i = X[:, i]
            lowest = np.min(fea_i)
            highest = np.max(fea_i)
            step = (highest - lowest) / self.n_steps
            n = 1
            # A constant feature has step == 0, so this loop is skipped for it.
            while lowest + n * step < highest:
                threshold = lowest + n * step
                candidates = (
                    np.where(fea_i < threshold, 1, -1),  # State 0
                    np.where(fea_i > threshold, 1, -1),  # State 1
                )
                for j, y_pred in enumerate(candidates):
                    error = self.Error(y, y_pred, weight)
                    # Strict comparison: the first split reaching the minimum wins.
                    if error < best_error:
                        best_error = error
                        self.MinError = error
                        self.State = j
                        self.BestSplitFea = i
                        self.Threshold = threshold
                n += 1

    def predict(self, X):
        """Return an array of -1/+1 predictions for the rows of ``X`` using the
        stored feature, threshold and polarity."""
        feature = X[:, self.BestSplitFea]
        if self.State == 0:
            return np.where(feature < self.Threshold, 1, -1)
        return np.where(feature > self.Threshold, 1, -1)
    
if __name__ == "__main__":
    # Demo: train a single decision stump on the breast-cancer data set and
    # report its test accuracy together with its weighted training error.
    datas = load_breast_cancer()
    X, y = datas.data, datas.target
    # Re-encode the 0/1 targets as -1/+1, the encoding the stump predicts in.
    y = np.where(y == 0, -1, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=34, test_size=0.2
    )
    # Uniform sample weights that sum to one.
    weight = np.full(X_train.shape[0], 1 / X_train.shape[0])
    model = Stump()
    model.fit(X_train, y_train, weight)
    y_pred = model.predict(X_test)
    print('模型的准确率是:', accuracy_score(y_test, y_pred))
    print('模型的误差率是:', model.MinError)

模型的准确率是: 0.9122807017543859
模型的误差率是: 0.08131868131868131


In [3]:
class AdaBoost():
    """AdaBoost ensemble of ``Stump`` weak learners for -1/+1 labels.

    Args:
        n_stump: number of boosting rounds (one stump is trained per round).

    Attributes:
        Stumps: the trained stumps, in training order.
        alpha: the vote weight of each stump, aligned with ``Stumps``.
    """

    # Stump errors are clipped to [_EPS, 1 - _EPS] before taking the log-odds,
    # so a perfect (or perfectly wrong) stump yields a large finite alpha
    # instead of +/-inf, which would turn the sample weights into NaN.
    _EPS = 1e-10

    def __init__(self, n_stump=20):
        self.n_stump = n_stump
        self.Stumps = []
        self.alpha = []

    def Alpha(self, error):
        """Return the vote weight ``0.5 * ln((1 - error) / error)``.

        ``error`` is clipped to ``[_EPS, 1 - _EPS]`` first so the result is
        always finite.
        """
        error = np.clip(error, self._EPS, 1 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def Weight(self, y, y_pred, alpha, weight):
        """Return the re-normalised sample weights after one boosting round.

        Each weight is multiplied by ``exp(-alpha * y * y_pred)`` and the
        result is divided by its total so the new weights sum to one.
        """
        scaled = np.multiply(weight, np.exp(-alpha * np.multiply(y, y_pred)))
        return scaled / np.sum(scaled)

    def fit(self, X, y):
        """Train ``n_stump`` stumps sequentially on re-weighted samples.

        Any stumps from a previous ``fit`` call are discarded first.

        Args:
            X: 2-D array, one row per sample.
            y: labels encoded as -1/+1.
        """
        # Reset so that refitting replaces the ensemble instead of growing it.
        self.Stumps = []
        self.alpha = []
        n_samples = X.shape[0]
        weight = np.full(n_samples, 1 / n_samples)
        for _ in range(self.n_stump):
            classifier = Stump()
            classifier.fit(X, y, weight)
            new_alpha = self.Alpha(classifier.MinError)
            y_pred = classifier.predict(X)
            weight = self.Weight(y, y_pred, new_alpha, weight)
            self.alpha.append(new_alpha)
            self.Stumps.append(classifier)

    def predict(self, X):
        """Return the alpha-weighted majority vote of all stumps as -1/+1.

        A tied vote (score exactly 0) is resolved to +1 so the output never
        contains the non-label value 0.

        Raises:
            ValueError: if the ensemble holds no trained stumps.
        """
        if not self.Stumps:
            raise ValueError(
                "AdaBoost has no trained stumps; call fit with n_stump >= 1 first"
            )
        model_preds = np.array([model.predict(X) for model in self.Stumps])
        alphas = np.asarray(self.alpha)
        # Row k of model_preds is scaled by alphas[k]; summing over rows gives
        # one score per sample.
        scores = np.sum(alphas[:, None] * model_preds, axis=0)
        return np.where(scores >= 0, 1, -1)

if __name__ == "__main__":
    # Demo: boost 50 decision stumps on the breast-cancer data set and report
    # the ensemble's accuracy on the held-out split.
    datas = load_breast_cancer()
    X, y = datas.data, datas.target
    # Re-encode the 0/1 targets as -1/+1, the encoding the ensemble votes in.
    y = np.where(y == 0, -1, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=34, test_size=0.2
    )
    model = AdaBoost(n_stump=50)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('提升树的准确率是:', accuracy_score(y_test, y_pred))
        

提升树的准确率是: 0.9736842105263158
