In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [2]:
class Node( ):
    def __init__(self):
        self.Value = 0
        self.Depth = 1
        self.Left = None 
        self.Right = None 
        self.Threshold = None
        self.Is_terminal = None 
        self.Feature = None

In [3]:
class RegTree( ):
    def __init__(self,max_depth = 10, min_sample_leaf = 1,min_sample_split = 2):
        self.max_depth = max_depth 
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        self.Tree = None
        
    def Avg(self,y):
        return sum(y) / y.shape[ 0 ]
        
    def MSE( self,y,c ):
        return sum((y-c)**2)
    
    def BestSplitFeature(self,X,y):
        BestFeature = None
        Threshold = None
        MinMse = np.inf 
        for index in range( X.shape[ 1 ]):
            fea_i = X[ :,index ]
            max_fea_i = max(fea_i)
            min_fea_i = min(fea_i)
            step = (max_fea_i - min_fea_i) /10
            n = 1
            while min_fea_i + n * step < max_fea_i:
                threshold = min_fea_i + n * step
                R1 = y[ fea_i <= threshold ]
                R2 = y[ fea_i > threshold ]
                
                if R1.shape[ 0 ]==0 or R2.shape[ 0 ]==0:
                    continue
                    
                c1 = self.Avg( R1 )
                c2 = self.Avg( R2 )
                mse = self.MSE(R1,c1) + self.MSE(R2,c2)
                
                if MinMse > mse:
                    MinMse = mse
                    BestFeature = index
                    Threshold = threshold
                n += 1
        X_col = X[:,BestFeature]
        X_R1,X_R2 = X[ X_col<= Threshold,: ],X[ X_col > Threshold,: ]
        y_R1,y_R2 = y[ X_col<= Threshold ],y[ X_col > Threshold ]
        
        return BestFeature,Threshold,X_R1,y_R1,X_R2,y_R2
    
    def CreateTree(self,X,y,node):
        if node.Depth > self.max_depth or X.shape[ 0 ] <= self.min_sample_split:
            node.Is_terminal = True
            return 
        
        splitCol,thresh,X_left,y_left,X_right,y_right = self.BestSplitFeature( X,y )
        if X_left.shape[ 0 ] < self.min_sample_leaf or X_right.shape[ 0 ] < self.min_sample_leaf:
            node.Is_terminal = True
            return 
        
        node.Feature = splitCol
        node.Threshlod = thresh
        
        node.Left = Node()
        node.Left.Depth = node.Depth + 1
        node.Left.Value = self.Avg( y_left )
        
        node.Right = Node()
        node.Right.Depth = node.Depth + 1
        node.Right.Value = self.Avg( y_right )
        
        self.CreateTree( X_left,y_left,node.Left )
        self.CreateTree( X_right,y_right,node.Right )
        
    def fit(self,X,y):
        self.Tree = Node()
        self.Tree.Value = 0
        self.CreateTree( X,y,self.Tree )
        
    def predictSample(self,x,node):
        if node.Is_terminal:
            return node.Value
        
        if x[node.Feature] > node.Threshlod:
            y_pred =  self.predictSample( x,node.Right )
        else:
            y_pred =  self.predictSample( x,node.Left )
            
        return y_pred
    
    def predict( self,X ):
        y_preds = [ ]
        for x in X:
            y_pred = self.predictSample(x,self.Tree)
            y_preds.append( y_pred )
        return np.asarray( y_preds ) 
    
if __name__ == "__main__":
    dates = load_boston()
    X = dates.data
    y= dates.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = RegTree(max_depth = 7, min_sample_leaf = 1,min_sample_split = 2)
    model.fit(X_train, y_train)  
    y_pred=model.predict(X_test)
    print("验证集的均方根误差（rmse）是：",np.sqrt(mean_squared_error(y_test,y_pred)))

验证集的均方根误差（rmse）是： 5.82639325667181


In [15]:
class AdaBoostReg( ):
    def __init__(self,n_reg = 20,max_depth = 10, min_sample_leaf = 1,min_sample_split = 2):
        self.models = [ ]
        self.n_reg = n_reg
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        
    def fit(self,X,y):
        y_pred = 0
        for i in range( self.n_reg  ):
            residual = y - y_pred
            tree = RegTree(max_depth = self.max_depth, min_sample_leaf = self.min_sample_leaf
                           ,min_sample_split = self.min_sample_split)
            tree.fit(X,residual)
            self.models.append( tree )
            y_pred = self.predict( X )
            
            
    def predict(self,X):
        y_pred = np.sum(np.array([ tree.predict(X) for tree in self.models ]),axis = 0)
        return y_pred

if __name__ == "__main__":
    dates = load_boston()
    X = dates.data
    y= dates.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = AdaBoostReg(n_reg = 4,max_depth = 3, min_sample_leaf = 1,min_sample_split = 2)
    model.fit(X_train, y_train)  
    y_pred = model.predict(X_test)
    print("验证集的均方根误差（rmse）是：",np.sqrt(mean_squared_error( y_test,y_pred )))

验证集的均方根误差（rmse）是： 5.18055892167202
