In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

In [2]:
class Node():
    def __init__(self):
        self.Average = 0
        self.Depth = 1
        self.Left = None
        self.Right = None
        self.Feature = None
        self.Threshlod = None
        self.Is_terminal = None

In [3]:
 class DecisionTreeReg():
    def __init__(self,max_depth = 3,min_sample_leaf = 1,min_sample_split = 2):
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        self.Tree = None
        
    def Avg(self,y):
        return sum(y) / y.shape[ 0 ]
    
    def MSE( self,y,y_pred ):
        return sum((y-y_pred)**2)
    
    def BestFeatureSplit( self,X,y ):
        BestSplitFea = None
        BestThre = None
        MinMse = np.inf
        
        for i in range( X.shape[ 1 ] ):
            fea_i = X[ :,i ]
            for item_j in np.unique(fea_i):
                threshold = item_j
                y_left = y[ fea_i <= threshold ]
                y_right = y[ fea_i > threshold ]
                if y_left.shape[ 0 ] == 0 or y_right.shape[ 0 ] == 0 :
                    continue
                    
                y_pred_left = self.Avg( y_left )
                y_pred_right = self.Avg( y_right )
                mse = self.MSE( y_left,y_pred_left ) + self.MSE( y_right, y_pred_right)
                
                if MinMse > mse:
                    MinMse = mse
                    BestSplitFea = i
                    BestThre = threshold
                    
        X_col = X[ :,BestSplitFea ]
        X_left ,X_right = X[ X_col <= BestThre,: ],X[ X_col > BestThre,: ]
        y_left, y_right = y[ X_col <= BestThre ],y[ X_col > BestThre]
        
        return BestSplitFea,BestThre,X_left,y_left,X_right,y_right
    
    def CreateTree(self,X,y,node):
        
        if node.Depth > self.max_depth:
            node.Is_terminal = True
            return 
        if X.shape[ 0 ] <= self.min_sample_split:
            node.Is_terminal = True
            return 
        
        splitCol,thresh,X_left,y_left,X_right,y_right = self.BestFeatureSplit( X,y )
        if X_left.shape[ 0 ] < self.min_sample_leaf or X_right.shape[ 0 ] < self.min_sample_leaf:
            node.Is_terminal = True
            return 
        
        node.Feature = splitCol
        node.Threshlod = thresh
        
        node.Left = Node()
        node.Left.Depth = node.Depth + 1
        node.Left.Average = self.Avg( y_left )
        
        node.Right = Node()
        node.Right.Depth = node.Depth + 1
        node.Right.Average = self.Avg( y_right )
        
        self.CreateTree( X_left,y_left,node.Left )
        self.CreateTree( X_right,y_right,node.Right )
        
    def fit(self,X,y):
        self.Tree = Node()
        self.Tree.Average = 0
        self.CreateTree( X,y,self.Tree )
        
    def predictSample(self,x,node):
        if node.Is_terminal:
            return node.Average
        
        if x[node.Feature] > node.Threshlod:
            y_pred =  self.predictSample( x,node.Right )
        else:
            y_pred =  self.predictSample( x,node.Left )
            
        return y_pred
    
    def predict( self,X ):
        y_preds = [ ]
        for x in X:
            y_pred = self.predictSample(x,self.Tree)
            y_preds.append( y_pred )
        return np.asarray( y_preds ) 
    
    
if __name__ == "__main__":
    dates = load_boston()
    X = dates.data
    y= dates.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = DecisionTreeReg(max_depth = 20, min_sample_leaf = 1,min_sample_split = 2)
    model.fit(X_train, y_train)  
    y_pred=model.predict(X_test)
    print("验证集的均方根误差（rmse）是：",np.sqrt(mean_squared_error(y_test,y_pred)))

验证集的均方根误差（rmse）是： 6.1504841990166055
