In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import copy
from sklearn.metrics import r2_score

In [2]:
# 对数据集的说明：
# CRIM：城镇人均犯罪率。
# ZN：住宅用地超过 25000 sq.ft. 的比例。
# INDUS：城镇非零售商用土地的比例。
# CHAS：查理斯河虚拟变量（如果地块毗邻河流，则为 1；否则为 0）。
# NOX：一氧化氮浓度。
# RM：住宅平均房间数。                             
# AGE：1940 年之前建成的自用房屋比例。
# DIS：到波士顿五个中心区域的加权距离。
# RAD：辐射性公路的接近指数。
# TAX：每 10000 美元的全值财产税率。
# PTRATIO：城镇师生比例。
# B：1000（Bk-0.63）^ 2，其中 Bk 指代城镇中黑人的比例。
# LSTAT：人口中地位低下者的比例。
# MEDV：自住房的平均房价，以千美元计。

In [3]:
def loadData(filePath):
    """Read a CSV file and split it into features and (optionally) the target.

    Parameters
    ----------
    filePath : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Returned when the file has a ``MEDV`` column. ``y`` is that column and
        ``X`` holds every other column in file order.
    X : numpy.ndarray
        Returned on its own when the file has no ``MEDV`` column.
    """
    data = pd.read_csv(filePath)
    values = data.values
    if 'MEDV' not in data.columns:
        return values
    # Locate the target by name rather than assuming it is the last column;
    # the result is the same as the positional slice when MEDV is last.
    targetPos = list(data.columns).index('MEDV')
    y = values[:, targetPos]
    X = np.delete(values, targetPos, axis=1)
    return X, y

In [4]:
class TreeNode:
    """One node of a regression tree.

    An internal node stores a split (``featureID`` / ``featureValue``) and two
    children. A leaf stores only ``value``; a non-None ``value`` is what marks
    a node as a leaf during prediction.
    """

    def __init__(self):
        self.featureID = None      # column index this node splits on
        self.featureValue = None   # threshold (continuous) or matched value (discrete)
        self.leftBranch = None
        self.rightBranch = None
        # Row positions (relative to the sample that reached this node) sent to
        # each child; filled in by GBDT.buildTree for internal nodes.
        self.leftList = None
        self.rightList = None
        self.value = None          # leaf prediction


class GBDT:
    """Boosted regression trees fitted to squared-error residuals.

    Each round builds one tree on the current residuals and subtracts that
    tree's predictions from them. The final prediction is the plain sum of all
    tree outputs, starting from zero.
    """

    def __init__(self):
        pass

    @staticmethod
    def _sse(values):
        """Sum of squared deviations of ``values`` from their own mean."""
        return np.square(values - values.mean()).sum()

    def _candidateSplits(self, column, col):
        """Yield ``(value, leftMask, rightMask)`` for each candidate split of one column.

        Discrete columns are split one-vs-rest: rows equal to ``value`` go
        right, all others go left. Continuous columns are split at the midpoint
        between consecutive distinct values: ``<= value`` goes left.
        """
        distinct = set(column)
        if len(distinct) == 1:
            return
        if self.isdispersed[col]:
            for val in distinct:
                yield val, column != val, column == val
        else:
            ordered = sorted(distinct)
            for lower, upper in zip(ordered, ordered[1:]):
                val = (lower + upper) / 2
                yield val, column <= val, column > val

    # Find the split that minimises the summed squared error of both sides.
    def findBestSplitPoint(self, X, residual):
        """Search every feature for the best split.

        Returns ``(featureID, featureValue, leftList, rightList)``, where the
        two lists are integer row positions into ``X``. ``featureID`` is -1
        (and the lists are None) when no usable split exists.
        """
        featureID, featureValue, leftList, rightList = -1, -1, None, None
        # Start from infinity so targets of any scale can still produce a split.
        err = np.inf
        rows = np.arange(X.shape[0])
        for col in range(X.shape[1]):
            for val, leftMask, rightMask in self._candidateSplits(X[:, col], col):
                _leftList, _rightList = rows[leftMask], rows[rightMask]
                # An empty side has no mean; skip it (a rounded midpoint can
                # coincide with the upper value and leave the right side empty).
                if _leftList.size == 0 or _rightList.size == 0:
                    continue
                tmpErr = self._sse(residual[_leftList]) + self._sse(residual[_rightList])
                if tmpErr < err:
                    err, featureID, featureValue = tmpErr, col, val
                    leftList, rightList = _leftList, _rightList
        return featureID, featureValue, leftList, rightList

    # Grow a tree that fits the residuals (the squared-loss gradient).
    def buildTree(self, X, residual, depth):
        """Recursively build a tree on ``(X, residual)`` and return its root.

        A node becomes a leaf holding the mean residual when ``depth`` reaches
        ``maxDepth``, when it holds at most ``minLeafSamples`` rows, or when no
        split is available.
        """
        curNode = TreeNode()
        if depth >= self.maxDepth or X.shape[0] <= self.minLeafSamples:
            curNode.value = residual.mean()
            return curNode
        featureID, featureValue, leftList, rightList = self.findBestSplitPoint(X, residual)
        if featureID == -1:
            curNode.value = residual.mean()
            return curNode
        curNode.featureID, curNode.featureValue = featureID, featureValue
        curNode.leftList, curNode.rightList = leftList, rightList
        curNode.leftBranch = self.buildTree(X[leftList, :], residual[leftList], depth + 1)
        curNode.rightBranch = self.buildTree(X[rightList, :], residual[rightList], depth + 1)
        return curNode

    # Predict one sample with one tree.
    def __predict(self, root, x):
        """Walk from ``root`` to a leaf following the splits and return the leaf value."""
        node = root
        while node.value is None:
            observed = x[node.featureID]
            if self.isdispersed[node.featureID]:
                goLeft = observed != node.featureValue
            else:
                goLeft = observed <= node.featureValue
            node = node.leftBranch if goLeft else node.rightBranch
        return node.value

    def fit(self, X, y, treeNum = 10, maxDepth = 10, minLeafSamples = 10):
        """Fit ``treeNum`` trees sequentially, printing the training MSE after each.

        Parameters
        ----------
        X : numpy.ndarray of shape (N, m)
        y : array-like of length N; it is not modified.
        treeNum : number of weak learners.
        maxDepth : maximum depth of each tree.
        minLeafSamples : nodes with this many rows or fewer are not split.
        """
        self.X, self.y = X, y
        self.treeNum = treeNum
        self.maxDepth = maxDepth
        self.minLeafSamples = minLeafSamples
        # Float copy: integer targets would otherwise truncate every residual
        # update, and the caller's y must stay untouched.
        self.residual = np.array(y, dtype=float)
        self.treeList = []
        self.N, self.m = X.shape

        # One flag per FEATURE: columns with fewer than 10 distinct values are
        # treated as discrete, the rest as continuous.
        self.isdispersed = np.array(
            [len(set(X[:, col])) < 10 for col in range(self.m)], dtype=bool
        )

        # Boosting rounds: fit a tree to the residuals, then remove its output.
        for m in range(treeNum):
            tree = self.buildTree(self.X, self.residual, 0)
            self.treeList.append(tree)
            for i in range(self.N):
                self.residual[i] -= self.__predict(tree, self.X[i])
            print('step = %d, MSE = %f.' % (m + 1, np.square(self.residual).sum() / self.N))

    def predict(self, X):
        """Return, for each row of ``X``, the sum of all fitted trees' outputs."""
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            for tree in self.treeList:
                y_pred[i] += self.__predict(tree, X[i])
        return y_pred

In [5]:
# Load the labelled training data and hold out 20% of it for validation.
trainFilePath = '../boston_housing/train.csv'
X, y = loadData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [6]:
# Train the boosted trees on the training split; fit() prints the training
# MSE after every boosting round.
model = GBDT()
model.fit(
    X_train,
    y_train,
    treeNum=10,
    maxDepth=10,
    minLeafSamples=15,
)

step = 1, MSE = 3.692460.
step = 2, MSE = 2.414796.
step = 3, MSE = 1.418032.
step = 4, MSE = 0.586545.
step = 5, MSE = 0.313230.
step = 6, MSE = 0.171256.
step = 7, MSE = 0.123592.
step = 8, MSE = 0.071455.
step = 9, MSE = 0.049005.
step = 10, MSE = 0.026968.


In [7]:
# Score the model on the held-out validation split.
y_pred = model.predict(X_valid)
squared_errors = (y_pred - y_valid) ** 2
valid_MSE = squared_errors.sum() / len(y_pred)
valid_RMSE = np.sqrt(valid_MSE)
r2 = r2_score(y_valid, y_pred)
print('MSE = %f, RMSE = %f, r2_score = %f.' % (valid_MSE, valid_RMSE, r2))
# Results recorded from earlier runs:
# treeNum = 10, maxDepth = 10, minLeafSamples = 10  MSE = 17.973345, RMSE = 4.239498, r2_score = 0.833910.
# treeNum = 10, maxDepth = 10, minLeafSamples = 15  MSE = 17.290799, RMSE = 4.158221, r2_score = 0.840217.

MSE = 17.290799, RMSE = 4.158221, r2_score = 0.840217.


In [8]:
# Predict the unlabelled test set and write one row per sample to result.csv.
testFilePath = '../boston_housing/test.csv'
X_test = loadData(testFilePath)
y_pred = model.predict(X_test)
res = pd.DataFrame(
    {
        'index': np.arange(len(y_pred)),
        'target': y_pred,
    }
)
# index=False: the explicit 'index' column already numbers the rows.
res.to_csv('./result.csv', index=False)