## GBM L2

Gradient Boosting 的前奏。选用简单的 MSE 来寻找残差的最佳划分。

这里有一个很好的解释，[GBM - MSE](http://explained.ai/gradient-boosting/L2-loss.html)

实现一个最常用的 GBM。它通过不断优化 MSE（也称为 L2 cost）来完成建模。L2 的缺点是对 outlier 点敏感。

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [166]:
def data():
    """Read the tab-separated file ``rent.txt`` and return it as a DataFrame."""
    frame = pd.read_csv('rent.txt', sep='\t')
    return frame

class DecisionTree(object):
    '''
        Regression decision stump: a single threshold split on one feature.

        ``model`` searches every feature for the threshold that minimises the
        total squared error of the two resulting groups; ``predict`` returns
        the (rounded) mean target of the group each sample falls into.
    '''
    def __init__(self):
        return

    @staticmethod
    def _sse(values):
        # Sum of squared deviations of ``values`` from their own mean.
        return float(np.sum((values - np.mean(values)) ** 2))

    def model(self, X, y):
        '''
            Fit the stump.

            X: 2-D array, one row per sample and one column per feature.
            y: 1-D array of numeric targets, one per row of X.

            Sets ``split_feature``, ``split_val``, ``lmean`` and ``rmean``
            and returns ``self``.
        '''
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        n_features = X.shape[1]

        # If no feature offers two distinct values there is nothing to split
        # on; both leaves then predict the overall mean of y.
        fallback = round(np.mean(y), 2) if y.size else 0.0
        split_feature, split_val = 0, 0
        lmean = rmean = fallback
        min_sse = np.inf

        for j in range(n_features):
            column = X[:, j]
            levels = np.unique(column)  # sorted distinct values
            # Candidate thresholds are midpoints between neighbouring values.
            candidates = (levels[:-1] + levels[1:]) / 2
            for c in candidates:
                left, right = y[column <= c], y[column > c]
                if left.size == 0 or right.size == 0:
                    continue
                # Total squared error of the split. Adding plain variances
                # would ignore group sizes and favour isolating single points.
                sse = self._sse(left) + self._sse(right)
                if sse <= min_sse:  # on ties the later candidate wins
                    min_sse, split_feature, split_val = sse, j, c
                    # Leaf values are stored rounded to two decimals.
                    lmean = round(np.mean(left), 2)
                    rmean = round(np.mean(right), 2)

        self.split_feature = split_feature
        self.split_val = split_val
        self.lmean, self.rmean = lmean, rmean
        return self

    def predict(self, X):
        '''
            Return one prediction per row of X: ``rmean`` where the split
            feature is greater than ``split_val``, otherwise ``lmean``.
        '''
        X = np.asarray(X)
        goes_right = X[:, self.split_feature] > self.split_val
        y = np.ones(X.shape[0]) * self.lmean
        y[goes_right] = self.rmean
        return y

In [188]:
class BoostMse(object):
    '''
        Additive model for squared-error boosting.

        The prediction starts at the mean of the training targets (``f0``)
        and each round adds ``alpha`` times a DecisionTree stump fitted to
        the current residuals.
    '''
    def __init__(self):
        self.trees = []
        self.f0 = 0
        return

    def model(self, X, y, alpha = 1, iters = 10, verbose = True):
        '''
            Fit the ensemble and return ``self``.

            X: 2-D array of features, one row per sample.
            y: 1-D array of targets.
            alpha: factor applied to every stump's prediction.
            iters: number of stumps to fit.
            verbose: when true, print the round index, targets, current
                prediction and residuals before each stump is fitted.
        '''
        self.alpha, self.iters = alpha, iters
        # Start from a clean list so that fitting again does not keep the
        # stumps of a previous fit.
        self.trees = []
        self.f0 = np.mean(y)
        yHat = np.ones(X.shape[0]) * self.f0
        for t in range(iters):
            residuals = y - yHat
            if verbose:
                print(t, y, yHat, residuals)
            # Each new stump is fitted to what the current model still misses.
            tree = DecisionTree().model(X, residuals)
            yHat = yHat + alpha * tree.predict(X)
            self.trees.append(tree)

        return self

    def predict(self, X):
        '''
            Return ``f0`` plus ``alpha`` times the prediction of every fitted
            stump, one value per row of X.
        '''
        yHat = np.ones(X.shape[0]) * self.f0
        for tree in self.trees:
            yHat = yHat + self.alpha * tree.predict(X)

        return yHat

    def printTrees(self):
        '''
            Print ``f0`` followed by the split feature, split value and the
            two leaf means of every fitted stump.
        '''
        print('F0:', self.f0)
        for i, tree in enumerate(self.trees):
            print('Tree ', i)
            print('\t split feature:', tree.split_feature)
            print('\t split value:', tree.split_val)
            print('\t left mean', tree.lmean)
            print('\t right mean', tree.rmean)

In [189]:
# Load the table; the last column is the target, the others are features.
df = data()
table = df.values
X = table[:, :-1]
y = table[:, -1]
# Fit three boosting rounds with alpha = 1, then show the stumps and the
# predictions on the training rows.
bmse = BoostMse()
bmse.model(X, y, alpha=1, iters=3)
bmse.printTrees()
bmse.predict(X)

0 [1200 1280 2000 1450 1160] [1418. 1418. 1418. 1418. 1418.] [-218. -138.  582.   32. -258.]
1 [1200 1280 2000 1450 1160] [1272.5 1272.5 2000.  1272.5 1272.5] [ -72.5    7.5    0.   177.5 -112.5]
2 [1200 1280 2000 1450 1160] [1180.   1334.17 2061.67 1334.17 1180.  ] [ 20.   -54.17 -61.67 115.83 -20.  ]
F0: 1418.0
Tree  0
	 split feature: 0
	 split feature: 925.0
	 left mean -145.5
	 right mean 582.0
Tree  1
	 split feature: 0
	 split feature: 825.0
	 left mean -92.5
	 right mean 61.67
Tree  2
	 split feature: 0
	 split feature: 925.0
	 left mean 15.41
	 right mean -61.67


array([1195.41, 1349.58, 2000.  , 1349.58, 1195.41])