In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [2]:
# Load the Boston housing data and assemble a single frame with the feature
# columns first and the target column "MEDV" last, then hold out 10% for testing.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the pinned version.
data = load_boston()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame({"MEDV": data.target})
df = pd.concat([X, y], axis="columns")
df_train, df_test = train_test_split(df, random_state=1, test_size=0.1)

In [3]:
# Settings for the hand-rolled gradient-boosting run below.
BOOSTING_ITERATIONS = 5  # number of trees fitted in the boosting loop
Y_COL = "MEDV"           # name of the target column
MIN_LEAF = 20            # passed to the trees as min_samples_leaf
MAX_DEPTH = 3            # passed to the trees as max_depth
LR = 0.1                 # learning rate: factor applied to each tree's prediction
# Take the feature names from X rather than slicing df_train.columns[:-1]:
# the boosting loop appends error_* columns to df_train, so a positional slice
# would pull the target and earlier residuals into the features if this cell
# were re-run afterwards.
feature_cols = X.columns

In [4]:
# Gradient boosting for squared error, written out by hand:
#   1. start from a constant prediction, the mean of the training target;
#   2. store the current residuals in df_train as error_{i};
#   3. fit a shallow regression tree to those residuals;
#   4. shrink the tree's prediction by LR and subtract it in the next round.
# Produces: y_mean (baseline), trees (fitted trees, in order) and the
# error_0 .. error_{BOOSTING_ITERATIONS-1} columns in df_train.
y_mean = df_train[Y_COL].mean()
y_pred = y_mean
# Track the column being fitted in a local name so the Y_COL constant is not
# overwritten; rebinding Y_COL made a second run of this cell start from the
# last residual column instead of the real target.
target_col = Y_COL
trees = []
for i in range(BOOSTING_ITERATIONS):
    residual_col = f"error_{i}"
    # Residual still unexplained after the previous step.
    df_train[residual_col] = df_train[target_col] - y_pred
    target_col = residual_col
    cart = DecisionTreeRegressor(
        min_samples_leaf=MIN_LEAF,
        max_depth=MAX_DEPTH,
        # Fixed seed (same value as the train/test split) so that ties between
        # equally good splits are resolved identically on every run.
        random_state=1,
    )
    cart.fit(df_train[feature_cols], df_train[target_col])
    # Only the latest tree's shrunken contribution; it is subtracted from the
    # current residual column at the top of the next iteration.
    y_pred = LR * cart.predict(df_train[feature_cols])
    trees.append(cart)
    print(f"\rRunning iteration {i+1}", end="")

Running iteration 1Running iteration 2Running iteration 3Running iteration 4Running iteration 5

In [5]:
# Inspect the training frame; after the boosting loop it carries one residual
# column per iteration (error_0 ... error_4) next to the features and MEDV.
df_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,error_0,error_1,error_2,error_3,error_4
242,0.10290,30.0,4.93,0.0,0.428,6.358,52.9,7.0355,6.0,300.0,16.6,372.75,11.22,22.2,-0.353187,-0.252786,-0.115060,-0.097253,-0.000802
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7,6.146813,5.958469,5.796168,5.193316,5.026922
168,2.30040,0.0,19.58,0.0,0.605,6.319,96.1,2.1000,5.0,403.0,14.7,297.09,11.10,23.8,1.246813,1.347214,1.484940,1.502747,1.599198
490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68,8.1,-14.453187,-13.444059,-12.956841,-12.528200,-11.616083
62,0.11027,25.0,5.13,0.0,0.453,6.456,67.8,7.2255,8.0,284.0,19.7,396.90,6.73,22.2,-0.353187,-0.541531,-0.703832,-0.686024,-0.852419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,0.03548,80.0,3.64,0.0,0.392,5.876,19.1,9.2203,1.0,315.0,16.4,395.18,9.25,20.9,-1.653187,-1.841531,-2.003832,-1.986024,-2.152419
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52,22.8,0.246813,0.058469,-0.103832,-0.086024,-0.252419
396,5.87205,0.0,18.10,0.0,0.693,6.405,96.0,1.6768,24.0,666.0,20.2,396.90,19.37,12.5,-10.053187,-9.044059,-8.556841,-8.128200,-7.216083
235,0.33045,0.0,6.20,0.0,0.507,6.086,61.5,3.6519,8.0,307.0,17.4,376.75,10.88,24.0,1.446813,1.547214,1.684940,1.702747,1.799198


In [5]:
# Second look at the training frame. The recorded residual values differ from
# the previous display, so this output presumably comes from another run of the
# boosting loop — NOTE(review): confirm and keep only one of the two cells.
df_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,error_0,error_1,error_2,error_3,error_4
242,0.10290,30.0,4.93,0.0,0.428,6.358,52.9,7.0355,6.0,300.0,16.6,372.75,11.22,22.2,-0.353187,-0.152330,-0.142328,-0.133326,-0.048883
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7,6.146813,5.733194,5.743196,5.752198,5.442270
168,2.30040,0.0,19.58,0.0,0.605,6.319,96.1,2.1000,5.0,403.0,14.7,297.09,11.10,23.8,1.246813,1.447670,1.457672,1.466674,1.551117
490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68,8.1,-14.453187,-13.444059,-12.969623,-12.542631,-11.762832
62,0.11027,25.0,5.13,0.0,0.453,6.456,67.8,7.2255,8.0,284.0,19.7,396.90,6.73,22.2,-0.353187,-0.766806,-0.756804,-0.747802,-1.057730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,0.03548,80.0,3.64,0.0,0.392,5.876,19.1,9.2203,1.0,315.0,16.4,395.18,9.25,20.9,-1.653187,-2.066806,-2.056804,-2.047802,-1.963359
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52,22.8,0.246813,-0.166806,-0.156804,-0.147802,-0.457730
396,5.87205,0.0,18.10,0.0,0.693,6.405,96.0,1.6768,24.0,666.0,20.2,396.90,19.37,12.5,-10.053187,-9.044059,-8.569623,-8.142631,-7.362832
235,0.33045,0.0,6.20,0.0,0.507,6.086,61.5,3.6519,8.0,307.0,17.4,376.75,10.88,24.0,1.446813,1.647670,1.657672,1.666674,1.751117


In [6]:
# Scratch check of str.startswith, the predicate used to pick the error_* columns.
"hello".startswith("he")

True

In [9]:
# NOTE(review): in the visible notebook error_cols is only assigned in the
# following cells, so this cell raises NameError on a fresh Restart & Run All
# unless it is moved below them.
error_cols

Index(['error_0', 'error_1', 'error_2', 'error_3', 'error_4'], dtype='object')

In [8]:
# Mean squared residual per boosting iteration: pick the error_* columns,
# square them element-wise and average each column.
is_error_col = df_train.columns.str.startswith("error")
error_cols = df_train.columns[is_error_col]
df_train[error_cols].pow(2).mean()

error_0    83.475105
error_1    71.254619
error_2    61.029100
error_3    52.940515
error_4    45.975453
dtype: float64

In [6]:
# Same per-iteration mean squared residual as the earlier cell: the columns
# whose name starts with "error" are squared and then averaged column-wise.
error_cols = df_train.columns[df_train.columns.str.startswith("error")]
squared_errors = df_train[error_cols] ** 2
squared_errors.mean()

error_0    83.475105
error_1    70.836472
error_2    59.809802
error_3    50.878200
error_4    43.772540
dtype: float64

In [None]:
# Ensemble prediction on the training features: the constant baseline (mean of
# the training target) plus every fitted tree's output scaled by the learning
# rate, i.e.  y_pred = y_mean + LR * tree_1(x) + LR * tree_2(x) + ...
# The target name is spelled out instead of read from Y_COL in case Y_COL has
# been rebound to a residual column by the time this cell runs.
y_mean = df_train["MEDV"].mean()
y_pred = y_mean + LR * sum(tree.predict(df_train[feature_cols]) for tree in trees)