In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [25]:
concrete = pd.read_csv('datasets/concrete_data.csv')

concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [26]:
X = concrete.drop('csMPa', axis=1)
Y = concrete['csMPa']

In [27]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

Fit a DecisionTreeRegressor to the training set

In [28]:
from sklearn.tree import DecisionTreeRegressor

In [29]:
tree_reg1 = DecisionTreeRegressor(max_depth=3)
tree_reg1.fit(x_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

Now train a second DecisionTreeRegressor on the residual errors made by the first predictor

In [30]:
y2 = y_train - tree_reg1.predict(x_train)

y2[:10]

776     5.623669
733    -6.557755
699    -5.976331
484    -8.813514
972    -2.412913
372    -2.897755
849   -22.137755
732    -6.526331
546    -8.197755
380    -6.353514
Name: csMPa, dtype: float64

In [31]:
tree_reg2 = DecisionTreeRegressor(max_depth=4)
tree_reg2.fit(x_train, y2)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=4,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

Now we train a third regressor on the residual errors made by the second predictor

In [32]:
y3 = y2 - tree_reg2.predict(x_train)

y3[:10]

776    10.568766
733    -1.612658
699    -4.317676
484    -3.146242
972    -0.754258
372    -7.658571
849    -7.833345
732    -1.581234
546    -3.252658
380   -11.114330
Name: csMPa, dtype: float64

In [33]:
tree_reg3 = DecisionTreeRegressor(max_depth=5)
tree_reg3.fit(x_train, y3)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

Now we have an ensemble containing three trees. It can make predictions on a new instance simply by adding up all the predictions of all the trees

In [34]:
y_pred = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [35]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.8238628523574831

In [36]:
y4 = y3 - tree_reg3.predict(x_train)

y4[:10]

776    6.157825
733    1.561576
699   -1.530726
484   -9.037180
972   -1.116122
372   -5.672014
849   -4.659111
732   -5.992175
546   -0.078424
380    0.570487
Name: csMPa, dtype: float64

In [37]:
tree_reg4 = DecisionTreeRegressor(max_depth=5)
tree_reg4.fit(x_train, y4)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [38]:
y_pred = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2, tree_reg3, tree_reg4))

r2_score(y_test, y_pred)

0.8480529998998076

### GradientBoostingRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [39]:
from sklearn.ensemble import GradientBoostingRegressor

In [40]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=3, learning_rate=1.0)

gbr.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [41]:
y_pred = gbr.predict(x_test)

r2_score(y_test, y_pred)

0.7587872252267074

In [42]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=3, learning_rate=0.1)

gbr.fit(x_train, y_train)

y_pred = gbr.predict(x_test)

r2_score(y_test, y_pred)

0.29206841145984286

In [43]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=30, learning_rate=0.1)

gbr.fit(x_train, y_train)

y_pred = gbr.predict(x_test)

r2_score(y_test, y_pred)

0.8448981724556075