In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the concrete dataset from its CSV file into a DataFrame.
concrete = pd.read_csv(filepath_or_buffer='datasets/concrete_data.csv')

# Preview the first five rows (head()'s default count, spelled out here).
concrete.head(n=5)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Features: every column except the target.
# The CSV's saved row index is loaded as an 'Unnamed: 0' column (visible in the
# preview above, where it simply mirrors the row number). It is dropped here so
# the row position is not fed to the models as a predictor.
# errors='ignore' keeps this cell working if that column is not present.
X = concrete.drop(columns=['csMPa', 'Unnamed: 0'], errors='ignore')

# Target: the 'csMPa' column (presumably compressive strength in MPa -- confirm
# against the dataset description).
Y = concrete['csMPa']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it, every score further down changes on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

Fit a DecisionTreeRegressor to the training set

In [5]:
from sklearn.tree import DecisionTreeRegressor

In [11]:
# Stage 1: a tree capped at depth 3, fitted directly to the training targets.
# fit() hands back the estimator, so construction and fitting are chained.
tree_reg1 = DecisionTreeRegressor(max_depth=3).fit(x_train, y_train)

# Display the fitted estimator as the cell result.
tree_reg1

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

Now train a second DecisionTreeRegressor on the residual errors made by the first predictor

In [12]:
# Residuals left by stage 1 on the training rows: actual minus predicted.
y2 = y_train.sub(tree_reg1.predict(x_train))

# Peek at the first ten residuals.
y2.head(10)

964     4.071735
665    -4.824118
584     5.705882
69      8.261579
626    -6.224118
536    -6.774333
92      7.885882
754     5.931579
859   -10.708265
983     3.271735
Name: csMPa, dtype: float64

In [13]:
# Stage 2: a tree capped at depth 4, fitted to the stage-1 residuals (y2)
# rather than to the original targets.
tree_reg2 = DecisionTreeRegressor(max_depth=4).fit(x_train, y2)

# Display the fitted estimator as the cell result.
tree_reg2

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

Now we train a third regressor on the residual errors made by the second predictor

In [14]:
# Residuals still unexplained after stage 2: stage-1 residuals minus the
# stage-2 prediction of them.
y3 = y2.sub(tree_reg2.predict(x_train))

# Peek at the first ten residuals.
y3.head(10)

964    3.227978
665    1.522733
584    4.862126
69     1.948240
626    5.375813
536   -1.242437
92     0.006255
754   -0.381760
859    0.891665
983   -4.607892
Name: csMPa, dtype: float64

In [15]:
# Stage 3: a tree capped at depth 5, fitted to the stage-2 residuals (y3).
tree_reg3 = DecisionTreeRegressor(max_depth=5).fit(x_train, y3)

# Display the fitted estimator as the cell result.
tree_reg3

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

Now we have an ensemble containing three trees. It makes a prediction for a new instance by summing the predictions of all three trees.

In [16]:
# Ensemble prediction on the test set: element-wise sum of the three stages'
# predictions (stage 1 predicts the target, stages 2 and 3 predict corrections).
y_pred = (
    tree_reg1.predict(x_test)
    + tree_reg2.predict(x_test)
    + tree_reg3.predict(x_test)
)

In [17]:
from sklearn.metrics import r2_score

# R^2 of the three-tree ensemble on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8332852428276365

In [18]:
# Residuals still unexplained after stage 3: stage-2 residuals minus the
# stage-3 prediction of them.
y4 = y3.sub(tree_reg3.predict(x_train))

# Peek at the first ten residuals.
y4.head(10)

964     1.555972
665     2.382028
584    -0.096680
69     12.806784
626     0.417007
536     4.554709
92      6.438501
754    -0.866725
859     1.710637
983    -0.402830
Name: csMPa, dtype: float64

In [19]:
# Stage 4: another tree capped at depth 5, fitted to the stage-3 residuals (y4).
tree_reg4 = DecisionTreeRegressor(max_depth=5).fit(x_train, y4)

# Display the fitted estimator as the cell result.
tree_reg4

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [20]:
# Ensemble prediction with the fourth stage included: element-wise sum of all
# four stages' predictions on the test set.
y_pred = (
    tree_reg1.predict(x_test)
    + tree_reg2.predict(x_test)
    + tree_reg3.predict(x_test)
    + tree_reg4.predict(x_test)
)

# R^2 of the four-tree ensemble on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8511584064801863

### GradientBoostingRegressor
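scikit-learn provides this residual-fitting procedure as a ready-made estimator, `GradientBoostingRegressor`. Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html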

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

In [23]:
# Library gradient boosting with three depth-3 trees and no shrinkage
# (learning_rate=1.0, so each tree's contribution is added at full weight).
gbr = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=3,
    learning_rate=1.0,
).fit(x_train, y_train)

# Display the fitted estimator as the cell result.
gbr

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [26]:
# Score the fitted gradient-boosting model on the held-out test set.
y_pred = gbr.predict(x_test)

# R^2 of those predictions against the true test targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.8066564237518583

In [27]:
# Same three depth-3 trees, but with each tree's contribution scaled by 0.1.
gbr = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=3,
    learning_rate=0.1,
).fit(x_train, y_train)

# Predict on the held-out test set and report R^2.
y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.28611269160351926

In [28]:
# Keep learning_rate=0.1 but grow the ensemble to 30 depth-3 trees.
gbr = GradientBoostingRegressor(
    n_estimators=30,
    max_depth=3,
    learning_rate=0.1,
).fit(x_train, y_train)

# Predict on the held-out test set and report R^2.
y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8498205416692346