In [1]:
import numpy as np

In [2]:
import pandas

In [3]:
from sklearn.datasets import make_regression

In [4]:
X, y = make_regression(n_features=5, n_samples=10000, noise=10)

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
dt = []
np.random.seed(42)
for i in range(10):
    boot_ind = np.random.choice(1000, replace=True, size=1000)
    X_bootstrap = X_train[boot_ind]
    y_bootstrap = y_train[boot_ind]
    
    dt_boot = DecisionTreeRegressor()
    dt_boot.fit(X_bootstrap, y_bootstrap)
    dt.append(dt_boot)

In [8]:
prediction = np.zeros(len(X_test))
for i in range(10):
    prediction += dt[i].predict(X_test)
prediction /= len(dt)    


In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
mean_squared_error(y_test, prediction)

1737.965053402304

In [11]:
real_dt = DecisionTreeRegressor()

In [12]:
real_dt.fit(X_train ,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [13]:
mean_squared_error(y_test, real_dt.predict(X_test))

1905.4625575625544

# Stacking

In [14]:
dt_list = []
for i in range(5):
    dt = DecisionTreeRegressor(max_depth=i + 3)
    dt.fit(X_train, y_train)
    dt_list.append(dt)


In [15]:
from sklearn.model_selection import KFold

In [16]:
cv = KFold(5)

In [18]:
meta_features = []

for i in range(5):
    dt = DecisionTreeRegressor(max_depth=i+3)
    meta_feature = np.zeros(len(X_train))
    
    for train_indeces, test_indeces in cv.split(X_train):
        dt.fit(X_train[train_indeces], y_train[train_indeces])
        meta_feature[test_indeces] = dt.predict(X_train[test_indeces])
        
    meta_features.append(meta_feature)

In [19]:
test_meta_features = []
for i in range(5):
    dt = DecisionTreeRegressor(max_depth=i+3)
    dt.fit(X_train, y_train)
    test_meta_features.append(dt.predict(X_test))

In [20]:
meta_features = np.array(meta_features).T
test_meta_features = np.array(test_meta_features).T

In [27]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(meta_features, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
mean_squared_error(y_test, lin_reg.predict(test_meta_features))

3567.1587814940317