In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor

In [4]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42)


In [5]:
bag_model = BaggingRegressor(bootstrap_features=True).fit(X_train, y_train)

In [6]:
bag_model.n_estimators

10

In [7]:
bag_model.estimators_

[DecisionTreeRegressor(random_state=1367730337),
 DecisionTreeRegressor(random_state=1788584823),
 DecisionTreeRegressor(random_state=115703812),
 DecisionTreeRegressor(random_state=769826858),
 DecisionTreeRegressor(random_state=814064420),
 DecisionTreeRegressor(random_state=752024325),
 DecisionTreeRegressor(random_state=21349314),
 DecisionTreeRegressor(random_state=602556669),
 DecisionTreeRegressor(random_state=633353202),
 DecisionTreeRegressor(random_state=401274329)]

In [8]:
bag_model.estimators_samples_

[array([108, 159, 147, 184, 156,  31,  82, 141,   5,  63,   6,  95,   7,
        189, 143,  75, 133,  37, 193,  29, 121,   4,  92, 160, 155, 129,
        185, 121,  60, 135, 106,   7,  15, 128, 146,  71,  59,  42,  73,
         84,  12, 166, 151,  69, 187, 139,  31,   6, 164,  40,  31,  44,
         50, 118, 110,  13, 142,   9, 186,  84, 186, 172, 147,  41,  56,
        147, 113, 124, 132,  58,  50, 148,  35,  44, 104,  84, 184, 147,
         76, 145,  68, 100,  69,  59,  93,  75,  96, 174, 158,  81, 100,
         81,  20,  46, 158,  19, 117,  75, 135, 178, 166,  30, 163,  70,
        120,  41, 162, 188,  81,   5, 105,  40, 190,  83,  79,  55,  33,
        105, 136, 144,  53, 178,  30,   3, 111, 175, 114,  71, 104,   9,
        132, 122, 158, 146, 125,  46,  78,  92,  58, 179, 173,  64, 114,
        170, 168, 189, 126, 173, 194, 132,   0,  70,  26, 139,  78,  22,
         93, 194,  87,  69, 192,   0,  73,  28, 166, 185,  70,  32, 184,
         59, 169,  50,   7, 137,  21,  51, 191,  26

In [9]:
bag_model.estimators_features_

[array([ 1,  3, 13,  6,  6,  7,  0,  5, 11,  6, 13, 14, 16,  9,  8,  0,  5,
         0, 11]),
 array([ 8, 12,  3,  4, 13, 16, 13,  5,  4,  5, 15, 18,  2,  4, 16,  0,  8,
        17,  7]),
 array([ 4,  0, 17,  0, 18,  7, 10,  1,  5, 15, 14,  4, 10,  9, 16, 11,  3,
         1,  1]),
 array([10, 17,  5, 12,  8, 16, 10, 13,  8, 12, 10, 14, 10,  3,  4,  8,  5,
        16,  8]),
 array([ 4, 10,  5,  0,  6,  8,  8,  3,  7, 10, 18,  1,  1,  7, 14,  9, 11,
        18,  5]),
 array([ 5,  8, 11,  5,  0, 11, 14, 16,  5, 15,  4, 11, 12, 11, 15, 15,  1,
         8,  7]),
 array([ 2, 16, 13,  3, 15,  5, 10, 13,  1,  7,  0,  6, 11, 10,  8, 12, 14,
         9,  5]),
 array([11,  6,  3,  4, 11, 10,  7,  8, 15, 18,  0,  2, 11, 10, 16, 17, 10,
        11,  7]),
 array([18,  2,  6,  6, 15,  9,  0, 12,  4,  6, 18,  7,  9,  4,  1, 12,  1,
         7, 17]),
 array([ 1,  7, 16,  9, 16,  4,  3, 17,  8,  6, 16, 10,  8,  3,  1,  5,  5,
         6, 14])]

In [10]:
bag_model.estimators_[0]

PREDICT

In [11]:
y_pred = bag_model.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))

355.0775971909757

In [31]:
second_pred_y = bag_model.estimators_[1].fit(X_train, y_train).predict(X_test)
# you can see that the predictions are different for each estimator

In [32]:
np.sqrt(mean_squared_error(y_test, second_pred_y))

455.84559954771436

MODEL TUNING

In [33]:
bag_params = {"n_estimators": range(2, 20)}

In [34]:
from sklearn.model_selection import GridSearchCV

bag_cv_model = GridSearchCV(bag_model, bag_params, cv=10).fit(X_train, y_train)

In [35]:
bag_cv_model.best_params_

{'n_estimators': 19}

In [None]:
bag_tuned_model = BaggingRegressor(n_estimators=bag_cv_model.best_params_['n_estimators'],
                                    random_state=45).fit(X_train, y_train)
y_tuned_pred = bag_tuned_model.predict(X_test)

In [41]:
np.sqrt(mean_squared_error(y_test, y_tuned_pred))

331.6722259721113