In [None]:
pip install ISLP

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error
import numpy as np
from ISLP import load_data
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Read the Boston data set through ISLP and separate the response column
# (`medv`) from the remaining predictor columns.
Boston = load_data("Boston")
y = Boston['medv'].to_numpy()  # response vector
X = Boston.drop(columns='medv').to_numpy()  # every column except medv
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

My dog is not sick

In [None]:
# Fit an unpruned regression tree; it is only used to derive the pruning path.
initial_tree = DecisionTreeRegressor(random_state=42)
initial_tree.fit(X_train, y_train)

# Cost-complexity pruning path of the full tree: the sequence of effective
# alpha values at which successive subtrees are pruned away, together with the
# impurity of the tree that remains at each of those alphas.
path = initial_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Floating-point round-off can leave a tiny negative alpha in the path, and
# DecisionTreeRegressor rejects a negative ccp_alpha, so clamp the values at 0.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)
alpha_results = []  # one {"ccp_alpha", "MSE"} record per candidate alpha

# Score every candidate alpha with 5-fold cross-validation on the training data
for ccp_alpha in ccp_alphas:
    tree = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)  # tree pruned at this alpha
    scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    average_mse = -scores.mean()  # scorer returns negated MSE; flip the sign back
    alpha_results.append({"ccp_alpha": ccp_alpha, "MSE": average_mse})

cv_results_df = pd.DataFrame(alpha_results)  # tabulate alpha vs. cross-validated MSE
# Alpha with the lowest cross-validated MSE (first one wins on ties)
optimal_alpha = cv_results_df.loc[cv_results_df['MSE'].idxmin(), 'ccp_alpha']

# Refit on the whole training set with the selected alpha and evaluate once on
# the held-out test set.
optimal_tree = DecisionTreeRegressor(random_state=42, ccp_alpha=optimal_alpha)
optimal_tree.fit(X_train, y_train)
predictions = optimal_tree.predict(X_test)
optimal_test_mse = mean_squared_error(y_test, predictions)

# Print the results
print(f"Optimal CCP Alpha: {optimal_alpha}")
print(f"Optimal MSE on test data: {optimal_test_mse}")
# NOTE: this lifts the pandas row-display limit for the rest of the session so
# that the full table below can be inspected when uncommented.
pd.set_option('display.max_rows', None)
#cv_results_df - the whole dataframe: every alpha with its corresponding MSE

Optimal CCP Alpha: 0.18176217107303233
Optimal MSE on test data: 10.039659929938216


In [None]:
# BAGGING
# Ensemble of regression trees, each fitted on a random draw of the training
# data; the seed on the ensemble keeps the draws reproducible.
base_tree = DecisionTreeRegressor()
bagging_model = BaggingRegressor(estimator=base_tree, random_state=42)

# Hyper-parameter candidates (deliberately narrowed so the search runs quickly).
# Keys prefixed with `estimator__` are forwarded to the underlying tree.
param_grid = dict(
    n_estimators=[140, 150],              # number of trees in the ensemble
    max_samples=[0.9, 1.0],               # fraction of training rows drawn for each tree
    max_features=[0.9],                   # fraction of features drawn for each tree
    estimator__max_depth=[15, 16, 17],    # maximum depth of each tree
    estimator__min_samples_split=[2],     # minimum samples needed to split a node
    estimator__min_samples_leaf=[1],      # minimum samples every leaf must keep
)

# Exhaustive 5-fold cross-validated search over the grid, scored by (negated) MSE
grid_search = GridSearchCV(
    bagging_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# The winning configuration, refitted on the full training set by GridSearchCV
best_model = grid_search.best_estimator_
bagging_predictions = best_model.predict(X_test)

# Test-set error of the selected bagging model
bagging_test_mse = mean_squared_error(y_test, bagging_predictions)
print(f"GridSearchCV Bagging MSE on test data: {bagging_test_mse}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'estimator__max_depth': 17, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'max_features': 0.9, 'max_samples': 0.9, 'n_estimators': 150}
GridSearchCV Bagging MSE on test data: 10.676318108499382


In [None]:
# RANDOM FOREST
# Seeded forest so that bootstrap draws and feature sub-sampling are repeatable.
random_forest_model = RandomForestRegressor(random_state=42)

# Hyper-parameter candidates (trimmed to keep the search fast)
param_grid = dict(
    n_estimators=[100, 110, 120],   # number of trees in the forest
    max_depth=[15],                 # maximum depth of each tree
    min_samples_split=[2, 3],       # minimum samples needed to split a node
    min_samples_leaf=[1, 2],        # minimum samples every leaf must keep
    max_features=['sqrt'],          # features considered at each split
    max_samples=[1.0],              # fraction of training rows drawn for each tree
)

# Exhaustive 5-fold cross-validated search over the grid, scored by (negated) MSE
grid_search_rf = GridSearchCV(
    random_forest_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)
print(f"Best parameters: {grid_search_rf.best_params_}")

# The winning configuration, refitted on the full training set by GridSearchCV
best_model_rf = grid_search_rf.best_estimator_
rf_predictions = best_model_rf.predict(X_test)

# Test-set error of the selected random forest
rf_test_mse = mean_squared_error(y_test, rf_predictions)
print(f"GridSearchCV Random Forest MSE on test data: {rf_test_mse}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'max_depth': 15, 'max_features': 'sqrt', 'max_samples': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 110}
GridSearchCV Random Forest MSE on test data: 9.853333825932712


In [None]:
# BOOSTING
# Seeded gradient-boosting regressor used as the base model for the search
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Hyper-parameter candidates (trimmed to keep the search fast)
param_grid = dict(
    n_estimators=[180, 190, 200],       # number of boosting stages (trees)
    learning_rate=[0.05, 0.1, 0.15],    # shrinkage applied to each tree's contribution
    max_depth=[3],                      # depth of each tree (wider search: 3, 5, 7, 9, 11)
    min_samples_split=[8],              # minimum samples needed to split a node (wider: 2-10)
    min_samples_leaf=[1, 2],            # minimum samples every leaf must keep
)

# Exhaustive 5-fold cross-validated search over the grid, scored by (negated) MSE
grid_search_gb = GridSearchCV(
    gradient_boosting_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_gb.fit(X_train, y_train)
print(f"Best parameters: {grid_search_gb.best_params_}")

# The winning configuration, refitted on the full training set by GridSearchCV
best_gb_model = grid_search_gb.best_estimator_
gb_predictions = best_gb_model.predict(X_test)

# Test-set error of the selected boosting model
gb_test_mse = mean_squared_error(y_test, gb_predictions)
print(f"GridSearchCV Gradient Boosting MSE on test data: {gb_test_mse}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 190}
GridSearchCV Gradient Boosting MSE on test data: 7.907888797423849
