In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

In [None]:
# Load the Boston housing data as an object exposing .data, .target and
# .feature_names (the three attributes the following cells read).
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError. In that case rebuild the same arrays from
# the original StatLib file (requires network access), following the recipe
# given in scikit-learn's removal notice.
try:
    boston_data = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # Each record in the raw file is wrapped over two physical lines:
    # 11 values on the first line, then 2 more features + the target on the second.
    _boston_raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston_data = Bunch(
        data=np.hstack([_boston_raw.values[::2, :], _boston_raw.values[1::2, :2]]),
        target=_boston_raw.values[1::2, 2],
        feature_names=np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        ),
    )

In [None]:
# Wrap the predictor matrix in a DataFrame, one named column per feature.
boston_df = pd.DataFrame(
    data=boston_data.data,
    columns=boston_data.feature_names,
)

In [None]:
# Attach the response as the column 'medv' (appended after the predictors).
boston_df = boston_df.assign(medv=boston_data.target)

In [None]:
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

# The next statement imports the method to apply Random Forest

from sklearn.ensemble import RandomForestRegressor

# In case we need to build a regression tree, let's import these two classes:

from sklearn import tree

from sklearn.tree import DecisionTreeRegressor

In [None]:
# Hold out 20% of the rows for testing. Predictors are every column except the
# last one; the response is 'medv'. The fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    boston_df.iloc[:, :-1],
    boston_df['medv'],
    test_size=0.2,
    random_state=1,
)

## Bagging (i.e., Non-random Forest)

In [None]:
# Candidate forest sizes to evaluate: 50, 60, ..., 500 trees (upper bound inclusive).

number_of_trees = np.arange(50, 500 + 1, 10)

The difference between Bagging and RF is that for the former, max_features = n_features, whereas for the latter, max_features < n_features.

When applying Bagging, how to set max_features=n_features?

From scikit-learn: If max_features is None or 1.0, then max_features=n_features

Let's use a loop to compute the __oob MSE__ for a forest trained with i trees (where i goes from 50 to 500 in steps of 10)

The attribute "bag_loop.oob_prediction_" holds, for each training observation, the prediction of Y computed only from the trees for which that observation was out-of-bag.

__Note__: The following loop is built under the assumption that each observation in the training data will be an out-of-bag observation for at least one tree. This assumption is more likely to be satisfied if the number of trees is large enough. That's why we start with at least 50 trees in this example.

In [None]:
# For every candidate forest size, fit a bagging ensemble (max_features=None,
# i.e. all features considered at each split) and record the MSE between the
# training response and the ensemble's out-of-bag predictions.
mse_bagging_oob_scores = []
for n_trees in number_of_trees:
    bag_loop = RandomForestRegressor(
        n_estimators=n_trees,
        max_features=None,
        oob_score=True,
        random_state=1,
    )
    bag_loop.fit(X_train, y_train)
    oob_mse = mean_squared_error(y_train, bag_loop.oob_prediction_)
    mse_bagging_oob_scores.append(oob_mse)

In [None]:
# Lowest OOB MSE among all the bagging forests fitted in the loop.
min(mse_bagging_oob_scores)

In [None]:
# Position (within the list of scores) of the first occurrence of the lowest
# bagging OOB MSE; the same position indexes number_of_trees.
indexmin_bagging = int(np.argmin(mse_bagging_oob_scores))
indexmin_bagging

In [None]:
# Forest size (number of trees) at which the bagging OOB MSE is lowest:
# look up the candidate size stored at position indexmin_bagging.

number_of_trees[indexmin_bagging]

#### Estimate the test MSE of the bagging (non-random) forest obtained with the number of trees equal to 180

In [None]:
# Final bagging model: 180 trees, every feature considered at each split
# (max_features=None), seeded for repeatability.
bagging_forest = RandomForestRegressor(
    n_estimators=180,
    max_features=None,
    random_state=1,
)

In [None]:
# Train the final bagging forest on the training split only.
bagging_forest.fit(X_train, y_train)

In [None]:
# Test MSE of the bagging forest: compare its predictions on the held-out
# predictors with the held-out response.
mean_squared_error(y_true=y_test, y_pred=bagging_forest.predict(X_test))

A test MSE of 8.51 is the best test error we have got so far with the Boston dataset !!! (considering all the techniques we have applied to the Boston dataset: regression trees, linear regression, non-linear regression)

With multiple linear regression, we got a root MSE in the order of 5 to 6. The root MSE for this non-random forest is less than 3!

### GO BACK TO THE SLIDES !!!

## Random Forest

### Approach 1 (the theory-based approach): Do not prune the trees and use the OOB error to decide the best parameters

Let's use a loop to compute the oob MSE for a forest trained with i trees (where i goes from 50 to 500 in steps of 10)

In Random Forest, in contrast with Bagging, we could also vary the parameter max_features and try to select its "best" value. However, __we will not do that now (we will do it later)__. We will use max_features = sqrt(p), where p is the number of predictors (sqrt(p) is a commonly recommended value for the max_features parameter).

The attribute "rf_loop.oob_prediction_" holds, for each training observation, the prediction of Y computed only from the trees for which that observation was out-of-bag.

<br>

In [None]:
# For every candidate forest size, fit a random forest that considers sqrt(p)
# features at each split and record the MSE between the training response and
# the forest's out-of-bag predictions.
mse_rf_oob_scores = []
for n_trees in number_of_trees:
    rf_loop = RandomForestRegressor(
        n_estimators=n_trees,
        max_features="sqrt",
        oob_score=True,
        random_state=1,
    )
    rf_loop.fit(X_train, y_train)
    oob_mse = mean_squared_error(y_train, rf_loop.oob_prediction_)
    mse_rf_oob_scores.append(oob_mse)

In [None]:
# Lowest OOB MSE among all the random forests fitted in the loop.
min(mse_rf_oob_scores)

In [None]:
# Position (within the list of scores) of the first occurrence of the lowest
# random-forest OOB MSE; the same position indexes number_of_trees.
indexmin_rf = int(np.argmin(mse_rf_oob_scores))
indexmin_rf

In [None]:
# Forest size (number of trees) at which the random-forest OOB MSE is lowest:
# look up the candidate size stored at position indexmin_rf.

number_of_trees[indexmin_rf]

For homework 5, you could stop in the previous step and conclude that the number of trees is 320. __Doing the following re-tuning is optional.__

__Re-tuning__: Run the loop again using a different random state value to get different trees. This time, only consider numbers of trees from 300 upward. My goal here is to be more confident that the best number of trees is actually around 320 rather than a larger number closer to 500.

In [None]:
# Narrower set of candidate forest sizes: 300, 310, ..., 500 (upper bound inclusive).
number_of_trees2 = np.arange(300, 500 + 1, 10)

In [None]:
# Repeat the OOB evaluation over the narrower size range, this time with
# random_state=2 so the forests are grown from a different random stream.

mse_rf_oob_scores2 = []
for n_trees in number_of_trees2:
    rf_loop = RandomForestRegressor(
        n_estimators=n_trees,
        max_features="sqrt",
        oob_score=True,
        random_state=2,
    )
    rf_loop.fit(X_train, y_train)
    oob_mse = mean_squared_error(y_train, rf_loop.oob_prediction_)
    mse_rf_oob_scores2.append(oob_mse)

In [None]:
# Lowest OOB MSE among the forests fitted in the re-tuning loop (random_state=2).
min(mse_rf_oob_scores2)

In [None]:
# Position of the first occurrence of the lowest OOB MSE in the re-tuning
# scores. NOTE: this re-binds indexmin_rf, so from here on it indexes
# number_of_trees2, not number_of_trees.
indexmin_rf = int(np.argmin(mse_rf_oob_scores2))
indexmin_rf

In [None]:
# Forest size from the re-tuning range at position indexmin_rf.
number_of_trees2[indexmin_rf]

#### Estimate the test MSE of the random forest obtained with the number of trees equal to 320 (if you did not do the optional re-tuning) or 340 (if you did the re-tuning)

In [None]:
# Final random forest: 340 trees, sqrt(p) features considered at each split,
# seeded for repeatability.
rf = RandomForestRegressor(
    n_estimators=340,
    max_features="sqrt",
    random_state=1,
)

In [None]:
# Train the final random forest on the training split only.
rf.fit(X_train, y_train)

In [None]:
# Test MSE of the random forest: compare its predictions on the held-out
# predictors with the held-out response.
mean_squared_error(y_true=y_test, y_pred=rf.predict(X_test))

This is a good (low) test MSE, but it is a bit worse (greater) than the one using Bagging.

__Overall, which features are important in the trees that make up the forest?__

In [None]:
# Pair each entry of the fitted forest's `feature_importances_` attribute with
# the name of the predictor column it belongs to (every column of boston_df
# except the last one).

df_feature_imp = pd.Series(rf.feature_importances_, index=boston_df.columns[:-1])

In [None]:
# Importances in the original column order.
df_feature_imp

In [None]:
# Same importances, largest first (returns a sorted copy; df_feature_imp is unchanged).
df_feature_imp.sort_values(ascending=False)

## Random Forest

### Approach 1 (version 2) (the theory-based approach): Do not prune the trees and use the OOB error to decide the best parameters. 

### This time, let's change both the max_features and the number of trees

Since the sqrt(p) is a good value to use for max_features, I want to know what's the sqrt(p)

In [None]:
# Show p (the number of predictor columns) and sqrt(p), one per line.
print(X_train.shape[1], np.sqrt(X_train.shape[1]), sep="\n")

Let's change the number of features from 3 to 7 (7 ~ p/2, since p=13)

In [None]:
# Candidate max_features values: 3, 4, 5, 6, 7 (upper bound inclusive).
number_of_features = np.arange(3, 7 + 1)

In [None]:
# Candidate forest sizes: 300, 310, ..., 500 (upper bound inclusive).
# The range starts at 300 to lower the burden of the upcoming nested loop.
number_of_trees2 = np.arange(300, 500 + 1, 10)

In [None]:
# Pre-allocate the score grid: one row per max_features candidate, one column
# per forest-size candidate. np.empty leaves the entries uninitialized.
mse_scores_rf_oob_matrix = np.empty(
    shape=(len(number_of_features), len(number_of_trees2))
)

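mse_scores_rf_oob_matrix is an uninitialized matrix (np.empty allocates it without setting its entries) with as many rows as the number of max_features values being tested
and as many columns as the number of forest sizes being tested. Its entries are filled in by the loop below.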

In [None]:
# (number of max_features candidates, number of forest-size candidates)
mse_scores_rf_oob_matrix.shape

In [None]:
# Fill the score grid with OOB MSEs: row r corresponds to number_of_features[r]
# and column c to number_of_trees2[c]. enumerate() supplies the row/column
# positions, so there are no hand-maintained counters to reset or increment.
for r, n_feat in enumerate(number_of_features):
    for c, n_trees in enumerate(number_of_trees2):
        rf_loop = RandomForestRegressor(
            n_estimators=n_trees,
            oob_score=True,
            max_features=n_feat,
            random_state=1,
        )
        rf_loop.fit(X_train, y_train)
        mse_scores_rf_oob_matrix[r, c] = mean_squared_error(
            y_train, rf_loop.oob_prediction_
        )

In [None]:
# OOB MSE grid: rows follow number_of_features, columns follow number_of_trees2.
mse_scores_rf_oob_matrix

In [None]:
# Lowest OOB MSE over the whole (max_features x forest size) grid.
np.min(mse_scores_rf_oob_matrix)

In [None]:
# (row indices, column indices) of every grid cell equal to the minimum.
np.where(mse_scores_rf_oob_matrix == np.min(mse_scores_rf_oob_matrix))

In [None]:
# Row indices only (positions in number_of_features) of the minimum cell(s).
np.where(mse_scores_rf_oob_matrix == np.min(mse_scores_rf_oob_matrix))[0]

In [None]:
# max_features value(s) at which the minimum OOB MSE occurs: the row index of
# the minimum cell looked up in number_of_features. The result is an array and
# holds more than one value if the minimum is tied.

number_of_features[np.where(mse_scores_rf_oob_matrix == np.min(mse_scores_rf_oob_matrix))[0]]

In [None]:
# Forest size(s) at which the minimum OOB MSE occurs: the column index of the
# minimum cell looked up in number_of_trees2. The result is an array and holds
# more than one value if the minimum is tied.

number_of_trees2[np.where(mse_scores_rf_oob_matrix == np.min(mse_scores_rf_oob_matrix))[1]]

#### Estimate the test MSE of the random forest obtained with the number of trees equal to 430 and max_features equal to 6

In [None]:
# Random forest with both parameters set explicitly: 430 trees and 6 features
# considered at each split, seeded for repeatability.
rf2 = RandomForestRegressor(
    n_estimators=430,
    max_features=6,
    random_state=1,
)

In [None]:
# Train this forest on the training split only.
rf2.fit(X_train, y_train)

In [None]:
# Test MSE of rf2: compare its predictions on the held-out predictors with the
# held-out response.
mean_squared_error(y_true=y_test, y_pred=rf2.predict(X_test))

## Random Forest

### Approach 2 (the practice-based approach): Do not prune the trees (similar to approach 1). 

### The difference with approach 1 is that instead of using the OOB error to decide the best parameters, we can use CV to decide the best parameter.

##### We can apply CV by using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

Since the sqrt(p) is a good value to use for max_features, I want to know what's the sqrt(p)

In [None]:
# Show p (the number of predictor columns) and sqrt(p), one per line.
print(X_train.shape[1], np.sqrt(X_train.shape[1]), sep="\n")

Let's change the number of features from 3 to 7 (7 ~ p/2, since p=13)

Let's change the number of trees from 300 to 500

Note: I chose to go from 300 to 500 rather than from 50 to 500 to make the fit process run faster

The search is going to take between 3 and 4 minutes

In [None]:
# Search space for the grid search: forest sizes 300, 310, ..., 500 and
# max_features values 3..7 (both upper bounds inclusive).
param_grid_rf = dict(
    n_estimators=np.arange(300, 500 + 1, 10),
    max_features=np.arange(3, 7 + 1),
)

In [None]:
# 5-fold cross-validated grid search over param_grid_rf, scored by negative MSE
# (so the "best" combination is the one with the lowest CV MSE).
# The base forest is seeded with random_state=1, the seed used for the other
# forests in this notebook; an unseeded forest would grow different trees on
# every run, so the selected parameters could change from run to run.
gridSearch_rf = GridSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the cross-validated search on the training split only; the test split is
# not used for parameter selection.
gridSearch_rf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
print('Parameters: ', gridSearch_rf.best_params_)

#### Estimate the test MSE of the random forest obtained with the number of trees equal to ?? and max_features equal to 7

In [None]:
# rf3= RandomForestRegressor(n_estimators= , max_features=7, random_state=1)

In [None]:
# rf3.fit(X_train, y_train)

In [None]:
# mean_squared_error( y_test,rf3.predict (X_test))