In [1]:
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Single seed passed as random_state to the split and the estimators below
SEED = 1

In [None]:
# Hold out 30% of the data as a test set; stratify on y so both splits keep
# the same class proportions.
# NOTE(review): X and y are not defined in the visible cells — they must be
# loaded before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

In [None]:
# Base learner: a tree capped at depth 4 whose leaves must each hold at
# least 16% of the training samples (a float min_samples_leaf is a fraction).
dt = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=0.16,
    random_state=SEED,
)

# BC consists of 300 classification trees `dt`
We set n_jobs = -1 so that all CPU cores are used in computation

In [None]:
# Instantiate bagging classifier 'bc': 300 copies of 'dt', trained in parallel
# on all CPU cores (n_jobs=-1).
# 'dt' is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works before and after the rename.
# random_state=SEED makes the bootstrap sampling reproducible.
bc = BaggingClassifier(dt, n_estimators=300, n_jobs=-1, random_state=SEED)

In [None]:
# Instantiate a bagging classifier 'bc' with oob_score=True so that the
# out-of-bag accuracy can be read from 'bc' after training.
# 'dt' is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works before and after the rename.
# random_state=SEED makes the bootstrap sampling (and the OOB score) reproducible.
bc = BaggingClassifier(
    dt,
    n_estimators=300,
    oob_score=True,
    n_jobs=-1,
    random_state=SEED,
)

# Note: in scikit-learn, the OOB score corresponds to the accuracy for classifiers and to the R-squared score for regressors

In [None]:
# Fit 'bc' to the training set first: the oob_score_ attribute is only
# available on a fitted estimator, so reading it before fit() fails.
bc.fit(X_train, y_train)

# Extract the OOB score from 'bc'
oob_accuracy = bc.oob_score_

# Print OOB accuracy
print('OOB accuracy : {:.3f}'.format(oob_accuracy))

# Predict the test set labels

In [None]:
# Class labels predicted by the fitted bagging classifier for the test set
y_pred = bc.predict(X=X_test)

# Evaluate and predict test set accuracy

In [None]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the bagging classifier's test-set accuracy to three decimals
print('Accuracy of Bagging Classifier : {:.3f}'.format(accuracy))

In [None]:
# Print the test-set accuracy next to the OOB accuracy.
# Uses the names actually assigned earlier in the notebook
# ('accuracy' and 'oob_accuracy').
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(accuracy, oob_accuracy))

# In Bagging the base estimator can be any model including Decision Tree,
# Logistic Regression, or even a Neural Network

# **Random forest**


**In Random Forest the base estimator is the Decision tree**

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [4]:
# Set seed for reproducibility.
# Named SEED (not Seed) because that is the name the split and the
# estimators in this section read.
SEED = 1

In [None]:
# Hold out 30% of the data as a test set for the regression task.
# NOTE(review): X and y are not defined in the visible cells — they must be
# loaded before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [None]:
# Instantiate RandomForestRegressor 'rf' consisting of 400 regression trees;
# every leaf must hold at least 12% of the training samples
# (a float min_samples_leaf is a fraction).
rf = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)

In [None]:
# Train the random forest 'rf' on the training split
rf.fit(X=X_train, y=y_train)

In [None]:
# Predicted target values 'y_pred' for the test set
y_pred = rf.predict(X=X_test)

In [None]:
# Test-set RMSE: the square root of the mean squared error
rmse_test = MSE(y_true=y_test, y_pred=y_pred) ** 0.5

In [None]:
# Print the test set RMSE, formatted to two decimals
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))

# When a tree-based method is trained, the predictive power of a feature, i.e. its importance, can be assessed

# In scikit-learn, feature importance is assessed by measuring how much the tree nodes use a particular feature to reduce impurity 

# Note that the importance of feature is expressed as a percentage indicating the weight of that feature in training and prediction

# Once you train a tree-based model in scikit-learn, the feature importances can be accessed by extracting the `feature_importances_` attribute from the model

# To visualize the importance of features as assessed by 'rf', you can create a pandas series of the features importance and then sort this series and make a horizontal-barplot

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Label each importance reported by 'rf' with the matching column name of X
importances_rf = pd.Series(
    data=rf.feature_importances_,
    index=X.columns,
)

In [None]:
# Order the importances from smallest to largest
sorted_importances_rf = importances_rf.sort_values(ascending=True)

In [None]:
# Horizontal bar chart of the sorted feature importances
sorted_importances_rf.plot.barh(color='lightgreen')
plt.title('Features Importances')
plt.show()

# **Tuning Random Forest hyperparameter**

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
# Fix the seed so the forest is built the same way on every run
SEED = 1
# Random forest regressor 'rf'; only random_state is set explicitly
rf = RandomForestRegressor(random_state=SEED)

In [8]:
# Inspect the hyperparameters of 'rf': displays a dict mapping each
# parameter name to its current value
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Hyperparameter grid 'params_rf': each key is an estimator parameter and
# each list holds the candidate values to try for it
params_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [0.1, 0.2],
    'max_features': ['log2', 'sqrt'],
}

In [None]:
# Define the dictionary 'params_rf' (DataCamp example).
# None replaces the former 'auto' option for max_features: for a regressor
# both mean "consider all features", but 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, whereas None is accepted by every version.
params_rf = {
    'n_estimators': [100, 350, 500],
    'min_samples_leaf': [2, 10, 30],
    'max_features': ['log2', None, 'sqrt'],
}

In [None]:
# Instantiate a 3-fold CV grid search object 'grid_rf' over 'params_rf'.
# Candidates are scored by negative mean squared error (higher is better);
# verbose=1 prints progress messages and n_jobs=-1 uses all CPU cores.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

# Note that the parameter verbose controls the verbosity: the higher its value, the more messages are printed during fitting

In [None]:
# Run the grid search 'grid_rf' on the training split
grid_rf.fit(X=X_train, y=y_train)

In [None]:
# Extract the best hyperparameter combination found by 'grid_rf' and print it
best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)

In [None]:
# Extract the best model from 'grid_rf'; it is used below to predict on the test set
best_model = grid_rf.best_estimator_

In [None]:
# Predicted target values for the test set from the best model
y_pred = best_model.predict(X=X_test)

In [None]:
# Test-set RMSE: the square root of the mean squared error
rmse_test = MSE(y_true=y_test, y_pred=y_pred) ** 0.5

In [None]:
# Print the test set RMSE, formatted to two decimals
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))