In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
from sklearn.ensemble import RandomForestClassifier

# The Caravan dataset

In [4]:
Caravan_df= pd.read_csv('C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Caravan.csv')

In [5]:
# Excluding the first predictor from the Caravan data set

X_train, X_test, y_train, y_test= train_test_split (Caravan_df.iloc[:,1:-1], Caravan_df['Purchase'], test_size=0.2, random_state=1)

## Applying Bagging (i.e., Non-random Forest) to the Caravan dataset

__Reminder from ML 1__ (for more reminders, review ML 1 notebook about random forest regression)

The difference between Bagging and RF is that for the former, max_features=n_features; whereas for the latter, max_features < n_features

When applying Bagging, how to set max_features=n_features?

From scikit-learn: If max_features is None or 1.0, then max_features=n_features

### Basic application: select based on accuracy (= error) and without changing any tuning hyperparameter

Let's grow the trees in the forest using Entropy first. Later on, we might try (if time permits) both the Gini index and Entropy and choose the best between the two.

We are going to use the theory-based approach that we learned in ML 1, which uses the Out-Of-the-Bag (i.e.,oob) observations  to choose the best configuration for the forest.

Let's __based our decision on the oob accuracy__ (i.e., select the forest with the highest oob accuracy = lowest oob error)

In [6]:
# Number of trees to include in the forest.
# We are considering only a few options because the computations in random forests classification are intense!

number_of_trees=np.array ([100, 250, 500, 750, 1000])

In [7]:
accuracy_score_oob=[]
for i in number_of_trees:
    bag_loop= RandomForestClassifier(n_estimators = i, criterion='entropy', oob_score= True, max_features=None, random_state=1)
    bag_loop.fit(X_train, y_train)
    accuracy_score_oob.append (np.round (bag_loop.oob_score_,3)) 
    # the attribute ob_score_ computes the accuracy based on the oob observations

In [8]:
max(accuracy_score_oob)

0.924

In [9]:
indexmax_bagging= accuracy_score_oob.index(max(accuracy_score_oob))

In [10]:
number_of_trees[indexmax_bagging]

100

__Estimate the metrics (on the test data) for the previous forest__

In [11]:
bagging_forest_caravan= RandomForestClassifier(n_estimators = 100, criterion='entropy', max_features=None, random_state=1)

In [12]:
bagging_forest_caravan.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_features=None, random_state=1)

In [13]:
y_predicted_caravan_bagging= bagging_forest_caravan.predict (X_test)

In [14]:
confusion_matrix(y_test, y_predicted_caravan_bagging)

array([[1075,   26],
       [  59,    5]], dtype=int64)

In [15]:
print (classification_report (y_test, y_predicted_caravan_bagging))

              precision    recall  f1-score   support

          No       0.95      0.98      0.96      1101
         Yes       0.16      0.08      0.11        64

    accuracy                           0.93      1165
   macro avg       0.55      0.53      0.53      1165
weighted avg       0.90      0.93      0.91      1165



NOTHING SURPRINSING!

For unbalanced datasets, if you select the best classifier based on overall accuracy (or equivalently, overall error), you usually do not get good values for the class-specific performance metrics!

### Let's change the probability threshold of the previous forest and compute the metrics again

In [16]:
# Array of probability thresholds

array_prob= np.arange(0.05, 0.51, 0.05)

In [17]:
prob_yes_bagging_forest= bagging_forest_caravan.predict_proba(X_test)[:,1]

In [18]:
dict_predictions= dict()

In [19]:
dict_f1_scores= dict()

In [20]:
for j in array_prob:
    dict_predictions[j]=np.empty(y_test.size, dtype=object)
    for i in np.arange(0, dict_predictions[j].size):
        if prob_yes_bagging_forest[i] > j:
            dict_predictions[j][i]= 'Yes'
        else:
            dict_predictions[j][i]= 'No'
    dict_f1_scores[j]= np.round (f1_score(y_test, dict_predictions[j],pos_label='Yes'),3)

In [21]:
dict_f1_scores

{0.05: 0.205,
 0.1: 0.174,
 0.15000000000000002: 0.206,
 0.2: 0.203,
 0.25: 0.164,
 0.3: 0.142,
 0.35000000000000003: 0.137,
 0.4: 0.109,
 0.45: 0.097,
 0.5: 0.105}

In [22]:
max(dict_f1_scores, key= dict_f1_scores.get)

0.15000000000000002

In [23]:
max(dict_f1_scores.values())

0.206

In [24]:
# This loop computes the prediction of Y (No or Yes) for each test observation
# The predictions of Y are stored in an array called 'y_predicted_prob015' 
# The prediction uses a prob threshold of 0.15

y_predicted_prob015=np.empty(y_test.size, dtype=object)

for i in np.arange(0,y_predicted_prob015.size):
    if prob_yes_bagging_forest[i] > 0.15:
        y_predicted_prob015[i]= 'Yes'
    else:
        y_predicted_prob015[i]= 'No'

In [25]:
confusion_matrix (y_test, y_predicted_prob015)

array([[982, 119],
       [ 43,  21]], dtype=int64)

In [26]:
print (classification_report (y_test, y_predicted_prob015))

              precision    recall  f1-score   support

          No       0.96      0.89      0.92      1101
         Yes       0.15      0.33      0.21        64

    accuracy                           0.86      1165
   macro avg       0.55      0.61      0.56      1165
weighted avg       0.91      0.86      0.88      1165



### Let's change the class weights and compute the metrics again

The classes will be weighted differently using class_weight= 'balanced'

In [36]:
accuracy_score_oob2=[]
for i in number_of_trees:
    bag_loop= RandomForestClassifier(n_estimators = i, criterion='entropy', class_weight= 'balanced', oob_score= True, max_features=None, random_state=1)
    bag_loop.fit(X_train, y_train)
    accuracy_score_oob2.append (np.round (bag_loop.oob_score_,3)) 
    # the attribute ob_score_ computes the accuracy based on the oob observations

In [37]:
max(accuracy_score_oob2)

0.925

In [38]:
indexmax_bagging2= accuracy_score_oob2.index(max(accuracy_score_oob2))

In [39]:
number_of_trees[indexmax_bagging2]

750

__Estimate the metrics (on the test data) for the previous forest__

In [40]:
bagging_forest_caravan2= RandomForestClassifier(n_estimators = 750, criterion='entropy', class_weight= 'balanced', max_features=None, random_state=1)

In [41]:
bagging_forest_caravan2.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features=None, n_estimators=750, random_state=1)

In [42]:
y_predicted_caravan_bagging2= bagging_forest_caravan2.predict (X_test)

In [43]:
confusion_matrix(y_test, y_predicted_caravan_bagging2)

array([[1071,   30],
       [  60,    4]], dtype=int64)

In [44]:
print(classification_report(y_test, y_predicted_caravan_bagging2))

              precision    recall  f1-score   support

          No       0.95      0.97      0.96      1101
         Yes       0.12      0.06      0.08        64

    accuracy                           0.92      1165
   macro avg       0.53      0.52      0.52      1165
weighted avg       0.90      0.92      0.91      1165



### Let's change the class weights and select the best forest configuration based on f1-score


We are selecting the forest configuration that leads to the highest oob f1-score and the classes will be weighted differently using class_weight= 'balanced'

Note: We need a little trick to be able to compute any metric different from the accuacy on the oob observations. Sci-kit learn only includes the option to compute the accuracy. That "trick" is applied on the last statement of the following loop

In [27]:
f1_score_oob=[]
for i in number_of_trees:
    bag_loop= RandomForestClassifier(n_estimators = i, criterion='entropy', class_weight= 'balanced', oob_score= True, max_features=None, random_state=1)
    bag_loop.fit(X_train, y_train.map({'Yes': 1, 'No': 0}))
    f1_score_oob.append (np.round (f1_score(y_train.map({'Yes': 1, 'No': 0}), pd.Series(np.argmax(bag_loop.oob_decision_function_,axis=1)),pos_label=1),3)) 

In [28]:
max(f1_score_oob)

0.103

In [29]:
indexmax_bagging_f1_score= f1_score_oob.index(max(f1_score_oob))

In [30]:
number_of_trees[indexmax_bagging_f1_score]

750

Selecting based on f1-score also results in a forest with 750 trees. Nothing changed from the previous attempt

We do not need to obtain the confusion matrix and classification report again because we will get the same results. The previous configuration was also 750 trees and class_weight= 'balanced'

## Applying Random Forest to the Caravan dataset

Let's apply RF and tune both the number of trees and the max_features hyperparameter

max_feature will change from around sqrt(p) to around p/2

Let's apply the Grid Search approach BECAUSE USING THE OOB APPROACH GAVE ME SOME ERRORS I DID NOT HAVE TIME TO FIX!!!

### Basic application: select based on accuracy (= error) 

In [62]:
from sklearn.model_selection import GridSearchCV

In [94]:
# I reduceD the number of options for the number of trees to lessen the burden of the grid search

number_of_trees2=np.array ([250, 750, 1000])

Number of features to consider?

In [46]:
print(Caravan_df.shape[1])
print(np.sqrt(Caravan_df.shape[1]))
print(Caravan_df.shape[1]/2)

86
9.273618495495704
43.0


In [87]:
number_of_features=np.array ([10, 20, 30, 40, 50])

In [88]:
hyperparam_grid_rf = {
    'criterion': ['entropy'],
    'n_estimators': number_of_trees2,   
     'max_features': number_of_features
}

In [126]:
gridSearch_rf = GridSearchCV(RandomForestClassifier(), hyperparam_grid_rf, cv=3,scoring='accuracy')

# CV=3. Reducing the number of folds to speed up computation

DO NOT RUN THE NEXT CELL. IT WILL TAKE TOO LONG! (5 -6 MINUTES)

YOU DO NOT NEED TO RUN IT. YOU ONLY NEED TO OBSERVE THE RESULTS

YOU WON'T NEED TO DO THIS IN THE EXAM

In [92]:
gridSearch_rf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_features': array([10, 20, 30, 40, 50]),
                         'n_estimators': array([ 250,  750, 1000])},
             scoring='accuracy')

In [93]:
print('Parameters: ', gridSearch_rf.best_params_)

Parameters:  {'criterion': 'entropy', 'max_features': 20, 'n_estimators': 750}


In [95]:
rf_caravan= RandomForestClassifier(n_estimators = 750, criterion='entropy', max_features=20, random_state=1)

In [96]:
rf_caravan.fit (X_train, y_train)

RandomForestClassifier(criterion='entropy', max_features=20, n_estimators=750,
                       random_state=1)

In [97]:
y_predicted_caravan_rf= rf_caravan.predict (X_test)

In [98]:
confusion_matrix (y_test, y_predicted_caravan_rf)

array([[1075,   26],
       [  60,    4]], dtype=int64)

BAD AGAIN!

### Let's change the probability threshold of the previous forest and compute the metrics again

In [99]:
prob_yes_rf= rf_caravan.predict_proba(X_test)[:,1]

In [100]:
dict_predictions= dict()

In [101]:
dict_f1_scores= dict()

In [102]:
for j in array_prob:
    dict_predictions[j]=np.empty(y_test.size, dtype=object)
    for i in np.arange(0, dict_predictions[j].size):
        if prob_yes_rf[i] > j:
            dict_predictions[j][i]= 'Yes'
        else:
            dict_predictions[j][i]= 'No'
    dict_f1_scores[j]= np.round (f1_score(y_test, dict_predictions[j],pos_label='Yes'),3)

In [103]:
dict_f1_scores

{0.05: 0.199,
 0.1: 0.216,
 0.15000000000000002: 0.219,
 0.2: 0.167,
 0.25: 0.135,
 0.3: 0.118,
 0.35000000000000003: 0.107,
 0.4: 0.113,
 0.45: 0.099,
 0.5: 0.085}

In [104]:
# This loop computes the prediction of Y (No or Yes) for each test observation
# The predictions of Y are stored in an array called 'y_predicted_prob015_2' 
# The prediction uses a prob threshold of 0.15

y_predicted_prob015_2=np.empty(y_test.size, dtype=object)

for i in np.arange(0,y_predicted_prob015.size):
    if prob_yes_rf[i] > 0.15:
        y_predicted_prob015_2[i]= 'Yes'
    else:
        y_predicted_prob015_2[i]= 'No'

In [105]:
confusion_matrix (y_test, y_predicted_prob015_2)

array([[994, 107],
       [ 43,  21]], dtype=int64)

In [106]:
print(classification_report(y_test, y_predicted_prob015_2))

              precision    recall  f1-score   support

          No       0.96      0.90      0.93      1101
         Yes       0.16      0.33      0.22        64

    accuracy                           0.87      1165
   macro avg       0.56      0.62      0.57      1165
weighted avg       0.91      0.87      0.89      1165



### Let's select the best forest configuration based on f1-score

In [108]:
from sklearn.metrics import make_scorer

In [109]:
f1_scorer = make_scorer(f1_score, pos_label='Yes')

In [110]:
gridSearch_rf_f1_score = GridSearchCV(RandomForestClassifier(), hyperparam_grid_rf, cv=3,scoring=f1_scorer )

DO NOT RUN THE NEXT CELL. IT WILL TAKE TOO LONG! (5 -6 MINUTES)

YOU DO NOT NEED TO RUN IT. YOU ONLY NEED TO OBSERVE THE RESULTS

YOU WON'T NEED TO DO THIS IN THE EXAM

In [111]:
gridSearch_rf_f1_score.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_features': array([10, 20, 30, 40, 50]),
                         'n_estimators': array([ 250,  750, 1000])},
             scoring=make_scorer(f1_score, pos_label=Yes))

In [112]:
print('Parameters: ', gridSearch_rf_f1_score.best_params_)

Parameters:  {'criterion': 'entropy', 'max_features': 30, 'n_estimators': 250}


In [113]:
rf_caravan2= RandomForestClassifier(n_estimators = 250, criterion='entropy', max_features=30, random_state=1)

In [114]:
rf_caravan2.fit (X_train, y_train)

RandomForestClassifier(criterion='entropy', max_features=30, n_estimators=250,
                       random_state=1)

In [115]:
y_predicted_caravan_rf2= rf_caravan2.predict (X_test)

In [116]:
confusion_matrix (y_test, y_predicted_caravan_rf2)

array([[1072,   29],
       [  60,    4]], dtype=int64)

BAD AGAIN !

### Let's select the best forest configuration based on f1-score and change the class weights

In [118]:
hyperparam_grid_rf2 = {
    'class_weight': ['balanced'],
    'criterion': ['entropy'],
    'n_estimators': number_of_trees2,   
     'max_features': number_of_features
}

In [119]:
gridSearch_rf_f1_score2 = GridSearchCV(RandomForestClassifier(), hyperparam_grid_rf2, cv=3,scoring=f1_scorer )

DO NOT RUN THE NEXT CELL. IT WILL TAKE TOO LONG! (5 -6 MINUTES)

YOU DO NOT NEED TO RUN IT. YOU ONLY NEED TO OBSERVE THE RESULTS

YOU WON'T NEED TO DO THIS IN THE EXAM

In [120]:
gridSearch_rf_f1_score2.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'class_weight': ['balanced'], 'criterion': ['entropy'],
                         'max_features': array([10, 20, 30, 40, 50]),
                         'n_estimators': array([ 250,  750, 1000])},
             scoring=make_scorer(f1_score, pos_label=Yes))

In [121]:
print('Parameters: ', gridSearch_rf_f1_score2.best_params_)

Parameters:  {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 30, 'n_estimators': 750}


In [122]:
rf_caravan3= RandomForestClassifier(n_estimators = 750, class_weight= 'balanced', criterion='entropy', max_features=30, random_state=1)

In [123]:
rf_caravan3.fit (X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features=30, n_estimators=750, random_state=1)

In [124]:
y_predicted_caravan_rf3= rf_caravan3.predict (X_test)

In [125]:
confusion_matrix (y_test, y_predicted_caravan_rf3)

array([[1077,   24],
       [  60,    4]], dtype=int64)

NOT GOOD... TIME TO GIVE UP :)

# The Default dataset

Apply Bagging and RF to the default dataset