# Random Forest

In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.pipeline import Pipeline

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report, accuracy_score, r2_score

import pandas as pd

import random

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Start (inclusive) and stop (exclusive) of the base values `r` that every
# synthetic row is built around; max_range - min_range is the sample count.
min_range = 0
max_range = 500

# Seed the stdlib RNG once, up front. All synthetic data in this notebook is
# drawn with `random.randint`, so without a seed every "Restart & Run All"
# would generate a different dataset and therefore different scores.
random_seed = 42
random.seed(random_seed)

In [3]:
# Bounds (both inclusive, as used by random.randint) of the integer noise
# added to each base value when the synthetic features/targets are built.
rand_int_min, rand_int_max = -50, 50

## Random Forest Classifier

In [4]:
# Classifier features: one row per base value r, 3 columns per row.
# Each column is r shifted by its own integer draw from
# [rand_int_min, rand_int_max].
x = [
    [r + random.randint(rand_int_min, rand_int_max) for _ in range(3)]
    for r in range(min_range, max_range)
]
x

[[-33, -47, -2],
 [-45, 0, 21],
 [-38, 25, 46],
 [29, -20, -17],
 [23, -41, 7],
 [-26, -8, -40],
 [28, 46, -6],
 [-1, 1, -43],
 [-18, 47, -11],
 [24, -11, 43],
 [23, 8, 51],
 [42, 56, -3],
 [-29, -6, -26],
 [53, 1, 56],
 [-2, 59, -26],
 [20, 1, 3],
 [-17, -14, 62],
 [43, 38, -8],
 [64, -28, -21],
 [-4, 15, 42],
 [61, 19, -3],
 [25, 3, 47],
 [-12, 36, 33],
 [8, -21, 53],
 [65, -3, 43],
 [55, 43, 42],
 [39, 25, 27],
 [-8, -22, 66],
 [-5, 10, 9],
 [79, 77, -3],
 [74, 7, 39],
 [-4, 12, 22],
 [71, 2, 60],
 [14, 66, 62],
 [6, 23, 77],
 [11, 13, 74],
 [0, 43, 42],
 [42, 26, 81],
 [50, 32, 28],
 [36, 59, -2],
 [47, 44, 61],
 [41, -1, 80],
 [67, 83, 32],
 [56, 86, 89],
 [44, 69, 45],
 [26, 10, 82],
 [5, 64, 37],
 [29, 82, 70],
 [3, 68, 64],
 [83, 48, 49],
 [80, 91, 81],
 [8, 29, 10],
 [102, 12, 45],
 [66, 103, 93],
 [64, 83, 33],
 [21, 29, 46],
 [103, 100, 83],
 [87, 43, 16],
 [103, 9, 49],
 [22, 35, 100],
 [40, 77, 48],
 [111, 59, 70],
 [44, 34, 109],
 [42, 36, 54],
 [46, 85, 57],
 [56, 42, 47

In [5]:
# Binary labels, one per row of x. Each label is an independent 0/1 draw that
# never looks at x, so the features carry no information about the class.
# NOTE(review): chance-level classifier scores downstream are therefore expected.
y = [random.randint(0, 1) for _ in range(min_range, max_range)]
y

[1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,


In [6]:
# 70/30 train/test split of the classifier data. random_state pins the
# shuffle so the same rows land in each partition on every run; without it
# the reported metrics change each time the notebook is executed.
x_train_clf, x_test_clf, y_train_clf, y_test_clf = train_test_split(
    x, y, train_size=.7, test_size=.3, random_state=42
)

In [7]:
# Baseline classifier with default hyper-parameters. random_state fixes the
# bootstrap sampling and per-split feature selection so that refitting gives
# the same forest and the same predictions.
rf_clf = RandomForestClassifier(random_state=42)

In [8]:
# Train the baseline classifier on the training partition.
rf_clf.fit(X=x_train_clf, y=y_train_clf)

In [9]:
# Predicted class labels for the held-out test rows.
rf_clf_pred = rf_clf.predict(X=x_test_clf)

In [10]:
# With 0/1 labels every squared error is either 0 or 1, so this value is the
# fraction of misclassified test rows.
mean_squared_error(y_true=y_test_clf, y_pred=rf_clf_pred)

0.48

In [11]:
# With 0/1 labels every absolute error is either 0 or 1, so this is again the
# fraction of misclassified test rows.
mean_absolute_error(y_true=y_test_clf, y_pred=rf_clf_pred)

0.48

In [12]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test_clf, y_pred=rf_clf_pred)

0.52

In [13]:
# Per-class precision / recall / F1 of the baseline classifier on the test
# partition. The report is a preformatted string, hence print().
cr = classification_report(y_true=y_test_clf, y_pred=rf_clf_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.48      0.67      0.56        69
           1       0.58      0.40      0.47        81

    accuracy                           0.52       150
   macro avg       0.53      0.53      0.52       150
weighted avg       0.54      0.52      0.51       150



## Random Forest Regressor

In [14]:
# Regressor features: one row per base value r, 3 columns per row, each being
# r plus its own integer draw from [rand_int_min, rand_int_max].
# This rebinds `x`; the classifier section keeps working from its own
# x_train_clf / x_test_clf partitions created earlier.
x = [
    [r + random.randint(rand_int_min, rand_int_max) for _ in range(3)]
    for r in range(min_range, max_range)
]
x

[[-8, -37, 8],
 [13, 29, -49],
 [14, 39, -17],
 [4, 2, 49],
 [-37, 42, 53],
 [33, -42, -24],
 [47, -17, -33],
 [54, 24, 38],
 [-21, 11, 5],
 [24, -33, 17],
 [14, 45, 53],
 [37, 16, 59],
 [13, -33, 38],
 [-24, 43, 11],
 [-33, 8, -21],
 [63, 37, 61],
 [59, 14, -17],
 [28, -2, -24],
 [49, 14, -27],
 [67, 47, 52],
 [60, -10, 12],
 [22, -25, 17],
 [37, 60, 61],
 [54, 10, 43],
 [60, 20, 45],
 [-2, 12, -3],
 [66, 49, 12],
 [18, 44, 15],
 [33, 30, 77],
 [1, 4, 17],
 [54, 30, 38],
 [9, -14, 58],
 [20, 47, 66],
 [-12, 42, -12],
 [19, 52, 38],
 [2, 43, 22],
 [84, 67, 81],
 [85, 39, -11],
 [27, 45, 54],
 [27, 67, 38],
 [17, 7, 71],
 [89, 64, 41],
 [67, 21, 52],
 [51, 12, -2],
 [51, 10, 65],
 [67, 78, 54],
 [67, 1, 93],
 [31, 5, 40],
 [29, 73, 0],
 [73, 88, 65],
 [63, 3, 11],
 [26, 82, 7],
 [29, 26, 65],
 [51, 95, 56],
 [64, 32, 89],
 [76, 27, 21],
 [51, 78, 26],
 [86, 107, 63],
 [66, 74, 97],
 [40, 48, 86],
 [88, 97, 82],
 [76, 104, 25],
 [28, 34, 43],
 [112, 66, 26],
 [38, 22, 58],
 [43, 91, 58],

In [15]:
# Regression target: the same base value the feature rows are built around,
# plus an independent integer draw from [rand_int_min, rand_int_max].
y = [
    base + random.randint(rand_int_min, rand_int_max)
    for base in range(min_range, max_range)
]
y

[35,
 -19,
 -23,
 18,
 -30,
 25,
 -18,
 56,
 -3,
 32,
 53,
 -36,
 38,
 50,
 43,
 21,
 61,
 11,
 -11,
 29,
 -1,
 14,
 26,
 32,
 43,
 30,
 -15,
 75,
 26,
 13,
 77,
 1,
 38,
 59,
 -2,
 50,
 50,
 -12,
 -6,
 15,
 -8,
 67,
 9,
 21,
 84,
 7,
 47,
 13,
 66,
 30,
 87,
 45,
 67,
 65,
 78,
 28,
 55,
 35,
 59,
 89,
 12,
 74,
 36,
 71,
 34,
 57,
 55,
 32,
 63,
 86,
 79,
 105,
 102,
 113,
 103,
 83,
 124,
 73,
 64,
 129,
 52,
 127,
 48,
 37,
 83,
 111,
 134,
 125,
 117,
 66,
 116,
 93,
 137,
 79,
 84,
 87,
 113,
 72,
 142,
 91,
 70,
 73,
 137,
 99,
 147,
 104,
 119,
 137,
 99,
 79,
 141,
 95,
 151,
 139,
 128,
 130,
 93,
 120,
 147,
 82,
 85,
 122,
 141,
 149,
 105,
 168,
 104,
 137,
 110,
 138,
 80,
 107,
 142,
 118,
 155,
 96,
 133,
 176,
 131,
 151,
 129,
 118,
 167,
 146,
 131,
 146,
 107,
 139,
 147,
 153,
 101,
 153,
 188,
 141,
 108,
 112,
 183,
 132,
 205,
 170,
 206,
 194,
 144,
 145,
 205,
 132,
 144,
 188,
 128,
 182,
 210,
 130,
 198,
 212,
 136,
 158,
 218,
 190,
 190,
 221,
 212,
 172,

In [16]:
# 70/30 train/test split of the regression data. random_state pins the
# shuffle so the partitions, and therefore the reported errors, are the same
# on every run.
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(
    x, y, train_size=.7, test_size=.3, random_state=42
)

In [17]:
# Baseline regressor with default hyper-parameters. random_state fixes the
# bootstrap sampling so that refitting reproduces the same forest.
rf_reg = RandomForestRegressor(random_state=42)

In [18]:
# Train the baseline regressor on the training partition.
rf_reg.fit(X=x_train_reg, y=y_train_reg)

In [19]:
# Predicted target values for the held-out test rows.
rf_reg_pred = rf_reg.predict(X=x_test_reg)

In [20]:
# Mean squared error of the baseline regressor on the test partition
# (in squared target units).
mean_squared_error(y_true=y_test_reg, y_pred=rf_reg_pred)

1400.7658306666667

In [21]:
# Mean absolute error of the baseline regressor on the test partition
# (in target units).
mean_absolute_error(y_true=y_test_reg, y_pred=rf_reg_pred)

31.674000000000003

In [22]:
# Coefficient of determination (R^2) of the baseline regressor on the test
# partition.
r2_score(y_true=y_test_reg, y_pred=rf_reg_pred)

0.933805104848095

## Random Forest Pipeline

### Random Forest Classification Pipeline

In [23]:
# Single-step pipeline wrapping the classifier. The step name "rf" is what
# gives the hyper-parameters their "rf__" prefix in the search grid.
rf_clf_steps = [('rf', RandomForestClassifier())]
rf_clf_pipeline = Pipeline(steps=rf_clf_steps)

In [24]:
# List every parameter name the pipeline exposes, including the nested
# "rf__*" ones, as a reference for building the search grid.
rf_clf_pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'rf', 'rf__bootstrap', 'rf__ccp_alpha', 'rf__class_weight', 'rf__criterion', 'rf__max_depth', 'rf__max_features', 'rf__max_leaf_nodes', 'rf__max_samples', 'rf__min_impurity_decrease', 'rf__min_samples_leaf', 'rf__min_samples_split', 'rf__min_weight_fraction_leaf', 'rf__n_estimators', 'rf__n_jobs', 'rf__oob_score', 'rf__random_state', 'rf__verbose', 'rf__warm_start'])

In [25]:
# Candidate hyper-parameter values for the classifier search. Keys use the
# "<step name>__<parameter>" form so they reach the 'rf' step of the pipeline.
rf_clf_param_grid = {
    'rf__criterion' : ['gini', 'entropy'],
    'rf__max_depth' : [5,10,15],
    'rf__max_features': [1,2,3],
    'rf__max_leaf_nodes': [10, 20, 50],
    'rf__min_samples_leaf': [5,10,25],
    # The seed is not a hyper-parameter: searching over several seeds just
    # selects whichever one got lucky on the CV folds. Pin it to one value so
    # every candidate forest is built reproducibly and compared fairly.
    'rf__random_state': [42]
}

In [26]:
# Randomized search over the classifier grid with the default number of
# sampled candidates and default cross-validation. random_state fixes which
# parameter combinations are sampled, so the search is repeatable.
rf_clf_random_search = RandomizedSearchCV(
    estimator=rf_clf_pipeline,
    param_distributions=rf_clf_param_grid,
    random_state=42,
)

In [27]:
# Run the search on the classifier training partition only; the test
# partition stays untouched for the final evaluation.
rf_clf_random_search.fit(X=x_train_clf, y=y_train_clf)

In [28]:
# Parameter combination that scored best in cross-validation among the
# candidates the search sampled.
rf_clf_random_search.best_params_

{'rf__random_state': 42,
 'rf__min_samples_leaf': 5,
 'rf__max_leaf_nodes': 10,
 'rf__max_features': 3,
 'rf__max_depth': 15,
 'rf__criterion': 'entropy'}

In [29]:
# Predict test labels with the search object, which delegates to the best
# pipeline found (refit on the full training partition by default).
rf_clf_rand_search_pred = rf_clf_random_search.predict(X=x_test_clf)

In [30]:
# For 0/1 labels this is the misclassification rate of the tuned classifier.
mean_squared_error(y_true=y_test_clf, y_pred=rf_clf_rand_search_pred)

0.49333333333333335

In [31]:
# For 0/1 labels this is also the misclassification rate of the tuned classifier.
mean_absolute_error(y_true=y_test_clf, y_pred=rf_clf_rand_search_pred)

0.49333333333333335

In [32]:
# Fraction of test rows the tuned classifier labels correctly.
accuracy_score(y_true=y_test_clf, y_pred=rf_clf_rand_search_pred)

0.5066666666666667

In [33]:
# Per-class precision / recall / F1 of the tuned classifier on the test
# partition. The report is a preformatted string, hence print().
cr = classification_report(y_true=y_test_clf, y_pred=rf_clf_rand_search_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.48      0.75      0.58        69
           1       0.59      0.30      0.39        81

    accuracy                           0.51       150
   macro avg       0.53      0.52      0.49       150
weighted avg       0.54      0.51      0.48       150



### Random Forest Regression Pipeline

In [34]:
# Single-step pipeline wrapping the regressor. The step name "rf" is what
# gives the hyper-parameters their "rf__" prefix in the search grid.
rf_reg_steps = [('rf', RandomForestRegressor())]
rf_reg_pipeline = Pipeline(steps=rf_reg_steps)

In [35]:
# List every parameter name the pipeline exposes, including the nested
# "rf__*" ones, as a reference for building the search grid.
rf_reg_pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'rf', 'rf__bootstrap', 'rf__ccp_alpha', 'rf__criterion', 'rf__max_depth', 'rf__max_features', 'rf__max_leaf_nodes', 'rf__max_samples', 'rf__min_impurity_decrease', 'rf__min_samples_leaf', 'rf__min_samples_split', 'rf__min_weight_fraction_leaf', 'rf__n_estimators', 'rf__n_jobs', 'rf__oob_score', 'rf__random_state', 'rf__verbose', 'rf__warm_start'])

In [36]:
# Candidate hyper-parameter values for the regressor search. Keys use the
# "<step name>__<parameter>" form so they reach the 'rf' step of the pipeline.
rf_reg_param_grid = {
    'rf__criterion' : ['squared_error', 'friedman_mse', 'absolute_error'],
    'rf__max_depth' : [5,10,15],
    'rf__max_features': [1,2,3],
    'rf__max_leaf_nodes': [10, 20, 50],
    'rf__min_samples_leaf': [5,10,25],
    # The seed is not a hyper-parameter: searching over several seeds just
    # selects whichever one got lucky on the CV folds. Pin it to one value so
    # every candidate forest is built reproducibly and compared fairly.
    'rf__random_state': [42]
}

In [37]:
# Randomized search over the regressor grid with the default number of
# sampled candidates and default cross-validation. random_state fixes which
# parameter combinations are sampled, so the search is repeatable.
rf_reg_random_search = RandomizedSearchCV(
    estimator=rf_reg_pipeline,
    param_distributions=rf_reg_param_grid,
    random_state=42,
)

In [38]:
# Run the search on the regression training partition only; the test
# partition stays untouched for the final evaluation.
rf_reg_random_search.fit(X=x_train_reg, y=y_train_reg)

In [39]:
# Parameter combination that scored best in cross-validation among the
# candidates the search sampled.
rf_reg_random_search.best_params_

{'rf__random_state': 42,
 'rf__min_samples_leaf': 10,
 'rf__max_leaf_nodes': 50,
 'rf__max_features': 1,
 'rf__max_depth': 15,
 'rf__criterion': 'absolute_error'}

In [40]:
# Predict test targets with the search object, which delegates to the best
# pipeline found (refit on the full training partition by default).
rf_reg_rand_search_pred = rf_reg_random_search.predict(X=x_test_reg)

In [41]:
# Mean squared error of the tuned regressor on the test partition
# (in squared target units).
mean_squared_error(y_true=y_test_reg, y_pred=rf_reg_rand_search_pred)

1262.9949924999999

In [42]:
# Mean absolute error of the tuned regressor on the test partition
# (in target units).
mean_absolute_error(y_true=y_test_reg, y_pred=rf_reg_rand_search_pred)

30.172966666666664

In [43]:
# Coefficient of determination (R^2) of the tuned regressor on the test
# partition.
r2_score(y_true=y_test_reg, y_pred=rf_reg_rand_search_pred)

0.9403156335801474