# Random Forest- tuning parameters and cross validation
See <a href="https://www.kaggle.com/dansbecker/random-forests">Random Forests</a> tutorial from Kaggle's excellent (and short) <a href="https://www.kaggle.com/learn/intro-to-machine-learning">Intro to Machine Learning</a> course 

In [28]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Load Data


In [14]:
df = pd.read_csv("../datasets/melb_data.csv")
# df.head()
print(f'there are {len(df)} rows in df')

there are 13580 rows in df


## BTW if you are doing EDA or model building AND your dataset is large then work with a randomly sampled fraction of the data.

This will hugely speed up EDA and model training.  You should do this every time when you first start working with a dataset, especially if it's a large one.

In [15]:
df=df.sample( frac=.1, replace=False, weights=None, random_state=42)
print(f'there are {len(df)} rows in df')

there are 1358 rows in df


In [16]:
#clean up the index
df.reset_index(drop=True,inplace=True)

## Lets strip some columns to make this demo easier

In [17]:
#lets strip out just a few columns to make this example easier
# Choose target and features
features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = df[features]
y = df.Price
print(f'there are {len(df)} rows in X')

there are 1358 rows in X


## Preprocess?
Don't have to do much for random forest.  Gotta handle nulls though

In [18]:
#any nulls?  If so handle
X.isnull().sum()
y.isnull().sum()

Rooms             0
Bathroom          0
Landsize          0
BuildingArea    647
YearBuilt       528
Lattitude         0
Longtitude        0
dtype: int64

0

In [19]:
X=X.fillna(value=X.mean())
y=y.fillna(value=y.mean())

## Get train/test split

In [25]:
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

### Create a random forest classifier, use default hyperparameters

In [52]:
def evalmodel(model,X,y):
    preds = model.predict(X)
    print(mean_absolute_error(y, preds))

In [54]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
_=model.fit(train_X, train_y)

evalmodel(model,val_X,val_y)

225085.05291176474


## What hyperparameters can I tune to make this better?

In [51]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

## lets do max_depth min_samples_split, min_samples_leaf and n_estimators

# Use Optuna to tune hyperparameters

In [48]:
# !conda install -c conda-forge optuna -y

simple optuna example

In [57]:
#simple optuna example
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
def objective(X,y):
    clf = sklearn.ensemble.RandomForestRegressor(random_state=42)  # Define the model.
    return sklearn.model_selection.cross_val_score( clf, X, y, n_jobs=-1, cv=10,scoring='neg_mean_absolute_error').mean() 

print('neg_mean_absolute_error: {}'.format(objective(X,y)))

neg_mean_absolute_error: -214824.73917205882


finding the best hyperparameters

In [None]:
# Define the model
clf = sklearn.ensemble.RandomForestRegressor()

#get the cross validation score
numb_folds=5
mae=sklearn.model_selection.cross_val_score(clf, train_X, train_y, cv=numb_folds, scoring='neg_mean_absolute_error').mean() 
print(f'The mean absolute error={mae}')

In [56]:
import optuna
def objective(trial,X=X,y=y):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = int(trial.suggest_float('max_depth', 1, 32, log=True))
    min_samples_split = int(trial.suggest_float('min_samples_split', 2, 10, log=True))

    # Define the model. Pass in params to be tuned
    clf = sklearn.ensemble.RandomForestRegressor(random_state=42, n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split )  

    #get the cross validation score
    return sklearn.model_selection.cross_val_score( clf, X, y, n_jobs=-1, cv=10,scoring='neg_mean_absolute_error').mean() 
 
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('neg_mean_absolute_error: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))
    

[32m[I 2022-03-16 01:31:28,520][0m A new study created in memory with name: no-name-bce606bb-9f0b-4e36-9f19-54580187831b[0m
[32m[I 2022-03-16 01:31:29,383][0m Trial 0 finished with value: -412671.0151869372 and parameters: {'n_estimators': 41, 'max_depth': 1.1205925265473706, 'min_samples_split': 8.834147972278103}. Best is trial 0 with value: -412671.0151869372.[0m
[32m[I 2022-03-16 01:31:29,895][0m Trial 1 finished with value: -264947.55275333585 and parameters: {'n_estimators': 60, 'max_depth': 5.287957535399268, 'min_samples_split': 7.536696360382102}. Best is trial 1 with value: -264947.55275333585.[0m
[32m[I 2022-03-16 01:31:30,244][0m Trial 2 finished with value: -215754.29642152422 and parameters: {'n_estimators': 81, 'max_depth': 24.094709193580808, 'min_samples_split': 4.895725467692583}. Best is trial 2 with value: -215754.29642152422.[0m
[32m[I 2022-03-16 01:31:30,293][0m Trial 3 finished with value: -323163.25507652346 and parameters: {'n_estimators': 13, 'ma

neg_mean_absolute_error: -214743.8860959198
Best hyperparameters: {'n_estimators': 100, 'max_depth': 24.96901425364306, 'min_samples_split': 2.8858608539703594}


In [55]:
model = RandomForestRegressor(random_state=42, n_estimators=99, max_depth=20.22, min_samples_split=3)
_=model.fit(train_X, train_y)

evalmodel(model,val_X,val_y)

224475.37040983982


## Visualization: Is the model more accurate for any particular feature?  Or feature value?


In [None]:
for col in features:
    _=plt.figure()
    _=sns.scatterplot(data=val_X,x=col,y=val_y)
    _=sns.scatterplot(data=val_X,x=col,y=preds)

### The above plots are useless, is there a better way?