# Hyper-Parameter Tuning of Random Forest

### Importing dependencies

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv("heart.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.shape

(303, 14)

In [5]:
X=df.iloc[:,0:-1]
Y=df.iloc[:,-1]

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=42)

In [7]:
print(X_train.shape)
print(X_test.shape)

(242, 13)
(61, 13)


In [8]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [9]:
rf=RandomForestClassifier()

In [10]:
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8360655737704918

### Cross validation

In [11]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(),X,Y,cv=10,scoring='accuracy'))

0.8213978494623657

## Hyperparameter Tuning:

The accuracy without any hyper parameter tuning is good. But we will tune the **hyperparameters** to show that how can we increase the performance of our Machine Learning model.

We will do it by two ways:
- GridSearchCV and 
- RandomizedSearchCV

### 1. GridSearchCV

In [12]:
# number of trees in the random forest
n_estimators=[20,60,100,120]

# number of features to consider at every split
max_features=[0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth=[2,8,None]

# Numnber of samples
max_samples=[0.5,0.75,1.0]

# 108 diff random forest

In [13]:
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
            'max_samples':max_samples 
           }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [14]:
rf=RandomForestClassifier()

In [15]:
from sklearn.model_selection import GridSearchCV

rf_grid=GridSearchCV(estimator = rf,
                    param_grid = param_grid,
                    cv = 5,
                    verbose = 2,
                    n_jobs = -1)

# 1. estimator => which algorith is to be used
# 2. param_grid => the features over which GridSearchCv will be implemented
# 3. cv => the number of times to train it over these parameters
# 4. verbose => the output will be visible during the process
# 5. n_jobs=-1 => the process will be faster

In [16]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [17]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 20}

In [18]:
rf_grid.best_score_

0.8554421768707483

### 2. RandomSearchCV

In [22]:
# number of trees in the random forest
n_estimators=[20,60,100,120]

# number of features to consider at every split
max_features=[0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth=[2,8,None]

# Numnber of samples
max_samples=[0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# minimum number of samples required to split a node
min_samples_split=[2,5]

# minimum number of samples required at each leaf node
min_samples_leaf=[1,2]


In [24]:
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
            'max_samples':max_samples,
            'bootstrap':bootstrap,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf
           }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [26]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid=RandomizedSearchCV(estimator = rf,
                    param_distributions= param_grid,
                    cv = 5,
                    verbose = 2,
                    n_jobs = -1)

In [34]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 402, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `ma

In [37]:
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 1.0,
 'max_features': 0.2,
 'max_depth': 8,
 'bootstrap': True}

In [38]:
rf_grid.best_score_

0.8221088435374149