In [38]:
# https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/

# Importing the required libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree

%matplotlib inline

In [11]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [20]:
df = df.rename({'target': 'heart disease'}, axis='columns')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart disease
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [15]:
# Putting feature variable to X
X = df.drop('heart disease',axis=1)
# Putting response variable to y
y = df['heart disease']

In [26]:
# 303 rows × 13 columns
X.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1


In [24]:
# Name: heart disease, Length: 303, dtype: int64
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: heart disease, Length: 303, dtype: int64

In [17]:
# now lets split the data into train and test
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [18]:
X_train.shape, X_test.shape

((212, 13), (91, 13))

In [27]:
# n_estimators represents the number of trees in the forest. 
# Usually the higher the number of trees the better to learn the data. 
# However, adding a lot of trees can slow down the training process considerably, 
# therefore we do a parameter search to find the sweet spot.
# where each dictionary contains the values for the hyperparameters 'max_depth' and 'max_features'.

# Hyperparameters to Increase the Speed :

# 1- The 'max_depth' hyperparameter specifies the maximum depth of each decision tree in the forest.

# 2- n_jobs: it tells the engine how many processors it is allowed to use. 
# If the value is 1, it can use only one processor, but if the value is -1, there is no limit.

# 3- oob_score: OOB means out of the bag.
# It is a random forest cross-validation method. 
# In this, one-third of the sample is not used to train the data; instead used to evaluate its performance. 
# These samples are called out-of-bag samples.

classifier_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=5,random_state=42 , oob_score=True)

In [29]:
%%time
classifier_rf.fit(X_train, y_train)

CPU times: total: 906 ms
Wall time: 1.17 s


In [30]:
# checking the oob score
classifier_rf.oob_score_

0.8018867924528302

In [31]:
# now we are going to tune the hyperparameters for Random Forest using GridSearchCV and fit the data.
#  GridSearchCV (Grid Search Cross-Validation) 
# GridSearchCV helps in automating the process of hyperparameter tuning, enhancing model performance, and avoiding manual trial-and-error.

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

In [34]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")

In [35]:
%%time
grid_search.fit(X_train, y_train)
# Wall time in GridSearchCV refers to the actual time taken by the algorithm to complete the search, 
# including the time taken by the CPU and any other resources used by the algorithm

Fitting 4 folds for each of 180 candidates, totalling 720 fits
CPU times: total: 4.78 s
Wall time: 1min 25s


In [36]:
grid_search.best_score_

0.8349056603773585

In [37]:
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# From hyperparameter tuning, we can fetch the best estimator, as shown. 
# The best set of parameters identified was max_depth=5, min_samples_leaf=10,n_estimators=10

In [39]:
# visualize
plt.figure(figsize=(80,40))
plot_tree(rf_best.estimators_[5], feature_names = X.columns,class_names=['Disease', "No Disease"],filled=True)

InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object') instead.

<Figure size 8000x4000 with 0 Axes>