In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [35]:
df = pd.read_csv('heart.csv')
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [3]:
#Fit RandomForest model at given state and produce classification report
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87        27
           1       0.91      0.88      0.90        34

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.88        61
weighted avg       0.89      0.89      0.89        61



In [6]:
#Run grid search across chosen hyperparameters for small range

from sklearn.model_selection import cross_val_score

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 25, 50, 100, None],
    'n_estimators': [10, 75, 100],
    'criterion': ['gini','entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto','sqrt'],
    'random_state': [0]
}

grid = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_)

grid_pred = grid.predict(X_test)
#print(cross_val_score(grid, X_train, y_train, cv=5))
print(classification_report(y_test, grid_pred))

Fitting 3 folds for each of 1080 candidates, totalling 3240 fits
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 0}
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        27
           1       0.88      0.88      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [21]:
# inputting best parameters from grid search

optimodel = RandomForestClassifier(bootstrap=True,
                                   criterion='gini',
                                   max_depth=10,
                                   max_features='auto',
                                   min_samples_leaf=2,
                                   min_samples_split=10,
                                   n_estimators=75,
                                   random_state=0
                                  )

In [37]:
optimodel.fit(X_train, y_train)

oppred = optimodel.predict(X_test)

print(classification_report(y_test, oppred))

              precision    recall  f1-score   support

           0       0.84      0.76      0.80        21
           1       0.88      0.93      0.90        40

    accuracy                           0.87        61
   macro avg       0.86      0.84      0.85        61
weighted avg       0.87      0.87      0.87        61



In [36]:
# default model classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.36      0.48      0.41        21
           1       0.67      0.55      0.60        40

    accuracy                           0.52        61
   macro avg       0.51      0.51      0.51        61
weighted avg       0.56      0.52      0.54        61



In [20]:
# cross validation score for tuned model at default settings
from sklearn.model_selection import cross_val_score

cross_val_score(optimodel, X_train, y_train, cv=10).mean()

0.8185

In [18]:
# cross validation score for model at default settings

print(cross_val_score(model, X_train, y_train, cv=10).mean())

0.8268333333333333


In [17]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cross_val_score(optimodel, X, y, cv=StratifiedKFold(n_splits=10, shuffle=False)).mean()

0.8415053763440861

In [16]:
cross_val_score(model, X, y, cv=StratifiedKFold(n_splits=10, shuffle=False)).mean()

0.8316129032258065

In [23]:
cross_val_score(optimodel, X, y, scoring='f1', cv=StratifiedKFold(n_splits=10, shuffle=False)).mean()

0.8584414798283966

In [24]:
cross_val_score(model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=10, shuffle=False)).mean()

0.8495641247248855

### Results:

The default scored higher on the whole dataset, whereas the optimodel scored better on the training data when k=5 and shuffle=False. However, for k values 3 and 10 the optimodel performed better, although the results are not significantly different.
The reverse is true for the training data performance, at k=3 and k=10
the default model performed better on the training data; whereas at 
k=5, the optimodel performed better. (Both with shuffle=False)

*Optimodel means the model with the tuned parameters, from the grid search.
Note: The grid search was performed at default k value (k=5)
Further note: k is denoted by cv or n_splits in the above code

In [9]:
# display default hyperparameters for RandomForest model

from pprint import pprint
pprint(model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}


In [17]:
# carry out random grid search across larger ranges for chosen hyperparameters

from sklearn.model_selection import RandomizedSearchCV

random_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'criterion': ['gini','entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto','sqrt'],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]
}

In [18]:
# fit model using random search 

rf = RandomForestClassifier(random_state=0)

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv=3, verbose=2, random_state=0, n_jobs=-1)

rf_random.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800]},
                   random_state=0, verbose=2)

In [13]:
random_grid

{'bootstrap': [True, False],
 'criterion': ['auto', 'gini', 'entropy'],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]}

In [19]:
# display best parameters from random grid search
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 80,
 'criterion': 'gini',
 'bootstrap': False}

In [20]:
# perform grid search after random search to narrow down optimal settings

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 150, 200, 225, 1000],
    'criterion': ['gini'],
    'min_samples_split': [2, 4, 5, 6],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'max_depth': [55, 60, 65, 80, None],
    'bootstrap': [True, False],
    'random_state': [0]
}

grid = GridSearchCV(rf, param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

grid_pred = grid.predict(X_test)
print(classification_report(y_test, grid_pred))

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
{'bootstrap': False, 'criterion': 'gini', 'max_depth': 55, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 200, 'random_state': 0}
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        27
           1       0.88      0.88      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [64]:
# display accuracy score from grid search hyperparameters

from sklearn.metrics import accuracy_score

accuracy_score(y_test, grid_pred)

0.8360655737704918

In [65]:
# display accuracy score for model at default settings

accuracy_score(y_test, predictions)

0.8852459016393442

In [21]:
# display default hyperparameter settings

from pprint import pprint
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}


In [29]:
# display default settings at different random state value

rf_t = RandomForestClassifier(random_state=102)

pprint(rf_t.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 102,
 'verbose': 0,
 'warm_start': False}
