Random forest classifier.

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://www.datacamp.com/tutorial/random-forests-classifier-python
# https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# For saving the trained classifier
import pickle

In [2]:
# Load the heart-disease table and drop the original 'target' label so that
# it is not used as a feature. Dropping in the same expression (instead of
# inplace=True) keeps the cell a single, re-runnable assignment.
data = pd.read_csv('../dat/heart.csv').drop(columns=['target'])

# Column to predict. It is named explicitly instead of being taken by
# position, so a reordered CSV cannot silently change what is predicted.
# Kept as a one-element list because later cells index with both
# `target` and `target[0]`.
target = ['thal']
print(target)  # ['thal']

# Every remaining column is a feature. Index.drop raises a KeyError if 'thal'
# is missing, which surfaces a schema problem immediately.
all_features = data.columns.drop(target)
print(all_features)
# Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
#       'exang', 'oldpeak', 'slope', 'ca'],
#      dtype='object')


print(data.shape)  # (1025, 13)
data.head()

['thal']
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca'],
      dtype='object')
(1025, 13)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2


Train-test split

In [3]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# column and uses a fixed random_state so that it is repeatable.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    stratify=data[target],
    random_state=42,
)

# Expected shapes: (820, 13) for training, then (205, 13) for testing.
for part in (data_train, data_test):
    print(part.shape)

(820, 13)
(205, 13)


Train the model

In [4]:
# Separate the training features from the 1-D label column.
X_train = data_train[all_features]
y_train = data_train[target[0]]

# Forest of 50 trees; random_state is fixed so that refitting is repeatable.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train, y_train)

Evaluate its performance on the training and test data

In [5]:
# Performance on the training data.
preds_train = clf.predict(data_train[all_features])

# Pass the true labels as the 1-D column `target[0]` (the same form that was
# used for fitting) rather than as a one-column DataFrame.
print(classification_report(data_train[target[0]], preds_train))

# All predictions on the training data seem correct (precision, recall and f1
# are all 1.00). On its own this says nothing about generalisation and may
# well mean overfitting; the test-set report is the relevant check.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        51
           2       1.00      1.00      1.00       435
           3       1.00      1.00      1.00       328

    accuracy                           1.00       820
   macro avg       1.00      1.00      1.00       820
weighted avg       1.00      1.00      1.00       820



In [6]:
# Confirm the perfect training score by comparing the true labels with the
# predictions element by element.
y_train_true = data_train[target[0]]

# Both should be one-dimensional with one entry per training row.
print(y_train_true.shape)  # (820,)
print(preds_train.shape)  # (820,)

# True only if the two have the same shape and identical elements.
print(np.array_equal(y_train_true, preds_train))  # True!

# All predictions are correct on the training set.

(820,)
(820,)
True


In [7]:
# Performance on the held-out test set.
preds_test = clf.predict(data_test[all_features])

# Pass the true labels as the 1-D column `target[0]` (the same form that was
# used for fitting) rather than as a one-column DataFrame.
print(classification_report(data_test[target[0]], preds_test))

# f1-scores : [1.00, 1.00, 0.99, 0.98]
# The predictions are almost exact!
# Note that class 0 has a single test sample, so its scores rest on one row.
# NOTE(review): scores this close to perfect are unusual; it is worth checking
# the CSV for duplicated rows (e.g. data.duplicated().sum()), because rows
# shared between the train and test parts would inflate these numbers.
# This has not been verified here.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00        13
           2       0.97      1.00      0.99       109
           3       1.00      0.96      0.98        82

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [8]:
# Show the first five rows of the held-out test data.
data_test.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
61,66,0,2,146,278,0,0,152,0,0.0,1,1,2
391,45,1,3,110,264,0,1,132,0,1.2,1,0,3
612,58,0,0,170,225,1,0,146,1,2.8,1,2,1
371,55,1,0,132,353,0,1,132,1,1.2,1,1,3
826,42,1,2,130,180,0,1,150,0,0.0,2,0,2


In [9]:
# Predicted classes for the first five test rows.
# Expected: [2, 3, 1, 3, 2]
preds_test[0:5]

array([2, 3, 1, 3, 2])

Save the model to a file in the folder 'app'. The code is commented out so that the notebook can be re-executed without rewriting the file.

In [10]:
# Serialise the fitted classifier `clf` with pickle. Uncomment the two lines
# below to (re)write the file.
# with open("../app/RF_classifier.pkl", "wb") as f:
#     pickle.dump(clf, f)

# When executed, a file of about 1.1 MB is written to the disk.
# NOTE(review): unpickling can execute arbitrary code, so the file should only
# be loaded from a trusted location; presumably it should also be loaded with
# the same scikit-learn version that produced it -- confirm against the
# scikit-learn model persistence guide.