In [39]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib 
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay

import datetime 
print(f"Notebook last updated:{datetime.datetime.now()}\n")
      
print(f"NumPy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
print(f"Scikit-Learn version: {sklearn.__version__}")



Notebook last updated:2025-05-17 18:43:06.739470

NumPy version: 2.2.2
pandas version: 2.2.3
matplotlib version: 3.10.0
Scikit-Learn version: 1.6.1


RandomForestClassifier

In [40]:
df = pd.read_csv("heart-disease (1).csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [41]:
x = df.drop("target", axis=1)
y = df["target"]

x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [42]:
y.head(), y.value_counts()

(0    1
 1    1
 2    1
 3    1
 4    1
 Name: target, dtype: int64,
 target
 1    165
 0    138
 Name: count, dtype: int64)

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [44]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [45]:
clf.fit(x_train, y_train)

In [46]:
y_preds = clf.predict(x_test)
y_preds

array([1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1])

Train accuracy

In [47]:
trian_acc = clf.score(x_train, y_train)
print(f"Train accuracy: {trian_acc*100:.2f}%")

Train accuracy: 100.00%


Test accuracy

In [48]:
test_acc = clf.score(x_test, y_test)
print(f"Train accuracy: {test_acc*100:.2f}%")

Train accuracy: 78.69%


Classification Reports

In [49]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77        28
           1       0.81      0.79      0.80        33

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.79      0.79      0.79        61



Confusion Matrix

In [50]:
conf_mat = confusion_matrix(y_test, y_preds)
conf_mat

array([[22,  6],
       [ 7, 26]])

Accuracy Score

In [51]:
accuracy_score(y_test, y_preds)

0.7868852459016393

In [52]:
np.random.seed(40)
for i in range(100, 200, 10):
    print(i)
    model = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    
    model_score = model.score(x_test, y_test)
    print(f"Model score: {model_score*100:.2f}%")
    
    cross_val_mean = np.mean(cross_val_score(model, x, y, cv=5))
    print(f"Cross val mean: {cross_val_mean*100:.2f}%")
    
    print("")

100
Model score: 78.69%
Cross val mean: 80.19%

110
Model score: 78.69%
Cross val mean: 81.16%

120
Model score: 73.77%
Cross val mean: 81.50%

130
Model score: 78.69%
Cross val mean: 81.83%

140
Model score: 80.33%
Cross val mean: 82.49%

150
Model score: 77.05%
Cross val mean: 82.81%

160
Model score: 77.05%
Cross val mean: 83.49%

170
Model score: 78.69%
Cross val mean: 83.15%

180
Model score: 80.33%
Cross val mean: 81.17%

190
Model score: 77.05%
Cross val mean: 81.83%



Grid Search Cross Validation

In [54]:
np.random.seed(40)
param_grid = {'n_estimators': [i for i in range(100, 200, 10)]}
grid = GridSearchCV(estimator= RandomForestClassifier(), param_grid= param_grid, cv=5, verbose=1)
grid.fit(x, y)

print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 170}
0.8314754098360655


In [55]:
clf = grid.best_estimator_
clf

In [56]:
clf = clf.fit(x_train, y_train)
print(f"Best model score split of the data: {clf.score(x_test, y_test)*100:.2f}%")

Best model score split of the data: 75.41%
