In [27]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,f1_score
from scipy.stats import randint

In [2]:
df = pd.read_csv('../heart_cleveland.csv')

In [3]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [33]:
X = df.drop('condition', axis = 1)
y = df['condition']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [35]:
modelrf = RandomForestClassifier()

In [36]:
modelrf.fit(X_train, y_train)

In [37]:
y_pred_rf = modelrf.predict(X_test)

In [38]:
print(classification_report(y_pred_rf, y_test))

              precision    recall  f1-score   support

           0       0.69      0.73      0.71        30
           1       0.71      0.67      0.69        30

    accuracy                           0.70        60
   macro avg       0.70      0.70      0.70        60
weighted avg       0.70      0.70      0.70        60



In [39]:
def explain_model(model,  X_train, X_test, y_train, y_test, features = None):
    print('Test Results:')
    y_pred = model.predict(X_test)
    print(classification_report(y_pred, y_test))
    print(confusion_matrix(y_pred, y_test))
    print('Train Results:')
    y_pred_tr = model.predict(X_train)
    print(classification_report(y_pred_tr, y_train))
    print(confusion_matrix(y_pred_tr, y_train))
    print('------------------')
    print('------------------')
    plt.figure(figsize=(20, 10))
    plot_tree(model, 
              filled=True,
              feature_names = features,
              class_names=True,
              rounded=True,
              fontsize=10)
    plt.title("Visualization", fontsize=16, pad=20)
    plt.tight_layout()
    plt.show()

In [40]:
explain_model(modelrf,  X_train, X_test, y_train, y_test, features = X_train.columns)

Test Results:
              precision    recall  f1-score   support

           0       0.69      0.73      0.71        30
           1       0.71      0.67      0.69        30

    accuracy                           0.70        60
   macro avg       0.70      0.70      0.70        60
weighted avg       0.70      0.70      0.70        60

[[22  8]
 [10 20]]
Train Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       128
           1       1.00      1.00      1.00       109

    accuracy                           1.00       237
   macro avg       1.00      1.00      1.00       237
weighted avg       1.00      1.00      1.00       237

[[128   0]
 [  0 109]]
------------------
------------------


InvalidParameterError: The 'decision_tree' parameter of plot_tree must be an instance of 'sklearn.tree._classes.DecisionTreeClassifier' or an instance of 'sklearn.tree._classes.DecisionTreeRegressor'. Got RandomForestClassifier() instead.

<Figure size 2000x1000 with 0 Axes>

In [31]:
param_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 15),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

model = RandomForestClassifier(random_state=42)

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=65,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

rs.fit(X_train, y_train)

best_model = rs.best_estimator_

Fitting 5 folds for each of 65 candidates, totalling 325 fits


In [32]:
explain_model(best_model,  X_train, X_test, y_train, y_test, features = X_train.columns)

Test Results:
              precision    recall  f1-score   support

           0       0.77      0.72      0.74        32
           1       0.70      0.75      0.72        28

    accuracy                           0.73        60
   macro avg       0.73      0.73      0.73        60
weighted avg       0.74      0.73      0.73        60

[[23  9]
 [ 7 21]]
Train Results:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       128
           1       0.97      0.93      0.95       109

    accuracy                           0.95       237
   macro avg       0.96      0.95      0.95       237
weighted avg       0.95      0.95      0.95       237

[[125   3]
 [  8 101]]
------------------
------------------


InvalidParameterError: The 'decision_tree' parameter of plot_tree must be an instance of 'sklearn.tree._classes.DecisionTreeClassifier' or an instance of 'sklearn.tree._classes.DecisionTreeRegressor'. Got RandomForestClassifier(max_depth=5, max_features='log2', min_samples_split=5,
                       n_estimators=149, random_state=42) instead.

<Figure size 2000x1000 with 0 Axes>