In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import graphviz

# Load the heart-disease table; the path is resolved relative to the
# notebook's working directory.
data = pd.read_csv("heart.csv")

# Impute missing numeric values with the per-column mean.
# numeric_only=True is needed because pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.mean() and raises TypeError as soon as an
# object column is present -- which the encoding loop below explicitly
# allows for. On an all-numeric frame the result is unchanged.
data = data.fillna(data.mean(numeric_only=True))

# Integer-encode text columns via pandas category codes. Text columns are not
# imputed above, so any missing value in them ends up as code -1.
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category').cat.codes

# NOTE(review): the scores recorded for this cell (test accuracy ~0.985 for
# an unpruned tree, several CV folds at exactly 1.0) are unusually high for
# this kind of data. That pattern typically means identical rows end up on
# both sides of the split. Check `data.duplicated().sum()` and consider
# `data.drop_duplicates()` before splitting -- TODO confirm against the file.
# NOTE(review): the means above are computed on the full table, i.e. before
# the train/test split, so test rows contribute to the imputation values.
X = data.drop("target", axis=1)
y = data["target"]

# Fixed 80/20 hold-out split, seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model: a decision tree grown without a depth limit, seeded so the
# fitted tree is reproducible.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Export the fitted tree as DOT text (out_file=None returns a string).
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index such as X.columns.
# class_names are matched to the classes in ascending order, so this labelling
# assumes target 0 = no disease and 1 = disease -- TODO confirm against the
# dataset description.
dot_data = export_graphviz(dt_classifier, out_file=None,
                         feature_names=list(X.columns),
                         class_names=["No Heart Disease", "Heart Disease"],
                         filled=True, rounded=True,
                         special_characters=True)

# Side effect: writes "heart_disease_decision_tree" (DOT source) plus the
# rendered file into the working directory. Rendering needs the Graphviz
# executables on PATH, not only the Python package.
graph = graphviz.Source(dot_data)
graph.render("heart_disease_decision_tree")

# Second tree, capped at depth 3, to compare against the unrestricted one.
dt_classifier_depth = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_classifier_depth.fit(X_train, y_train)

# Predictions of both trees on the training and the held-out partition.
y_train_pred, y_test_pred = (
    dt_classifier.predict(features) for features in (X_train, X_test)
)
y_train_pred_depth, y_test_pred_depth = (
    dt_classifier_depth.predict(features) for features in (X_train, X_test)
)

# (label, ground truth, prediction) rows, listed in the order they are printed.
tree_report = [
    ("Decision Tree (No Depth Limit) - Train Accuracy:", y_train, y_train_pred),
    ("Decision Tree (No Depth Limit) - Test Accuracy:", y_test, y_test_pred),
    ("Decision Tree (Max Depth 3) - Train Accuracy:", y_train, y_train_pred_depth),
    ("Decision Tree (Max Depth 3) - Test Accuracy:", y_test, y_test_pred_depth),
]
for label, truth, predicted in tree_report:
    print(label, accuracy_score(truth, predicted))

# Random forest of 100 trees, seeded for reproducibility; fit() returns the
# estimator itself, so construction and training can be chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the data it was trained on and on the hold-out set.
rf_train_pred, rf_test_pred = (
    rf_classifier.predict(features) for features in (X_train, X_test)
)
for label, truth, predicted in (
    ("Random Forest - Train Accuracy:", y_train, rf_train_pred),
    ("Random Forest - Test Accuracy:", y_test, rf_test_pred),
):
    print(label, accuracy_score(truth, predicted))

# Rank the input columns by the forest's impurity-based importance,
# most important first.
importances = rf_classifier.feature_importances_
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importances)

# Shuffled, seeded 5-fold cross-validation over the full table; the same
# splitter is used for both models so they are scored on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_dt, cv_scores_rf = (
    cross_val_score(estimator, X, y, cv=kf, scoring='accuracy')
    for estimator in (dt_classifier, rf_classifier)
)

cv_results = (("Decision Tree", cv_scores_dt), ("Random Forest", cv_scores_rf))

# Per-fold scores for both models first, then the fold averages.
for model_name, fold_scores in cv_results:
    print(model_name + " Cross-Validation Scores:", fold_scores)
for model_name, fold_scores in cv_results:
    print(model_name + " CV Mean Accuracy:", np.mean(fold_scores))

Decision Tree (No Depth Limit) - Train Accuracy: 1.0
Decision Tree (No Depth Limit) - Test Accuracy: 0.9853658536585366
Decision Tree (Max Depth 3) - Train Accuracy: 0.8512195121951219
Decision Tree (Max Depth 3) - Test Accuracy: 0.7804878048780488
Random Forest - Train Accuracy: 1.0
Random Forest - Test Accuracy: 0.9853658536585366
     Feature  Importance
2         cp    0.135072
11        ca    0.127327
7    thalach    0.122169
9    oldpeak    0.121905
12      thal    0.110518
0        age    0.077908
4       chol    0.074822
3   trestbps    0.071171
8      exang    0.057594
10     slope    0.045782
1        sex    0.028731
6    restecg    0.018557
5        fbs    0.008444
Decision Tree Cross-Validation Scores: [0.98536585 1.         1.         1.         1.        ]
Random Forest Cross-Validation Scores: [0.98536585 1.         1.         1.         1.        ]
Decision Tree CV Mean Accuracy: 0.9970731707317073
Random Forest CV Mean Accuracy: 0.9970731707317073
