In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [114]:
# Load the raw dataset from the CSV file.
df = pd.read_csv("../data/heart_data.csv")

# Two null-free variants of the raw frame: one with incomplete rows removed,
# one with incomplete columns removed. `df` itself is left unchanged.
df_dropped_rows = df.dropna(axis="index")
df_dropped_columns = df.dropna(axis="columns")

# Per-column count of missing values (displayed as the cell output).
df.isna().sum()



In [115]:
# Peek at five randomly chosen rows (unseeded, so the rows differ between runs).
df.sample(n=5)



In [116]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# ST_Slope is encoded with a LabelEncoder, which assigns integer codes
# following the sorted order of the distinct values it sees.
label_encoder = LabelEncoder()
df['ST_Slope'] = label_encoder.fit_transform(df['ST_Slope'])

# The remaining categorical columns use explicit, hand-picked integer codes.
category_codes = {
    'ChestPainType': {'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3},
    'RestingECG': {"Normal": 0, "ST": 1, "LVH": 2},
    'ExerciseAngina': {"Y": 1, "N": 0},
    'Sex': {"M": 1, "F": 0},
}
for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].replace(codes)




In [117]:
# Split the encoded frame into the feature matrix and the target vector.
# (The former bare `df.columns` statement was removed: it was not the last
# expression of the cell, so its value was never displayed or used.)
X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

# Preview the feature matrix.
X.head()



In [118]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity-check the training shapes.
(X_train.shape, y_train.shape)



In [119]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters.
# The estimator is seeded (same seed as the train/test split) so that the
# reported metrics, feature importances and the persisted model are
# reproducible under Restart & Run All.
tree = RandomForestClassifier(random_state=42)

tree.fit(X_train, y_train)



In [120]:
# Class predictions of the baseline forest on the held-out test set.
tree_pred = tree.predict(X=X_test)



In [125]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Confusion matrix of the baseline predictions against the true test labels.
matrix = confusion_matrix(y_true=y_test, y_pred=tree_pred)
print(matrix)



In [126]:
# Feature Importance
importances = tree.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Take the labels from X (the feature matrix), not from df: df still holds
# the 'HeartDisease' target column, so its column list does not correspond
# one-to-one with the entries of `importances`.
feature_names = list(X.columns)


plt.figure()
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(X.shape[1]), [feature_names[i] for i in indices], rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()



In [127]:
# Score the baseline predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_true=y_test, y_pred=tree_pred)
cr = classification_report(y_true=y_test, y_pred=tree_pred)

print(f'Accuracy: {accuracy}')
print(cr)



In [128]:
# Search space for the random-forest hyperparameters.
# 5 * 8 * 3 * 3 * 3 * 2 = 2160 combinations, each fitted on 5 CV folds.
param_grid = {
    "n_estimators": range(50, 300, 50),         # Number of trees in the forest
    "max_depth": range(3, 11),                    # Maximum depth of each tree
    "min_samples_split": [2, 5, 10],              # Minimum number of samples required to split an internal node
    "min_samples_leaf": [1, 2, 4],                # Minimum number of samples required at a leaf node
    "max_features": ["sqrt", "log2", None],       # Number of features to consider when looking for the best split
    "bootstrap": [True, False]                    # Whether bootstrap samples are used when building trees
}

# Exhaustive 5-fold cross-validated search on the training split.
# The estimator is seeded so the selected hyperparameters and the scores
# below are reproducible between runs.
tree_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

tree_grid.fit(X_train, y_train)


best_params = tree_grid.best_params_

print(f'Best Hyperparameters: {best_params}')

# Evaluate the best estimator found by the search on the held-out test set.
# NOTE: `tree_pred`, `accuracy` and `cr` are reassigned here, so from this
# point on they describe the tuned model, not the baseline `tree`.
tree_pred = tree_grid.best_estimator_.predict(X_test)

accuracy = accuracy_score(y_test, tree_pred)
print(f'Accuracy: {accuracy:.2f}')

cm = confusion_matrix(y_test, tree_pred)
print('Confusion Matrix:')
print(cm)

cr = classification_report(y_test, tree_pred)
print('Classification Report:')
print(cr)





In [129]:
import joblib

# Persist the fitted model to disk in the current working directory.
# NOTE(review): this saves the baseline `tree`, not the tuned
# `tree_grid.best_estimator_` — confirm that this is the intended model.
joblib.dump(value=tree, filename="model.pkl")

