HEART DISEASE DETECTOR

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

In [21]:
# Load the heart-disease table; the relative path means the CSV must sit in
# the notebook's working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column that
# looks like a row index written out with the CSV -- confirm, and consider
# reading with index_col=0 so it is not treated as data.
df = pd.read_csv('dataset heart disease.csv')

In [22]:
# Preview the first rows to check the column names and how values are encoded.
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [23]:
# One-hot encode categorical (non-numeric) columns, dropping the first level
# of each encoded column.
# NOTE(review): every column in the preview above looks numeric, so this is
# presumably a no-op for this dataset -- confirm against df.dtypes.
df = pd.get_dummies(df, drop_first=True)

In [24]:
# Feature matrix: every column except the label.
# 'Unnamed: 0' is also removed: in the preview it simply counts rows (0, 1, 2, ...),
# i.e. it looks like a row index saved with the CSV rather than a measurement,
# and a row counter must not be offered to the model as a predictor.
# errors='ignore' keeps this cell working if that column is already absent.
X = (
    df.drop('target', axis=1)
      .drop(columns='Unnamed: 0', errors='ignore')
      .values
)
# Labels as a column vector of shape (n_rows, 1).
y = df['target'].values.reshape(-1, 1)

In [25]:
# Standardize each feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split, so test rows influence the scaling. Per-column scaling
# is order-preserving, so it should not change threshold-based tree splits,
# but fit the scaler on the training rows only if another model is added.
feature_mean = np.mean(X, axis=0)
feature_std = np.std(X, axis=0)
# A constant column has zero spread; dividing by 0 would fill it with NaN/inf.
# Use a divisor of 1 there so the column is only centred (all zeros).
feature_std[feature_std == 0] = 1.0
X = (X - feature_mean) / feature_std

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the same
# rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
import numpy as np

def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Collection of groups; each group is a sequence of
            ``(features, label)`` pairs.
        classes: Label values whose proportions are measured in each group.

    Returns:
        float: Sum over the non-empty groups of ``1 - sum(p_class ** 2)``,
        each weighted by the group's share of all rows.
    """
    total_rows = sum(len(group) for group in groups)
    weighted_impurity = 0.0
    for group in groups:
        group_size = len(group)
        if group_size == 0:
            # An empty side adds nothing, and its proportions are undefined.
            continue
        labels = [label for _, label in group]
        purity = 0.0
        for class_val in classes:
            proportion = labels.count(class_val) / group_size
            purity += proportion * proportion
        weighted_impurity += (1.0 - purity) * (group_size / total_rows)
    return weighted_impurity

def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[0][index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def get_split(dataset):
    """Find the single-feature threshold split with the lowest Gini impurity.

    Each distinct value of each feature is tried as a threshold; the first
    candidate that reaches the lowest weighted Gini index wins.

    Args:
        dataset: Non-empty list of ``(features, label)`` rows whose feature
            vectors all have the same length and hold hashable scalar values.

    Returns:
        dict: ``{'index': feature position, 'value': threshold,
        'groups': (left_rows, right_rows)}`` describing the best split.
    """
    class_values = list(set(row[1] for row in dataset))
    # Gini impurity never exceeds 1, so 999 acts as "no candidate seen yet".
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0][0])):
        # A repeated value yields the same groups and score as its first
        # occurrence and can never win the strict '<' test below, so scoring
        # it again is wasted work. dict.fromkeys drops repeats while keeping
        # first-seen order, which leaves the chosen split unchanged.
        candidates = dict.fromkeys(row[0][index] for row in dataset)
        for value in candidates:
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}


In [28]:
def to_terminal(group):
    """Return the most frequent label in ``group`` as a leaf prediction.

    Args:
        group: Non-empty sequence of ``(features, label)`` rows.

    Returns:
        The label that occurs most often among the rows.
    """
    labels = [row[1] for row in group]
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

def split(node, max_depth, min_size, depth):
    """Recursively grow the subtree below ``node`` in place.

    The node's ``'groups'`` entry is removed and replaced by ``'left'`` and
    ``'right'`` entries, each either a leaf label or a nested split dict.

    Args:
        node: Split dict as returned by ``get_split`` (must contain 'groups').
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node`` itself.
    """
    left, right = node['groups']
    node.pop('groups')

    # One side is empty: nothing was separated, so both children share a leaf.
    if not left or not right:
        shared_leaf = to_terminal(left + right)
        node['left'] = shared_leaf
        node['right'] = shared_leaf
        return

    # Depth limit reached: stop growing and label both sides.
    if depth >= max_depth:
        left_leaf, right_leaf = to_terminal(left), to_terminal(right)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return

    # Otherwise handle the left side first, then the right side.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        child = get_split(rows)
        node[side] = child
        split(child, max_depth, min_size, depth + 1)


In [29]:
def build_tree(train, max_depth, min_size):
    """Build one decision tree from ``train``.

    Args:
        train: List of ``(features, label)`` rows.
        max_depth: Depth limit passed on to ``split``.
        min_size: Minimum-rows threshold passed on to ``split``.

    Returns:
        dict: The root node, with its subtree fully grown.
    """
    tree = get_split(train)
    # The root sits at depth 1; split() grows the children in place.
    split(tree, max_depth, min_size, depth=1)
    return tree

def predict(node, row):
    """Route one feature vector down the tree and return the leaf label.

    Args:
        node: Root split dict with 'index', 'value', 'left' and 'right'.
        row: Feature vector, indexable by the nodes' 'index' values.

    Returns:
        The label stored at the leaf that ``row`` reaches.
    """
    current = node
    while True:
        # Strictly below the threshold goes left, everything else goes right.
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child


In [30]:
import random
from collections import Counter

def subsample(dataset, ratio):
    """Draw a bootstrap sample (with replacement) from ``dataset``.

    Args:
        dataset: Sequence of rows to sample from.
        ratio: Sample size as a fraction of ``len(dataset)``.

    Returns:
        list: ``round(len(dataset) * ratio)`` rows picked uniformly at random;
        the same row may appear more than once.
    """
    target_size = round(len(dataset) * ratio)
    return [
        dataset[random.randrange(len(dataset))]
        for _ in range(target_size)
    ]

def bagging_predict(trees, row):
    """Return the majority vote of all trees for one feature vector.

    Args:
        trees: Non-empty list of tree root nodes.
        row: Feature vector to classify.

    Returns:
        The label predicted by the largest number of trees.
    """
    votes = Counter(predict(tree, row) for tree in trees)
    winner = votes.most_common(1)[0]
    return winner[0]

def random_forest(train, test, max_depth, min_size, sample_size, n_trees):
    """Train a bagged ensemble of trees and score it on ``test``.

    Each tree is fitted on its own bootstrap sample of ``train``. The test
    accuracy is printed as a side effect.

    Args:
        train: List of ``(features, label)`` training rows.
        test: Non-empty list of ``(features, label)`` rows to evaluate on.
        max_depth: Depth limit for every tree.
        min_size: Minimum-rows threshold for every tree.
        sample_size: Bootstrap sample size as a fraction of ``len(train)``.
        n_trees: Number of trees to grow.

    Returns:
        tuple: ``(predictions, actuals)`` lists aligned with ``test``.
    """
    trees = [
        build_tree(subsample(train, sample_size), max_depth, min_size)
        for _ in range(n_trees)
    ]

    predictions, actuals = [], []
    for row in test:
        predictions.append(bagging_predict(trees, row[0]))
        actuals.append(row[1])

    hits = sum(1 for guess, truth in zip(predictions, actuals) if guess == truth)
    accuracy = hits / len(actuals)
    print(f" Random Forest Accuracy: {accuracy * 100:.2f}%")
    return predictions, actuals


In [31]:
# Pair each feature vector with its scalar label; the tree functions expect
# rows of the form (features, label).
data = [(features, label) for features, label in zip(X_train, y_train.ravel())]
test = [(features, label) for features, label in zip(X_test, y_test.ravel())]


In [32]:
# subsample() draws its bootstrap rows from the stdlib `random` module, so
# seed it first: without a seed every run grows a different forest and the
# reported metrics cannot be reproduced after a kernel restart.
random.seed(42)

preds, trues = random_forest(
    train=data,
    test=test,
    max_depth=15,
    min_size=5,
    sample_size=1.0,
    n_trees=15
)


 Random Forest Accuracy: 90.76%


In [33]:
from sklearn.metrics import f1_score, accuracy_score

# Cross-check the forest's own accuracy figure with scikit-learn's metrics.
test_accuracy_pct = round(accuracy_score(trues, preds) * 100, 2)
print(" Final Test Accuracy:", test_accuracy_pct, "%")
test_f1 = round(f1_score(trues, preds), 4)
print(" Final Test F1 Score:", test_f1)


 Final Test Accuracy: 90.76 %
 Final Test F1 Score: 0.9167


In [34]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the held-out predictions.
print("\nClassification Report:\n", classification_report(trues, preds))



Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.90       107
           1       0.91      0.92      0.92       131

    accuracy                           0.91       238
   macro avg       0.91      0.91      0.91       238
weighted avg       0.91      0.91      0.91       238



In [36]:
import pickle

# `final_trees` was never assigned anywhere in this notebook: random_forest()
# builds its trees internally and does not return them, so this cell raised
# NameError on a fresh kernel. Grow the forest explicitly here, using the same
# hyperparameters as the evaluation run (max_depth=15, min_size=5,
# sample_size=1.0, n_trees=15), before saving it.
# NOTE(review): this is a separate fit and repeats the training cost.
random.seed(42)  # fixed seed so the saved forest is reproducible
final_trees = [
    build_tree(subsample(data, 1.0), max_depth=15, min_size=5)
    for _ in range(15)
]

with open('best_random_forest.pkl', 'wb') as f:
    pickle.dump(final_trees, f)
print(" Model saved successfully as 'best_random_forest.pkl'")

 Model saved successfully as 'best_random_forest.pkl'
