## Title: Heart Disease Prediction Using Generative Classifiers (Naive Bayes Classifiers) 

# **Step 1: Import Necessary Python Libraries**

In [12]:
import pandas as pd
import numpy as np
from math import exp, sqrt, pi
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# **Step 2: Load Dataset (EDA)**

In [13]:
# load the data from csv file placed locally in the pc
df = pd.read_csv('heart_disease_uci.csv')

# print the first few rows of the dataframe
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


# Step 3: Understanding the UCI Heart Disease Dataset

Let's now take a look at our dataset attributes and understand their meaning and significance.


| Attribute Name | Type | Description |
|-----------------------|----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| id | Discrete | Unique identity for each patient |
| age | Continuous | Represents age of the patient in years|
| sex  | Categorical | Represents male or female  <br>(1 = male, 0 = female) |
| dataset| Categorical |   Represents the place of study <br>(0:Cleveland, 1:Hungary, 2:Switzerland, 3:VA Long Beach) |
| cp| Categorical |   Represents the chest pain type <br>(0: asymptomatic, 1: atypical angina, 2: non-anginal pain, 3: typical angina) |
| trestbps | Continuous | resting blood pressure  (in mm Hg on admission to the hospital) |
| chol | Continuous | serum cholesterol  (in mg/dl) |
| fbs  | Categorical | Represents if fasting blood sugar > 120 mg/dl <br>(0 = false, 1 = true)|
| restecg  | Categorical | Represents the resting electrocardiographic results <br>(0: showing probable or definite left ventricular hypertrophy by Estesâ€™ criteria, 1: normal, 2: having ST-T wave abnormality)|
| thalach | Continuous | The maximum heart rate achieved |
| exang  | Categorical | Represents the exercise-induced angina <br>(0 = false, 1 = true)|
| oldpeak | Continuous | ST depression induced by exercise relative to rest |
| slope  | Categorical | Represents the  slope of the peak exercise ST segment <br>(0: downsloping; 1: flat; 2: upsloping)|
| ca | Continuous | number of major vessels (0-3) colored by fluoroscopy |
| thal | Categorical | Represents <br>(0 = normal, 1 = fixed defect, 2 = reversible defect) |
| num | Discrete | Represents the class label or predicted attribute where 0 indicates no heart disease and 1, 2, 3, and 4 represent the different stages of heart disease. <br>(0,1,2,3,4) |

We have a total of 14 features and our objective is to predict if the patient has a heart disease. Hence we will be building and interpreting a classification model.

In [14]:
# Number of rows (sample size) and columns in the data
df.shape 

(920, 16)

In [15]:
#check for data type per column 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [5]:
#check for number of missing data type per column using either of the two functions
#df.isna().sum()
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

# **Step 4: Scale continous variables**

In [16]:
# List of columns to scale
columns_to_scale = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

# Scale each column
for column in columns_to_scale:
    mean_value = df[column].mean()  # Calculate mean, ignoring NaNs
    std_value = df[column].std()    # Calculate standard deviation, ignoring NaNs
    df[column] = (df[column] - mean_value) / std_value  # Apply scaling



In [7]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,1.006838,Male,Cleveland,typical angina,0.674895,0.305736,True,lv hypertrophy,0.480375,False,1.302399,downsloping,-0.722891,fixed defect,0
1,2,1.431255,Male,Cleveland,asymptomatic,1.461633,0.784158,False,lv hypertrophy,-1.139603,True,0.569279,flat,2.483426,normal,2
2,3,1.431255,Male,Cleveland,asymptomatic,-0.636335,0.269628,False,lv hypertrophy,-0.329614,True,1.577319,flat,1.414653,reversable defect,1
3,4,-1.751875,Male,Cleveland,non-anginal,-0.111843,0.459192,False,normal,1.907499,False,2.402079,downsloping,-0.722891,normal,0
4,5,-1.327458,Female,Cleveland,atypical angina,-0.111843,0.043958,False,lv hypertrophy,1.328935,False,0.477639,upsloping,-0.722891,normal,0


# **Step 5: Compute Naive Bayes Classifiers Without Sklearn**

**Step 5a: Shuffle and split the dataset into training and test sets without using sklearn**

In [17]:
#Shuffle and Split the dataset
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the dataset into training and test sets (80:20)
train_size = int(0.8 * len(df_shuffled))  # 80% for training
train_set = df_shuffled[:train_size]
test_set = df_shuffled[train_size:]

# Separate features and labels 
X_train = train_set.drop(columns=['id', 'num'])  # Exclude 'id' and 'num'
y_train = train_set['num']
X_test = test_set.drop(columns=['num'])
y_test = test_set['num']

**Step 5b: Functions to compute statistics and probablities of Naive Bayes Classifier**

In [18]:
# Function to calculate the mean of a list of numbers, ignoring NaN values
def calculate_mean(numbers):
    numbers = [x for x in numbers if pd.notna(x)]  # Filter out NaN values
    if len(numbers) == 0:
        return float('nan')  # Return NaN if the list is empty
    return sum(numbers) / float(len(numbers))

# Function to calculate the standard deviation of a list of numbers, ignoring NaN values
def calculate_stdev(numbers):
    numbers = [x for x in numbers if pd.notna(x)]  # Filter out NaN values
    if len(numbers) < 2:
        return float('nan')  # Return NaN for stdev if there are less than 2 valid numbers
    avg = calculate_mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return variance ** 0.5  # Return the standard deviation

# Function to preprocess the dataset
def preprocess_data(train_set, target_column='num'):
    X_train = train_set.drop(columns=['id', target_column])  # Drop irrelevant columns
    y_train = train_set[target_column]  
    return X_train, y_train

# Function to summarize statistics by class
def summarize_class_statistics(X_train, y_train, feature_set):
    class_summaries = {}
    
    for class_value in np.unique(y_train):
        class_data = X_train[y_train == class_value]
        summaries = []
        
        for column in feature_set:
            column_data = class_data[column].dropna()  # Ignore missing values
            summaries.append((column, calculate_mean(column_data), calculate_stdev(column_data), len(column_data)))
        
        class_summaries[class_value] = summaries
    
    return class_summaries

# Function to calculate the Gaussian probability density function
def calculate_probability(x, mean, stdev):
    if stdev == 0:  # Avoid division by zero
        return 0
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


def calculate_categorical_probabilities(X_train, y_train, feature_set):
    class_probabilities = {}
    
    for class_value in np.unique(y_train):
        class_data = X_train[y_train == class_value]
        probabilities = {}
        
        for column in feature_set:
            # Drop NaN values before counting
            category_counts = class_data[column].dropna().value_counts(normalize=True)  # Normalize to get probabilities
            probabilities[column] = category_counts.to_dict()  # Convert to dictionary format
        
        class_probabilities[class_value] = probabilities
    
    return class_probabilities


def calculate_posterior_probabilities(X_train, y_train, example_data_point, class_summaries, class_probabilities):
    posterior_probabilities = {}
    
    for class_value, summaries in class_summaries.items():
        prior_probability = len(X_train[y_train == class_value]) / len(X_train)
        likelihood = 1  # Start with 1 since it's a product
        
        # Calculate likelihood for continuous features
        for feature_name, mean_val, stdev_val, _ in summaries:
            x = example_data_point[feature_name]
            if pd.notna(x):  # Check if x is not NaN
                likelihood *= calculate_probability(x, mean_val, stdev_val)
        
        # Calculate likelihood for categorical features
        for column in class_probabilities[class_value]:
            category = example_data_point[column]
            if pd.notna(category):  # Check if category is not NaN
                category_probability = class_probabilities[class_value].get(column, {}).get(category, 0)
                likelihood *= category_probability

        posterior_probability = prior_probability * likelihood
        posterior_probabilities[class_value] = posterior_probability
    
    # Normalize posterior probabilities
    total_posterior = sum(posterior_probabilities.values())
    for class_value in posterior_probabilities:
        posterior_probabilities[class_value] /= total_posterior
    
    return posterior_probabilities


    # Normalize posterior probabilities
    total_posterior = sum(posterior_probabilities.values())
    for class_value in posterior_probabilities:
        posterior_probabilities[class_value] /= total_posterior
    
    return posterior_probabilities

# Example usage
train_set = df_shuffled[:train_size]  # Load your DataFrame
feature_set_1 = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
feature_set_2 = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Preprocess the dataset
X_train, y_train = preprocess_data(train_set)

# Summarize statistics
class_summaries = summarize_class_statistics(X_train, y_train, feature_set_1)

# Calculate categorical probabilities
class_probabilities = calculate_categorical_probabilities(X_train, y_train, feature_set_2)

# Evaluate for an example data point
example_data_point = X_test.iloc[5]  # Select the third data point
#print(example_data_point)
posterior_probabilities = calculate_posterior_probabilities(X_train, y_train, example_data_point, class_summaries, class_probabilities)

# Print prior probabilities for the classes
prior_probabilities = {class_value: len(X_train[y_train == class_value]) / len(X_train) for class_value in np.unique(y_train)}
print("\nPrior Probabilities for Each Class:")
for class_value, prob in prior_probabilities.items():
    print(f'Class: {class_value}, Prior Probability: {prob:.4f}')

# Print conditional probabilities for each feature per class
print("\nConditional Probabilities Parameter for Each Continous Feature per Class:")
for class_value, summaries in class_summaries.items():
    print(f'Class: {class_value}')
    for feature_name, mean_val, stdev_val, count in summaries:
        print(f'  Feature: {feature_name}, Mean: {mean_val:.4f}, Std Dev: {stdev_val:.4f}, Count: {count}')

# Print categorical probabilities for each class
print("\nConditional Probabilities for Each Categorical Feature per Class:")
for class_value, probabilities in class_probabilities.items():
    print(f'Class: {class_value}')
    for feature_name, feature_probs in probabilities.items():
        print(f'  Feature: {feature_name}, Probabilities: {feature_probs}')

# Print posterior probabilities for the example data point
print("\nPosterior Probabilities for the Example Data Point:")
for class_value, prob in posterior_probabilities.items():
    print(f'Class: {class_value}, Posterior Probability: {prob:.4f}')



Prior Probabilities for Each Class:
Class: 0, Prior Probability: 0.4416
Class: 1, Prior Probability: 0.2908
Class: 2, Prior Probability: 0.1264
Class: 3, Prior Probability: 0.1141
Class: 4, Prior Probability: 0.0272

Conditional Probabilities Parameter for Each Continous Feature per Class:
Class: 0
  Feature: age, Mean: -0.3337, Std Dev: 1.0100, Count: 325
  Feature: trestbps, Mean: -0.1429, Std Dev: 0.8441, Count: 309
  Feature: chol, Mean: 0.2670, Std Dev: 0.6599, Count: 311
  Feature: thalch, Mean: 0.4584, Std Dev: 0.9180, Count: 309
  Feature: oldpeak, Mean: -0.4242, Std Dev: 0.6511, Count: 308
  Feature: ca, Mean: -0.3940, Std Dev: 0.7391, Count: 130
Class: 1
  Feature: age, Mean: 0.0221, Std Dev: 0.9264, Count: 214
  Feature: trestbps, Mean: 0.0587, Std Dev: 1.0489, Count: 202
  Feature: chol, Mean: -0.0345, Std Dev: 1.0973, Count: 207
  Feature: thalch, Mean: -0.2340, Std Dev: 0.9220, Count: 202
  Feature: oldpeak, Mean: 0.0561, Std Dev: 0.9915, Count: 200
  Feature: ca, Mean: 

**Step 5c: Functions to compute the confusion matrix for Naive Bayes Classifier **

In [19]:
# Function to predict the class with the highest posterior probability
def predict_class_nb(X_train, y_train, example_data_point, class_summaries, class_probabilities):
    posterior_probabilities = calculate_posterior_probabilities(X_train, y_train, example_data_point, class_summaries, class_probabilities)
    best_class = max(posterior_probabilities, key=posterior_probabilities.get)
    return best_class

# Function to compute confusion matrix for multi-class classification
def compute_confusion_matrix(X_test, y_test, X_train, y_train, class_summaries, class_probabilities):
    # Initialize counts for True Positives, True Negatives, False Positives, and False Negatives
    True_positives = [0] * 5
    True_negatives = [0] * 5
    False_positives = [0] * 5
    False_negatives = [0] * 5

    for i in range(len(X_test)):
        example_data_point = X_test.iloc[i]
        actual_class = y_test.iloc[i]
        predicted_class = predict_class_nb(X_train, y_train, example_data_point, class_summaries, class_probabilities)

        # Confusion matrix based on predicted and actual classes
        if predicted_class == actual_class:
            True_positives[actual_class] += 1
        else:
            False_positives[predicted_class] += 1
            False_negatives[actual_class] += 1

    # Compute True Negatives for each class
    for j in range(5):
        True_negatives[j] = len(X_test) - (True_positives[j] + False_positives[j] + False_negatives[j])

    # Calculate performance metrics for each class
    class_specific_accuracy = [(True_positives[j] + True_negatives[j]) / (True_positives[j] + True_negatives[j] + False_positives[j] + False_negatives[j]) if (True_positives[j] + True_negatives[j] + False_positives[j] + False_negatives[j]) > 0 else 0 for j in range(5)]
    class_specific_precision = [(True_positives[j]) / (True_positives[j] + False_positives[j]) if (True_positives[j] + False_positives[j]) > 0 else 0 for j in range(5)]
    class_specific_recall = [(True_positives[j]) / (True_positives[j] + False_negatives[j]) if (True_positives[j] + False_negatives[j]) > 0 else 0 for j in range(5)]
    class_specific_FScore = [(2 * class_specific_precision[j] * class_specific_recall[j]) / (class_specific_precision[j] + class_specific_recall[j]) if (class_specific_precision[j] + class_specific_recall[j]) > 0 else 0 for j in range(5)]

    # Compute overall model performance
    Average_accuracy = sum(class_specific_accuracy) / 5
    Average_precision = sum(class_specific_precision) / 5
    Average_recall = sum(class_specific_recall) / 5
    Average_FScore = sum(class_specific_FScore) / 5

    return Average_accuracy, Average_precision, Average_recall, Average_FScore, class_specific_accuracy, class_specific_precision, class_specific_recall, class_specific_FScore


# Split the test set
test_set = df_shuffled[train_size:]
X_test, y_test = preprocess_data(test_set)

# Compute the confusion matrix
Average_accuracy, Average_precision, Average_recall, Average_FScore, class_specific_accuracy, class_specific_precision, class_specific_recall, class_specific_FScore = compute_confusion_matrix(X_test, y_test, X_train, y_train, class_summaries, class_probabilities)

# Print Precision, Recall, and F1 Score for each class
for i in range(5):
    print(f'Class {i} - Precision: {class_specific_precision[i]:.2f}, Recall: {class_specific_recall[i]:.2f}, F1 Score: {class_specific_FScore[i]:.2f}')
print("  ")
    # Print the overall accuracy
print(f'Overall Accuracy on the test set: {Average_accuracy:.2f}')

# Print Average Precision, Recall, and F1 Score
print(f'Average Precision: {Average_precision:.2f}')
print(f'Average Recall: {Average_recall:.2f}')
print(f'Average F1 Score: {Average_FScore:.2f}')
print(f'Average F1 Score: {Average_accuracy:.2f}')

Class 0 - Precision: 0.86, Recall: 0.74, F1 Score: 0.80
Class 1 - Precision: 0.51, Recall: 0.57, F1 Score: 0.54
Class 2 - Precision: 0.12, Recall: 0.19, F1 Score: 0.15
Class 3 - Precision: 0.29, Recall: 0.30, F1 Score: 0.30
Class 4 - Precision: 0.20, Recall: 0.12, F1 Score: 0.15
  
Overall Accuracy on the test set: 0.83
Average Precision: 0.40
Average Recall: 0.39
Average F1 Score: 0.39
Average F1 Score: 0.83
