In [1]:
import numpy as np
import pandas as pd

def pre_processing(df):
    """Partition a DataFrame into a feature matrix and a target column.

    The last column of ``df`` is taken as the target; every remaining
    column is kept as a feature. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose final column holds the target values.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without its last column and ``y``
        is that last column.
    """
    target_col = df.columns[-1]
    features = df.drop(columns=[target_col])
    target = df[target_col]
    return features, target



# Load the heart-disease dataset. The file is whitespace-delimited despite its
# .csv extension; sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the file the same way.
df = pd.read_csv('heart.csv', sep=r'\s+')
print('Dataset Preview:')
print(df)

# Split into features (all columns but the last) and target (the last column)

X, y = pre_processing(df)

Dataset Preview:
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  HD  
0        0   0     1   1  
1        0   0  

  df = pd.read_csv('heart.csv', delim_whitespace=True)


In [2]:
# Columns treated as continuous; each one is replaced by an integer bin index.
continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
# Work on a copy so the original feature frame X is left untouched.
X_discrete = X.copy()
for col in continuous_cols:
    # bins=5 splits the column's range into five equal-width intervals;
    # labels=False yields the integer bin index rather than an Interval label.
    X_discrete[col] = pd.cut(X[col], bins=5, labels=False)
# NOTE(review): in the cells visible here X_discrete is only displayed; the
# likelihood table and the predictions are built from X — confirm intent.

In [3]:
# Display the binned feature frame to inspect the result of the discretization.
X_discrete

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,3,1,3,2,1,1,0,3,0,1,0,0,1
1,0,1,2,1,1,0,1,4,0,2,0,0,2
2,1,0,1,1,0,0,0,3,0,1,2,0,2
3,2,1,1,1,1,0,1,4,0,0,2,0,2
4,2,0,0,1,2,0,1,3,1,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,2,0,0,2,1,0,1,1,1,0,1,0,3
299,1,1,3,0,1,0,1,2,0,0,1,0,3
300,4,1,0,2,0,1,1,2,0,2,1,2,3
301,2,1,0,1,0,0,1,1,1,0,1,1,3


In [5]:
def calculate_prior_probabilities(y):
    """Estimate the prior probability of every class found in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Target labels.

    Returns
    -------
    dict
        Maps each class label to its relative frequency, i.e. the number of
        occurrences divided by ``len(y)``. Keys follow the ordering produced
        by ``value_counts()``.
    """
    n_samples = len(y)
    occurrences = y.value_counts().to_dict()
    return {label: freq / n_samples for label, freq in occurrences.items()}

# Estimate the class priors P(y) from the target column
priors = calculate_prior_probabilities(y)
print('Prior Probabilities P(y):')

# One line per class; 'HD' is the name of the target column in the dataset
for cls, prob in priors.items():
    print(f'P(HD={cls}) = {prob}')

Prior Probabilities P(y):
P(HD=1) = 0.5445544554455446
P(HD=0) = 0.45544554455445546


In [6]:
# Count (feature value, class) co-occurrences used to build the likelihoods
def calculate_class_counts(X, y):
    """Count how often each feature value occurs together with each class.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; every distinct value in a column is one category.
    y : pandas.Series
        Class labels, aligned with the rows of ``X``.

    Returns
    -------
    dict
        Nested mapping ``counts[feature][value][cls]`` giving the number of
        rows where ``X[feature] == value`` and ``y == cls``. Every distinct
        value of every feature gets an entry for every class in ``y``
        (possibly 0). Counts are plain Python ``int``.
    """
    # The class masks do not depend on the feature or value, so build each
    # one once instead of recomputing it inside the nested loops.
    class_masks = {cls: (y == cls) for cls in y.unique()}

    class_counts = {}
    for feature in X.columns:
        column = X[feature]
        per_value = {}
        for value in column.unique():
            value_mask = column == value
            # Vectorized count; int() keeps the result a built-in int.
            per_value[value] = {
                cls: int((value_mask & cls_mask).sum())
                for cls, cls_mask in class_masks.items()
            }
        class_counts[feature] = per_value
    return class_counts

In [7]:
# Calculate likelihood probabilities P(feature = value | class)
def calculate_likelihoods(X, y, alpha=0):
    """Build the conditional likelihood table for a categorical Naive Bayes.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; every distinct value in a column is one category.
    y : pandas.Series
        Class labels, aligned with the rows of ``X``.
    alpha : int or float, optional
        Additive (Laplace) smoothing strength. With the default ``0`` the
        result is the raw relative frequency ``count / class_total``, so a
        value never seen with a class gets likelihood 0. With ``alpha > 0``
        each entry becomes
        ``(count + alpha) / (class_total + alpha * n_values)``, where
        ``n_values`` is the number of distinct values of that feature.

    Returns
    -------
    dict
        Nested mapping ``table[feature][value][cls]`` holding the estimate
        of ``P(feature == value | y == cls)``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    class_counts = calculate_class_counts(X, y)
    # Number of training rows per class (denominator of each likelihood).
    class_totals = {cls: int((y == cls).sum()) for cls in y.unique()}

    likelihood_table = {}
    # Walk the count table directly: it already holds one entry per feature,
    # per distinct value, per class, in the same order as X.columns.
    for feature, counts_by_value in class_counts.items():
        n_values = len(counts_by_value)
        likelihood_table[feature] = {
            value: {
                cls: (count + alpha) / (class_totals[cls] + alpha * n_values)
                for cls, count in counts_by_class.items()
            }
            for value, counts_by_class in counts_by_value.items()
        }
    return likelihood_table

In [8]:
# Calculate and display the likelihood table P(feature = value | class)
# NOTE(review): the table is built from the raw feature frame X, so the
# continuous columns are keyed by exact raw values (e.g. individual ages).
# The binned frame X_discrete is not used here — confirm whether it should be.
print('Likelihood Table')
likelihood_table = calculate_likelihoods(X, y)
likelihood_table

Likelihood Table


{'age': {np.int64(63): {np.int64(1): 0.01818181818181818,
   np.int64(0): 0.043478260869565216},
  np.int64(37): {np.int64(1): 0.012121212121212121, np.int64(0): 0.0},
  np.int64(41): {np.int64(1): 0.05454545454545454,
   np.int64(0): 0.007246376811594203},
  np.int64(56): {np.int64(1): 0.030303030303030304,
   np.int64(0): 0.043478260869565216},
  np.int64(57): {np.int64(1): 0.04242424242424243,
   np.int64(0): 0.07246376811594203},
  np.int64(44): {np.int64(1): 0.048484848484848485,
   np.int64(0): 0.021739130434782608},
  np.int64(52): {np.int64(1): 0.05454545454545454,
   np.int64(0): 0.028985507246376812},
  np.int64(54): {np.int64(1): 0.06060606060606061,
   np.int64(0): 0.043478260869565216},
  np.int64(48): {np.int64(1): 0.024242424242424242,
   np.int64(0): 0.021739130434782608},
  np.int64(49): {np.int64(1): 0.01818181818181818,
   np.int64(0): 0.014492753623188406},
  np.int64(64): {np.int64(1): 0.03636363636363636,
   np.int64(0): 0.028985507246376812},
  np.int64(58): {np.

In [17]:
# Quick check that X is a DataFrame (the predictor iterates it with .iterrows()).
# NOTE(review): looks like a debugging leftover, and its execution count is out
# of sequence with the surrounding cells — consider removing.
type(X)

pandas.core.frame.DataFrame

In [9]:
def predict_naive_bayes(X_new, priors, likelihood_table):
    """Predict a class label for every row of ``X_new``.

    For each row and each class, the score is the class prior multiplied by
    the likelihood of every feature value in the row. A feature value that
    is missing from ``likelihood_table[feature]`` contributes nothing to the
    product, and a class missing for a known value contributes a factor of
    1.0. The class with the largest score is chosen; on ties the class that
    comes first in ``priors`` wins.

    Parameters
    ----------
    X_new : pandas.DataFrame
        Samples to classify; column names must be keys of ``likelihood_table``.
    priors : dict
        Maps class label to prior probability.
    likelihood_table : dict
        Nested mapping ``table[feature][value][cls]`` of likelihoods.

    Returns
    -------
    list
        One predicted class label per row, in row order.
    """

    def unnormalized_posterior(row, label):
        # Start from the prior and fold in one likelihood per known value.
        score = priors[label]
        for feature, value in row.items():
            per_value = likelihood_table[feature]
            if value in per_value:
                score *= per_value[value].get(label, 1.0)
        return score

    predicted = []
    for _, row in X_new.iterrows():
        scores = {label: unnormalized_posterior(row, label) for label in priors}
        predicted.append(max(scores, key=scores.get))
    return predicted

# Make predictions on the training data (the same X used to build the table)
predictions = predict_naive_bayes(X, priors, likelihood_table)

# Calculate accuracy: `predictions == y` compares the list element-wise with
# the Series, and summing the booleans counts the matches.
# This is accuracy on the training rows themselves, not on held-out data.
accuracy = sum(predictions == y) / len(y)
print(f'Training Accuracy: {accuracy:.2f}')

Training Accuracy: 0.96


In [30]:
# List the feature names, used as the keys of the hand-built sample below.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [31]:
# Hand-built sample with every feature set to 0, keyed by the feature names.
# NOTE(review): zeros look like bin indices rather than raw measurements, while
# likelihood_table is built from raw X; values absent from the table are
# skipped by predict_naive_bayes — confirm this is the intended input.
x = {'age':0,'sex':0,'cp':0,'trestbps':0, 'chol':0, 'fbs':0, 'restecg':0, 'thalach':0,
       'exang':0, 'oldpeak':0, 'slope':0, 'ca':0, 'thal':0}
# Wrap the dict in a one-row DataFrame, since the predictor iterates over rows.
x = pd.DataFrame([x])
y_p = predict_naive_bayes(x,priors,likelihood_table)

In [32]:
# Show the predicted class label(s) for the hand-built sample.
y_p

[1]