In [1]:
import numpy as np
import pandas as pd

def pre_processing(df):

	""" partioning data into features and target """

	X = df.drop([df.columns[-1]], axis = 1)
	y = df[df.columns[-1]]

	return X, y



# Load the weather dataset
df = pd.read_csv('heart.csv', delim_whitespace=True)
print('Dataset Preview:')
print(df)

# Split into features and target

X, y = pre_processing(df)

Dataset Preview:
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  HD  
0        0   0     1   1  
1        0   0  

  df = pd.read_csv('heart.csv', delim_whitespace=True)


In [5]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [6]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: HD, Length: 303, dtype: int64

In [9]:
continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
X_discrete = X.copy()
for col in continuous_cols:
    X_discrete[col] = pd.cut(X[col], bins=2, labels=False)

In [10]:
X_discrete

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1,1,3,0,0,1,0,1,0,0,0,0,1
1,0,1,2,0,0,0,1,1,0,1,0,0,2
2,0,0,1,0,0,0,0,1,0,0,2,0,2
3,1,1,1,0,0,0,1,1,0,0,2,0,2
4,1,0,0,0,1,0,1,1,1,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1,0,0,0,0,0,1,0,1,0,1,0,3
299,0,1,3,0,0,0,1,0,0,0,1,0,3
300,1,1,0,0,0,1,1,1,0,1,1,2,3
301,1,1,0,0,0,0,1,0,1,0,1,1,3


In [11]:
y.value_counts().to_dict()

{1: 165, 0: 138}

In [12]:
def calculate_prior_probabilities(y):
    counts =  y.value_counts().to_dict()
    total_samples = len(y)
    for cls in counts.keys():
        counts[cls] /= total_samples
    return counts

# Calculate prior probabilities
priors = calculate_prior_probabilities(y)
print('Prior Probabilities P(y):')

for cls, prob in priors.items():
    print(f'P(HD={cls}) = {prob}')

Prior Probabilities P(y):
P(HD=1) = 0.5445544554455446
P(HD=0) = 0.45544554455445546


In [13]:
def calculate_likelihood(X, y):
    likelihood = {}
    for cls in np.unique(y):
        X_cls = X[y == cls]
        likelihood[cls] = {}
        for i in range(X.shape[1]):
            feature_values, counts = np.unique(X_cls[:, i], return_counts=True)
            likelihood[cls][i] = dict(zip(feature_values, counts / len(X_cls)))
    return likelihood

In [14]:
calculate_likelihood(X_discrete.values,y)

{np.int64(0): {0: {np.int64(0): np.float64(0.2753623188405797),
   np.int64(1): np.float64(0.7246376811594203)},
  1: {np.int64(0): np.float64(0.17391304347826086),
   np.int64(1): np.float64(0.8260869565217391)},
  2: {np.int64(0): np.float64(0.7536231884057971),
   np.int64(1): np.float64(0.06521739130434782),
   np.int64(2): np.float64(0.13043478260869565),
   np.int64(3): np.float64(0.050724637681159424)},
  3: {np.int64(0): np.float64(0.782608695652174),
   np.int64(1): np.float64(0.21739130434782608)},
  4: {np.int64(0): np.float64(0.9782608695652174),
   np.int64(1): np.float64(0.021739130434782608)},
  5: {np.int64(0): np.float64(0.8405797101449275),
   np.int64(1): np.float64(0.15942028985507245)},
  6: {np.int64(0): np.float64(0.572463768115942),
   np.int64(1): np.float64(0.4057971014492754),
   np.int64(2): np.float64(0.021739130434782608)},
  7: {np.int64(0): np.float64(0.427536231884058),
   np.int64(1): np.float64(0.572463768115942)},
  8: {np.int64(0): np.float64(0.4492

In [15]:
def predict_instance(x, prior, likelihood):
    posteriors = {}
    for cls, cls_prior in prior.items():
        posterior = cls_prior 
        for i in range(len(x)):
            feature_value = x[i]
            if feature_value in likelihood[cls][i]:
                posterior *= likelihood[cls][i][feature_value]
            else:
                var = posterior + 1e-6  # Small value to handle unseen feature values
        posteriors[cls] = posterior
    return max(posteriors, key=posteriors.get)


In [16]:
# Train the model
prior = calculate_prior_probabilities(y)
likelihood = calculate_likelihood(X_discrete.values, y)

# Make predictions
y_pred = [predict_instance(x, prior, likelihood) for x in X_discrete.values]


In [21]:
# Calculate accuracy
accuracy = sum(y_pred == y) / len(y)
print(f'Training Accuracy: {accuracy:.2f}')

Training Accuracy: 0.86


In [22]:
x = [0,0,0,1,0,1,0,0,0,0,0,0,0]
predict_instance(x,prior,likelihood)

0

1