In [1]:
import numpy as np
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator 
#imports the MLE estimator used to learn Conditional Probability Distributions (CPDs) from data.
from pgmpy.inference import VariableElimination
#imports the inference algorithm (Variable Elimination) for querying the trained network.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Read the heart disease dataset.
# NOTE(review): absolute, machine-specific path — point this at your own copy
# of heart.csv (ideally a path relative to the notebook) before running.
heart_csv_path = r"C:\Users\porje\Downloads\heart.csv"

# Treat '?' placeholders as missing values while parsing. Doing this at read
# time lets pandas parse the affected columns as numeric with NaN; replacing
# '?' after loading would leave those columns as strings, which pd.cut cannot
# bin in the discretization step.
heartDisease = pd.read_csv(heart_csv_path, na_values='?')

# 2. Rename 'target' column to 'heartdisease' so it matches the network node
# name. rename() ignores labels that are not present, so no existence check
# is needed.
heartDisease = heartDisease.rename(columns={'target': 'heartdisease'})

print("\nFew rows of dataset:")
print(heartDisease.head())


Few rows of dataset:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  heartdisease  
0   0     1             1  
1   0     2             1  
2   0     2             1  
3   0     2             1  
4   0     2             1  


In [3]:
# Inspect the column labels of the loaded frame; as the last expression in the
# cell, the resulting Index is shown as the cell's output.
heartDisease.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'heartdisease'],
      dtype='object')

In [4]:
# 3. Discretize the continuous columns
# pd.cut splits each listed column's numeric values into 5 bins; labels=False
# makes it return the integer code of the bin instead of an interval label.
# The binned codes overwrite the original values in heartDisease.
discrete_cols = ['age','trestbps','chol','thalach','oldpeak']
heartDisease[discrete_cols] = heartDisease[discrete_cols].apply(
    lambda column: pd.cut(column, bins=5, labels=False)
)

In [None]:
# 4. Define Bayesian Network Structure
# Every (parent, child) tuple is one directed edge of the network.
# Example: ('trestbps','heartdisease') models that resting blood pressure
# (trestbps) influences the probability of heart disease.
network_edges = [
    # edges into trestbps / fbs
    ('age','trestbps'),
    ('age','fbs'),
    ('sex','trestbps'),
    ('exang','trestbps'),
    # edges into heartdisease
    ('trestbps','heartdisease'),
    ('fbs','heartdisease'),
    # edges out of heartdisease
    ('heartdisease','restecg'),
    ('heartdisease','thalach'),
    ('heartdisease','chol'),
]
model = DiscreteBayesianNetwork(network_edges)

In [None]:
# 5. Train the model using Maximum Likelihood Estimator
# fit() learns one Conditional Probability Distribution (CPD) per network node
# from the columns of heartDisease.
print("\nLearning CPDs using Maximum Likelihood Estimator...")
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

# Validate the learned network before reporting success: check_model() raises
# if the CPDs are inconsistent with the network structure, so the success
# message is only printed for a usable model.
model.check_model()
print("✅ Model Trained Successfully!")

# The inference object (VariableElimination) is created in the
# "Perform Inference" cell that follows, so it is not duplicated here.

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'sex': 'N', 'cp': 'N', 'trestbps': 'N', 'chol': 'N', 'fbs': 'N', 'restecg': 'N', 'thalach': 'N', 'exang': 'N', 'oldpeak': 'N', 'slope': 'N', 'ca': 'N', 'thal': 'N', 'heartdisease': 'N'}



Learning CPDs using Maximum Likelihood Estimator...
✅ Model Trained Successfully!

Inferencing with Bayesian Network:


In [22]:
# 6. Perform Inference
# VariableElimination wraps the trained model in an inference object that
# answers probabilistic queries using the Variable Elimination algorithm.
print("\nInferencing with Bayesian Network:")
infer = VariableElimination(model)


def _posterior_given(**evidence):
    """Return the distribution over 'heartdisease' given the evidence states."""
    return infer.query(variables=['heartdisease'], evidence=evidence)


# Query 1: Probability of Heart Disease given Age
print("\n  Probability of HeartDisease given age = 2 (binned value)")
q1 = _posterior_given(age=2)
print(q1)

# Query 2: Probability of Heart Disease given Cholesterol
print("\n  Probability of HeartDisease given chol = 1 (binned value)")
q2 = _posterior_given(chol=1)
print(q2)


Inferencing with Bayesian Network:

  Probability of HeartDisease given age = 2 (binned value)
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.4507 |
+-----------------+---------------------+
| heartdisease(1) |              0.5493 |
+-----------------+---------------------+

  Probability of HeartDisease given chol = 1 (binned value)
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.4803 |
+-----------------+---------------------+
| heartdisease(1) |              0.5197 |
+-----------------+---------------------+


Column names must match the network nodes exactly (case sensitive). We renamed target → heartdisease earlier to avoid mismatches.

NaNs: If heartDisease still has NaNs (missing values), model.fit may fail. Handle them before discretizing/fitting with heartDisease = heartDisease.dropna() or heartDisease = heartDisease.fillna(<value>); both return a new DataFrame, so the result must be assigned back. Use caution: dropping rows reduces the data; filling requires sensible defaults.

Binning choices matter. pd.cut(..., bins=5) is simple equal-width binning. For better models, consider domain-aware bins, pd.qcut (quantile bins), or supervised discretization.

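Evidence values must be discrete states that exist in the CPDs. For a binned column such as age, evidence must be an integer bin code 0..4 (for 5 bins); for columns that were not binned, use the codes that appear in the data (e.g., sex is 0 or 1). If you pass a raw numeric value for a binned column (e.g., age = 28), that state does not exist in the model, so inference will likely fail or return empty results.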

Interpreting results: The printed query result shows probabilities for each state of heartdisease (e.g., heartdisease(0), heartdisease(1), ...). Map these states back to what they mean in your dataset (often 0 = no disease, 1 = disease — but verify the encoding).

If you want to use continuous variables directly (no binning), you need a different model (continuous BNs or hybrid models) — pgmpy’s discrete networks can’t handle continuous CPDs directly.