In [23]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.metrics import log_likelihood_score
from pgmpy.estimators import ParameterEstimator
from pgmpy.factors.discrete.CPD import TabularCPD
from pgmpy.estimators import MaximumLikelihoodEstimator

In [24]:
# Read the dataset and discard the leftover "Unnamed: 0" column in a single
# chained expression, so `df` is bound exactly once per run of this cell.
df = pd.read_csv("transport.csv").drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,Age,Income,Education,Residency,Transport
0,middle,low,high,big,train
1,young,low,low,big,car
2,middle,low,low,small,car
3,old,low,high,big,car
4,middle,medium,high,small,car


In [26]:
# Network structure, one directed edge (parent, child) per line.
model = BayesianNetwork(
    [
        ("Age", "Education"),
        ("Education", "Income"),
        # Transport has two parents: Income and Residency.
        ("Income", "Transport"),
        ("Residency", "Transport"),
    ]
)

In [27]:
pe = ParameterEstimator(model, df)

# Age and Residency have no parents in the network, so their counts are
# unconditional; Education, Income and Transport are counted per parent
# configuration.
for variable in ("Age", "Residency", "Education", "Income", "Transport"):
    print("\n", pe.state_counts(variable))



         count
Age          
middle    538
old       200
young     262

            count
Residency       
big          638
small        362

 Age       middle  old young
Education                  
high         172   69    83
low          366  131   179

 Education high  low
Income             
high        64   66
low        141  440
medium     119  170

 Income    high        low       medium      
Residency  big small  big small    big small
Transport                                   
car         41    46  167    98     84    81
train       34     9  232    84     80    44


In [28]:
def print_full(cpd):
    """Print ``cpd`` with ``TabularCPD._truncate_strtable`` disabled.

    The class attribute ``TabularCPD._truncate_strtable`` is temporarily
    replaced by an identity function while ``cpd`` is printed, and the
    original attribute is put back afterwards. Presumably this keeps wide
    CPD tables from being shortened in the printed output (verify against
    the installed pgmpy version).

    Parameters
    ----------
    cpd : object
        The object to print; expected to be a ``TabularCPD`` instance.

    Returns
    -------
    None
        The table is written to stdout; nothing is returned, so the call
        should not be wrapped in ``print()``.
    """
    backup = TabularCPD._truncate_strtable
    TabularCPD._truncate_strtable = lambda self, x: x
    try:
        print(cpd)
    finally:
        # The patch is applied on the class, so it affects every CPD in the
        # session. Restore it even if printing raises, otherwise the
        # override would silently persist for the rest of the kernel.
        TabularCPD._truncate_strtable = backup

In [29]:
mle = MaximumLikelihoodEstimator(model, df)

# Age and Residency have no parents in the network.
print(mle.estimate_cpd("Age"))
print(mle.estimate_cpd("Residency"))

# Education and Income each have a single parent.
print(mle.estimate_cpd("Education"))
print(mle.estimate_cpd("Income"))

# Transport has two parents (Income, Residency). print_full prints the table
# itself and returns None, so wrapping it in print() would emit a stray
# "None" line after the table.
print_full(mle.estimate_cpd("Transport"))

+-------------+-------+
| Age(middle) | 0.538 |
+-------------+-------+
| Age(old)    | 0.2   |
+-------------+-------+
| Age(young)  | 0.262 |
+-------------+-------+
+------------------+-------+
| Residency(big)   | 0.638 |
+------------------+-------+
| Residency(small) | 0.362 |
+------------------+-------+
+-----------------+---------------------+----------+---------------------+
| Age             | Age(middle)         | Age(old) | Age(young)          |
+-----------------+---------------------+----------+---------------------+
| Education(high) | 0.31970260223048325 | 0.345    | 0.31679389312977096 |
+-----------------+---------------------+----------+---------------------+
| Education(low)  | 0.6802973977695167  | 0.655    | 0.683206106870229   |
+-----------------+---------------------+----------+---------------------+
+----------------+---------------------+---------------------+
| Education      | Education(high)     | Education(low)      |
+----------------+------------------

In [31]:
# Estimate the CPDs of every node in the network from the data set.
model.fit(data=df, estimator=MaximumLikelihoodEstimator)

In [32]:
# Score the fitted model against the same data it was trained on.
log_likelihood_score(model=model, data=df)

-3875.6797856721137