PART 1

In [35]:
import pandas as p

'''
    1. mpg:           continuous
    2. cylinders:     multi-valued discrete
    3. displacement:  continuous
    4. horsepower:    continuous
    5. weight:        continuous
    6. acceleration:  continuous
    7. model year:    multi-valued discrete
    8. origin:        multi-valued discrete
    9. car name:      string (unique for each instance)
'''

def discretizer(frame, continousVariables):
    for var in continousVariables:
        frame[var] = p.cut(data[var], bins= [0, data[var].median(), data[var].max()], labels= ['low', 'high'])
    return frame

conVars = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']

data = p.read_csv('auto-mpg.csv')

data = discretizer(data, conVars)
data.astype('category')
data = data.drop('car name', axis=1)
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,low,8,high,high,high,low,70,1
1,low,8,high,high,high,low,70,1
2,low,8,high,high,high,low,70,1
3,low,8,high,high,high,low,70,1
4,low,8,high,high,high,low,70,1
...,...,...,...,...,...,...,...,...
94,high,4,low,low,low,low,74,2
95,high,4,low,low,low,high,74,3
96,high,4,low,low,low,high,74,3
97,high,4,low,low,low,high,74,2


PART 2

In [46]:
from pgmpy.estimators import PC

# estimatedDAG = PC(data.loc[:, data.columns != 'car name'])
#estimatedDAG.estimate().edges()
#https://pgmpy.org/_modules/pgmpy/estimators/PC.html
est = PC(data)
skel, seperating_sets = est.build_skeleton(significance_level=0.01, ci_test = "independence_match", max_cond_vars= 3)
print("Undirected edges: ", skel.edges())

pdag = est.skeleton_to_pdag(skel, seperating_sets)
print("PDAG edges:       ", pdag.edges())

model = pdag.to_dag()
print("DAG edges:        ", model.edges())



TypeError: argument of type 'NoneType' is not iterable

PART 2B

In [47]:
from pgmpy.estimators import HillClimbSearch as hcs
from pgmpy.estimators import BDeuScore, K2Score, BicScore
climb = hcs(data)
bestBICHCmodel = climb.estimate(scoring_method=BicScore(data))
print(bestBICHCmodel.edges())


[A
[A
[A
[A
  0%|          | 7/1000000 [00:00<33:22:24,  8.32it/s]

[('mpg', 'horsepower'), ('cylinders', 'displacement'), ('cylinders', 'weight'), ('cylinders', 'mpg'), ('displacement', 'origin'), ('displacement', 'model year'), ('horsepower', 'acceleration')]





In [39]:

bestBDSHCModel = climb.estimate(scoring_method=BDeuScore(data))
print(bestBDSHCModel.edges())





[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



  0%|          | 17/1000000 [00:02<41:13:10,  6.74it/s]

[('mpg', 'weight'), ('mpg', 'model year'), ('cylinders', 'weight'), ('cylinders', 'mpg'), ('cylinders', 'horsepower'), ('cylinders', 'origin'), ('cylinders', 'acceleration'), ('cylinders', 'model year'), ('displacement', 'cylinders'), ('displacement', 'horsepower'), ('displacement', 'weight'), ('displacement', 'origin'), ('displacement', 'mpg'), ('horsepower', 'mpg'), ('horsepower', 'acceleration'), ('horsepower', 'weight'), ('weight', 'origin')]





In [40]:
bestK2HCModel = climb.estimate(scoring_method=K2Score(data))
print(bestK2HCModel.edges())





[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



  0%|          | 12/1000000 [00:01<30:51:21,  9.00it/s]

[('mpg', 'horsepower'), ('mpg', 'model year'), ('cylinders', 'displacement'), ('cylinders', 'weight'), ('cylinders', 'mpg'), ('cylinders', 'horsepower'), ('cylinders', 'acceleration'), ('cylinders', 'model year'), ('displacement', 'origin'), ('displacement', 'mpg'), ('horsepower', 'acceleration'), ('weight', 'mpg')]





PART 3

3 A

In [56]:
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel as bm
mleModel = bm(bestBICHCmodel.edges())
mle = MaximumLikelihoodEstimator(mleModel, data)

from pgmpy.factors.discrete.CPD import TabularCPD
#https://stackoverflow.com/questions/70625490/how-to-print-the-printing-full-cpd-from-pgmpy
#credit for printing untruncated tables to console...
def print_full(cpd):
    backup = TabularCPD._truncate_strtable
    TabularCPD._truncate_strtable = lambda self, x: x
    print(cpd)
    TabularCPD._truncate_strtable = backup

for var in list(data.columns):
    print_full(mle.estimate_cpd(var))

+-----------+--------------+----------------------+--------------+--------------+
| cylinders | cylinders(3) | cylinders(4)         | cylinders(6) | cylinders(8) |
+-----------+--------------+----------------------+--------------+--------------+
| mpg(high) | 0.0          | 0.9583333333333334   | 0.25         | 0.0          |
+-----------+--------------+----------------------+--------------+--------------+
| mpg(low)  | 1.0          | 0.041666666666666664 | 0.75         | 1.0          |
+-----------+--------------+----------------------+--------------+--------------+
+--------------+----------+
| cylinders(3) | 0.010101 |
+--------------+----------+
| cylinders(4) | 0.484848 |
+--------------+----------+
| cylinders(6) | 0.121212 |
+--------------+----------+
| cylinders(8) | 0.383838 |
+--------------+----------+
+--------------------+--------------+--------------+---------------------+--------------+
| cylinders          | cylinders(3) | cylinders(4) | cylinders(6)        | cylinders

