# pandas==0.21.1 and pgmpy from pip install

In [1]:
# Pin pandas to 0.21.1, the version whose behaviour this notebook examines.
!pip install pandas==0.21.1

Looking in indexes: https://pypi.org/simple, https://quantumblack.jfrog.io/quantumblack/api/pypi/pypi-qb/simple


In [2]:
# Check which pandas version pip reports as installed.
!pip freeze | grep pandas

pandas==0.21.1


In [3]:
!pip install pgmpy

Looking in indexes: https://pypi.org/simple, https://quantumblack.jfrog.io/quantumblack/api/pypi/pypi-qb/simple
Collecting pgmpy
[31mqb-bns 0.1.5 has requirement pandas>=0.22.0, but you'll have pandas 0.21.1 which is incompatible.[0m
Installing collected packages: pgmpy
Successfully installed pgmpy-0.1.6


In [4]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Seed NumPy's global RNG so the synthetic sample is identical on every run.
np.random.seed(1)

# 1000 observations of five binary (0/1) variables named A-E.
binary_draws = np.random.randint(0, 2, size=(1000, 5))
values = pd.DataFrame(binary_draws, columns=list('ABCDE'))

# The first 800 rows are used for fitting, the remaining 200 for prediction.
train_data = values.iloc[:800]
predict_data = values.iloc[800:]
# Network structure: A -> B <- C, C -> D, B -> E.
edges = [('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')]
model = BayesianModel(edges)

# Estimate the CPDs from the training rows using the Bayesian estimator
# with a BDeu prior.
model.fit(train_data, estimator=BayesianEstimator, prior_type="BDeu")


  return f(*args, **kwds)


The CPDs for the last two variables are estimated incorrectly (all entries are 0.5), so the model predicts only a single class.

In [5]:
# Show every CPD attached to the fitted model, one table after another.
fitted_cpds = model.get_cpds()
for fitted_cpd in fitted_cpds:
    print(fitted_cpd)

╒══════╤══════════╕
│ A(0) │ 0.517391 │
├──────┼──────────┤
│ A(1) │ 0.482609 │
╘══════╧══════════╛
╒══════╤═════════════════════╤════════════════════╤═════════════════════╤════════════════════╕
│ A    │ A(0)                │ A(0)               │ A(1)                │ A(1)               │
├──────┼─────────────────────┼────────────────────┼─────────────────────┼────────────────────┤
│ C    │ C(0)                │ C(1)               │ C(0)                │ C(1)               │
├──────┼─────────────────────┼────────────────────┼─────────────────────┼────────────────────┤
│ B(0) │ 0.5100376411543287  │ 0.5138089758342923 │ 0.49745222929936306 │ 0.4869960988296489 │
├──────┼─────────────────────┼────────────────────┼─────────────────────┼────────────────────┤
│ B(1) │ 0.48996235884567124 │ 0.4861910241657077 │ 0.5025477707006369  │ 0.5130039011703511 │
╘══════╧═════════════════════╧════════════════════╧═════════════════════╧════════════════════╛
╒══════╤══════════╕
│ C(0) │ 0.491304 │
├────

In [6]:
# Build the prediction inputs by removing the target column 'E'.
# drop() without inplace returns a new frame, and errors='ignore' keeps the
# cell idempotent: re-running it after 'E' has already been removed is a
# no-op instead of raising.
predict_data = predict_data.drop('E', axis=1, errors='ignore')

# Predict the missing variable (E) for every held-out row.
y_pred = model.predict(predict_data)

In [7]:
# Preview the first few predictions for E.
y_pred.head()

Unnamed: 0,E
800,0
801,0
802,0
803,0
804,0


This seems to be due to a miscalculation in the state counts. When a variable has only a single parent, the state counts come back as all zeros.

In [8]:
from pgmpy.estimators import ParameterEstimator
# Wrap the model and the training data in a ParameterEstimator so that its
# state counts can be inspected directly.
estimator = ParameterEstimator(model, train_data)

In [9]:
# Returns all-zero counts for E, which has only a single parent (B)
estimator.state_counts('E')

B,0,1
E,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,0.0
1,0.0,0.0


In [10]:
# Returns the correct state counts for B, which has multiple parents (A and C)
estimator.state_counts('B')

A,0,0,1,1
C,0,1,0,1
B,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,101,111,97,93
1,97,105,98,98


The cause might be `pd.MultiIndex.from_product`. In pandas 0.21.1, `pd.MultiIndex.from_product` still generates a MultiIndex object even when there is only a single parent. In pandas 0.19.2, however, it generates an Int64Index in this case (see pandas==0.19.2_pgmpy_pip.ipynb), and the state_counts table is correct.

Here's what it does in `pgmpy/estimators/base.py`:

```python
parents_states = [self.state_names[parent] for parent in parents]
state_count_data = data.groupby([variable] + parents).size().unstack(parents)
row_index = self.state_names[variable]
column_index = pd.MultiIndex.from_product(parents_states, names=parents)
state_counts = state_count_data.reindex(index=row_index, columns=column_index).fillna(0)
```

In [11]:
# Reproduce the state-count computation by hand for a variable with a single
# parent (E, whose only parent is B).
parents = ['B']
variable = 'E'

# States of the variable (rows of the table) and of its parent (columns).
row_index = [0, 1]
# Even with one parent, from_product builds a (one-level) MultiIndex here.
column_index = pd.MultiIndex.from_product([[0, 1]], names=parents)
# Count the rows for every (E, B) combination and move B into the columns.
state_count_data = train_data.groupby([variable] + parents).size().unstack(parents)
# The reindexed table comes back as all zeros; presumably the unstacked columns
# do not line up with the one-level MultiIndex, so every cell is NaN before
# fillna(0) - TODO confirm.
state_count_data.reindex(index=row_index, columns=column_index).fillna(0)

B,0,1
E,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,0.0
1,0.0,0.0


In [12]:
# Inspect the column index that was built for the single-parent case.
column_index

MultiIndex(levels=[[0, 1]],
           labels=[[0, 1]],
           names=['B'])

In [13]:
# Repeat the same hand-written computation for a variable with multiple
# parents (B, whose parents are A and C).
parents = ['A', 'C']
variable = 'B'

# States of the variable (rows) and every combination of parent states (columns).
row_index = [0, 1]
column_index = pd.MultiIndex.from_product([[0, 1], [0, 1]], names=parents)
# Count the rows for every (B, A, C) combination and move A and C into the columns.
state_count_data = train_data.groupby([variable] + parents).size().unstack(parents)
# With two parents the reindex keeps the counts (non-zero values are returned).
state_count_data.reindex(index=row_index, columns=column_index).fillna(0)

A,0,0,1,1
C,0,1,0,1
B,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,101,111,97,93
1,97,105,98,98


In [14]:
# Inspect the column index that was built for the multi-parent case.
column_index

MultiIndex(levels=[[0, 1], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['A', 'C'])