In [328]:
%%capture
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import sys
sys.path.append('/home/wrwt/Programming/pygraphmodels')
import graphmodels as gm
%matplotlib inline
%load_ext line_profiler

In [14]:
x = np.random.randint(0, 2, size=10000)
y = np.random.randint(0, 2, size=10000)

In [15]:
df = pd.DataFrame(data={'x': x, 'y': y})

In [16]:
gm.information.discrete_entropy(df[['x']])

0.69312540040190695

In [17]:
gm.information.discrete_entropy(df[['y']])

0.69313918053864376

In [18]:
gm.information.discrete_entropy(df[['x', 'y']])

1.3862600011119746

In [19]:
2 * -0.5 * np.log(0.5)

0.69314718055994529

In [20]:
4 * -0.5 * np.log(0.5)

1.3862943611198906

In [21]:
from os import listdir
import os.path
NETWORKS_PATH = '/home/wrwt/Programming/pygraphmodels/networks/'
network_filenames = listdir(NETWORKS_PATH)
true_dgm = gm.DGM.read(os.path.join(NETWORKS_PATH, 'earthquake.bif'))
true_dgm.draw()

In [22]:
data = true_dgm.rvs(size=100000)

In [23]:
gm.information.discrete_entropy(data[['Earthquake', 'Burglary', 'Alarm', 'MaryCalls']])

0.23697192279781021

In [24]:
gm.information.discrete_entropy(data[['Earthquake', 'Burglary']])

0.15324978062162983

In [25]:
gm.information.discrete_entropy(data[['Earthquake']]) + gm.information.discrete_entropy(data[['Burglary']])

0.15326004726954612

In [261]:
from itertools import repeat
class SubsetGraph(nx.DiGraph):
    def __init__(self):
        nx.DiGraph.__init__(self)
        
    def add_subset(self, subs, f=None):
        subs = frozenset(subs)
        if subs in self.nodes():
            return 
        self.add_node(subs, f=f)
        for node in list(self.nodes()):
            if node > subs and all(not child > subs for child in self.successors(node)):
                self.add_edge(node, subs)
        
        for pa, ch in list(self.edges()):
            if subs < pa and ch < subs:
                self.remove_edge(pa, ch)
                self.add_edge(subs, ch)
        for ch in list(self.nodes()):
            if ch < subs and all(not parent <= subs for parent in self.predecessors(ch)):
                self.add_edge(subs, ch)
                
    def get_parents(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.predecessors(subs)
        results = []
        for node in list(self.nodes()):
            if node > subs and all(not child > subs for child in self.successors(node)):
                results.append(node)
        return results
    
    def get_children(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.successors(subs)
        results = set()
        for pa, ch in list(self.edges()):
            if subs < pa and ch < subs:
                results.add(ch)
        for ch in list(self.nodes()):
            if ch < subs and all(not parent <= subs for parent in self.predecessors(ch)):
                results.add(ch)
        return results
    
    def upper_bound(self, subs):
        #print 'calculating upper_bound of', subs
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = []
        for ch in self.get_children(subs):
            estimates.append(self.node[ch]['f'] + self.upper_bound(subs - ch))
        return min(np.min(estimates), np.min(self.node[pa]['f'] for pa in self.get_parents(subs)))
    
    def lower_bound1(self, subs):
        #print 'calculating lower_bound of', subs
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = [0]
        for ch in self.get_children(subs):
            estimates.append(max(self.node[ch]['f'], self.upper_bound(subs - ch)))
        return np.max(estimates)
    
    def lower_bound2(self, subs):
        #print 'calculating lower_bound of', subs
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = [0]
        for pa in self.get_parents(subs):
            estimates.append(self.node[pa]['f'] - self.upper_bound(pa - subs))
        return np.max(estimates)
    
    def lower_bound(self, subs):
        return max(self.lower_bound1(subs), self.lower_bound2(subs))

In [None]:
from itertools import repeat
class OptimizedSubsetGraph(nx.DiGraph):
    def __init__(self):
        nx.DiGraph.__init__(self)
        
    def add_subset(self, subs, f=None):
        subs = frozenset(subs)
        if subs in self.nodes():
            return 
        self.add_node(subs, f=f)
        for node in list(self.nodes()):
            if node > subs and all(not child > subs for child in self.successors(node)):
                self.add_edge(node, subs)
        
        for pa, ch in list(self.edges()):
            if subs < pa and ch < subs:
                self.remove_edge(pa, ch)
                self.add_edge(subs, ch)
        for ch in list(self.nodes()):
            if ch < subs and all(not parent <= subs for parent in self.predecessors(ch)):
                self.add_edge(subs, ch)
                
    def get_parents(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.predecessors(subs)
        results = []
        for node in list(self.nodes()):
            if node > subs and all(not child > subs for child in self.successors(node)):
                results.append(node)
        return results
    
    def get_children(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.successors(subs)
        results = set()
        for pa, ch in list(self.edges()):
            if subs < pa and ch < subs:
                results.add(ch)
        for ch in list(self.nodes()):
            if ch < subs and all(not parent <= subs for parent in self.predecessors(ch)):
                results.add(ch)
        return results
    
    def upper_bound(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = []
        for ch in self.get_children(subs):
            estimates.append(self.node[ch]['f'] + self.upper_bound(subs - ch))
        return min(np.min(estimates), np.min(self.node[pa]['f'] for pa in self.get_parents(subs)))
    
    def lower_bound1(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = [0]
        for ch in self.get_children(subs):
            estimates.append(max(self.node[ch]['f'], self.upper_bound(subs - ch)))
        return np.max(estimates)
    
    def lower_bound2(self, subs):
        subs = frozenset(subs)
        if subs in self.nodes():
            return self.node[subs]['f']
        estimates = [0]
        for pa in self.get_parents(subs):
            estimates.append(self.node[pa]['f'] - self.upper_bound(pa - subs))
        return np.max(estimates)
    
    def lower_bound(self, subs):
        return max(self.lower_bound1(subs), self.lower_bound2(subs))

In [262]:
sg = SubsetGraph()
sg.add_subset([1], 0.5)
sg.add_subset([3, 4], 1.1)
sg.add_subset([1, 2], 0.7)
sg.add_subset([2], 0.5)
sg.add_subset([3], 0.3)
sg.add_subset([4], 1.0)
sg.add_subset([1, 2, 3], 0.9)
sg.add_subset([1, 2, 3, 4], 1.9)
sg.lower_bound([1, 3, 4]), sg.upper_bound([1, 3, 4])

(1.3999999999999999, 1.6000000000000001)

In [263]:
sg = SubsetGraph()
sg.add_subset([1], 0.3)
sg.add_subset([2], 0.4)
sg.add_subset([3], 0.5)
sg.add_subset([4], 0.6)
sg.add_subset([1, 2], 0.55)
sg.add_subset([2, 3], 0.7)
sg.add_subset([3, 4], 1.0)
sg.lower_bound([1, 4]), sg.upper_bound([1, 4])

(0.59999999999999998, 0.89999999999999991)

In [264]:
data = true_dgm.rvs(10000)

In [274]:
sg = SubsetGraph()
ee = gm.EntropyEstimator(gm.MatrixGraph.from_networkx_DiGraph(true_dgm, order=data.columns), data)

In [275]:
def add_subs(*subs):
    subs = frozenset(subs)
    f = ee([name in subs for name in data.columns])
    sg.add_subset(subs, f)

In [276]:
def entr(*subs):
    return ee([name in subs for name in data.columns])

In [277]:
def bounds(*subs):
    return sg.lower_bound(frozenset(subs)), sg.upper_bound(frozenset(subs))

In [278]:
for var in data.columns:
    add_subs(var)

In [279]:
add_subs('Burglary', 'Earthquake', 'MaryCalls', 'JohnCalls')

In [280]:
%time entr('Burglary', 'MaryCalls')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 90.8 µs


0

In [38]:
bounds('Alarm', 'Burglary')

(0.080381242387159144, 0.13267347311205552)

In [39]:
bounds('Alarm', 'Earthquake')

(0.093726677279073661, 0.17410791966623279)

In [40]:
bounds('Alarm', 'Earthquake', 'MaryCalls')[1] - bounds('Alarm', 'Earthquake')[0]

0.18305450784829472

In [41]:
bounds('Alarm', 'Earthquake', 'MaryCalls')[0] - bounds('Alarm', 'Earthquake')[1]

0.022292023073976486

In [222]:
import numpy as np
from graphmodels.information import discrete_entropy


class EntropyEstimator:
    def __init__(self, graph, data):
        self.graph = graph
        self.data = data
        self.cache = {}

    def _footprint(self, nodes):
        return tuple(nodes)

    def __call__(self, nodes):
        fp = self._footprint(nodes)
        if fp in self.cache:
            return self.cache[fp]

        if np.any(nodes):
            subdata = self.data[self.data.columns[nodes]]
            result = discrete_entropy(subdata)
        else:
            result = 0

        self.cache[fp] = result
        return result


class InformationEstimator:
    def __init__(self, graph, data):
        self.graph = graph
        self.entropy_estimator = EntropyEstimator(graph, data)
        self.chosen = []

    def __call__(self, node, parents):
        parents = np.array(parents, dtype=bool)
        h1 = self.entropy_estimator(parents)
        temp = np.zeros(len(parents), dtype=bool)
        temp[node] = True
        h2 = self.entropy_estimator(temp)
        parents[node] = True
        h12 = self.entropy_estimator(parents)
        return h1 + h2 - h12


class ScoreBIC:
    def __init__(self, graph, data):
        self.graph = graph
        self.data = data
        self.n_values = np.asarray([len(self.data[column].value_counts()) for column in self.data.columns])
        self.mi_estimator = InformationEstimator(graph, data)

    def __call__(self, node, parents):
        parents = np.asarray(parents, dtype=bool)
        k = self.n_values[node]*np.prod(self.n_values[parents]) - 1
        n = self.data.shape[0]
        l = n*self.mi_estimator(node, parents)

        result = l - 0.5 * np.log(n) * k
        return result

    def total(self):
        score = 0.
        for node in self.graph.nodes():
            pa = self.graph.adj[:, node].copy()
            score += self(node, pa)
        return score

In [282]:
class AdvancedEntropyEstimator:
    def __init__(self, graph, data):
        self.graph = graph
        self.data = data
        self.sg = SubsetGraph()
        self._initialize_sg()
        self.cache = {}

    def _initialize_sg(self):
        vec = np.asarray([False] * len(self.data.columns), dtype=bool)
        for i, column in enumerate(self.data.columns):
            vec[i] = True
            self.__call__(vec)
            vec[i] = False
        
    def _key(self, subs):
        return frozenset([name for name, included in zip(data.columns, subs) if included])
    
    def _add_subs(self, subs, f):
        self.sg.add_subset(subs, f)
    
    def __call__(self, nodes):
        if not np.any(nodes):
            return 0
        key = self._key(nodes)
        if key in self.sg.nodes():
            return self.sg.node[key]['f']
        subdata = self.data[self.data.columns[nodes]]
        result = discrete_entropy(subdata)
        self._add_subs(key, f=result)
        return result
    
    def lower_bound(self, nodes):
        if not np.any(nodes):
            return 0
        key = self._key(nodes)
        return self.sg.lower_bound(key)
    
    def upper_bound(self, nodes):
        if not np.any(nodes):
            return 0
        key = self._key(nodes)
        return self.sg.upper_bound(key)
    
class AdvancedInformationEstimator:
    def __init__(self, graph, data):
        self.graph = graph
        self.entropy_estimator = AdvancedEntropyEstimator(graph, data)
        self.chosen = []

    def __call__(self, node, parents):
        parents = np.array(parents, dtype=bool)
        h1 = self.entropy_estimator(parents)
        temp = np.zeros(len(parents), dtype=bool)
        temp[node] = True
        h2 = self.entropy_estimator(temp)
        parents[node] = True
        h12 = self.entropy_estimator(parents)
        return h1 + h2 - h12
    
    def lower_bound(self, node, parents):
        parents = np.array(parents, dtype=bool)
        h1 = self.entropy_estimator.lower_bound(parents)
        temp = np.zeros(len(parents), dtype=bool)
        temp[node] = True
        h2 = self.entropy_estimator.lower_bound(temp)
        parents[node] = True
        h12 = self.entropy_estimator.upper_bound(parents)
        return h1 + h2 - h12
        
    def upper_bound(self, node, parents):
        parents = np.array(parents, dtype=bool)
        h1 = self.entropy_estimator.upper_bound(parents)
        temp = np.zeros(len(parents), dtype=bool)
        temp[node] = True
        h2 = self.entropy_estimator.upper_bound(temp)
        parents[node] = True
        h12 = self.entropy_estimator.lower_bound(parents)
        return h1 + h2 - h12
    
class AdvancedScoreBIC:
    def __init__(self, graph, data):
        self.graph = graph
        self.data = data
        self.n_values = np.asarray([len(self.data[column].value_counts()) for column in self.data.columns])
        self.mi_estimator = AdvancedInformationEstimator(graph, data)

    def __call__(self, node, parents, option=None):
        parents = np.asarray(parents, dtype=bool)
        k = self.n_values[node]*np.prod(self.n_values[parents]) - 1
        n = self.data.shape[0]
        
        if option == 'lower_bound':
            l = n*self.mi_estimator.lower_bound(node, parents)
        elif option == 'upper_bound':
            l = n*self.mi_estimator.upper_bound(node, parents)
        else:
            l = n*self.mi_estimator(node, parents)

        result = l - 0.5 * np.log(n) * k
        return result
    
    def lower_bound(self, node, parents):
        parents = np.asarray(parents, dtype=bool)
        k = self.n_values[node]*np.prod(self.n_values[parents]) - 1
        n = self.data.shape[0]
        l = n*self.mi_estimator.lower_bound(node, parents)

        result = l - 0.5 * np.log(n) * k
        return result

    def upper_bound(self, node, parents):
        parents = np.asarray(parents, dtype=bool)
        k = self.n_values[node]*np.prod(self.n_values[parents]) - 1
        n = self.data.shape[0]
        l = n*self.mi_estimator.upper_bound(node, parents)

        result = l - 0.5 * np.log(n) * k
        return result
    
    def total(self):
        score = 0.
        for node in self.graph.nodes():
            pa = self.graph.adj[:, node].copy()
            score += self(node, pa)
        return score

In [415]:
from graphmodels import AddEdge, RemoveEdge, ReverseEdge, InvalidOperation
class AdvancedGreedySearch:
    def __init__(self, data, cls_score):
        graph = nx.DiGraph()
        graph.add_nodes_from(data.columns)
        graph = MatrixGraph.from_networkx_DiGraph(graph, order=data.columns)
        self.graph = graph
        self.fscore = cls_score(graph, data)

        self.ops = []
        self.ops += [AddEdge(graph, self.fscore, u, v) for u, v in permutations(graph.nodes(), 2)]
        self.ops += [RemoveEdge(graph, self.fscore, u, v) for u, v in permutations(graph.nodes(), 2)]
        self.ops += [ReverseEdge(graph, self.fscore, u, v) for u, v in permutations(graph.nodes(), 2)]
        
        self.ops = [[op, op.score(option='upper_bound')] for op in self.ops]

    def iteration(self):
        #ops.sort(reverse=True, key=lambda x: x[1] + x[2])
        
        best_op = None
        best_score = 0.
        
        skipped = 0
        
        for i in range(len(self.ops)):
            op = self.ops[i][0]
            ub = self.ops[i][1]
            if ub < best_score:
                skipped += 1
                continue
                
            self.ops[i][1] = self.ops[i][0].score(option='upper_bound')
            ub = self.ops[i][1]
            if ub < best_score:
                skipped += 1
                continue
                
            try:
                current_score = op.score()
                op.do()
                if current_score > best_score:
                    best_score = current_score
                    best_op = op
                op.undo()
            except InvalidOperation:
                pass
        
        print 'done {}, skippped {} operations out of {}'.format(len(self.ops) - skipped, skipped, len(self.ops))
        if best_op is None or best_op.score() < 1e-5:
            return True
        best_op.do()
        return False

    def __call__(self, max_iter=40, verbose=True):
        counter = 0
        while not self.iteration() and counter < max_iter:
            if verbose:
                print(self.fscore.total())
            counter += 1
        return DGM(self.graph.to_networkx_DiGraph())

In [408]:
from os import listdir
import os.path
NETWORKS_PATH = '/home/wrwt/Programming/pygraphmodels/networks/'
network_filenames = listdir(NETWORKS_PATH)
true_dgm = gm.DGM.read(os.path.join(NETWORKS_PATH, 'alarm.bif'))
true_dgm.draw()

In [317]:
data = true_dgm.rvs(size=100000)

In [416]:
gs = AdvancedGreedySearch(data, AdvancedScoreBIC)

In [404]:
gs = GreedySearch(data, ScoreBIC)

In [417]:
%%time
gs(max_iter=100).draw()

done 313, skippped 3683 operations out of 3996
60776.6380977
done 72, skippped 3924 operations out of 3996
113918.386614
done 21, skippped 3975 operations out of 3996
166474.148386
done 12, skippped 3984 operations out of 3996
218082.718739
done 10, skippped 3986 operations out of 3996
264859.970208
done 10, skippped 3986 operations out of 3996
310426.578579
done 10, skippped 3986 operations out of 3996
355724.381372
done 48, skippped 3948 operations out of 3996
398924.323837
done 25, skippped 3971 operations out of 3996
439290.854954
done 18, skippped 3978 operations out of 3996
475598.26081
done 18, skippped 3978 operations out of 3996
511249.206984
done 81, skippped 3915 operations out of 3996
542823.856262
done 24, skippped 3972 operations out of 3996
574148.82232
done 22, skippped 3974 operations out of 3996
605166.613028
done 37, skippped 3959 operations out of 3996
633659.280395
done 29, skippped 3967 operations out of 3996
660414.271246
done 26, skippped 3970 operations out of 

In [390]:
%%time
gs(max_iter=100).draw()

60776.6380977
113918.386614
166474.148386
218082.718739
264859.970208
310426.578579
355724.381372
398924.323837
439290.854954
475598.26081
511249.206984
542823.856262
574148.82232
605748.588862
636766.37957
665259.046937
692014.037788
717530.84318
742478.585236
767131.323657
784526.03458
804399.595879
820983.329142
835972.328708
850524.071329
864869.473321
879109.131425
892331.073081
903970.831671
914952.35843
925589.782052
934839.287229
943025.560886
950305.039069
956570.55748
961964.67301
966315.537503
970388.891895
974381.922016
978086.940311
981349.882539
984339.722174
987180.902588
989925.039274
992074.412254
994174.391025
995958.946893
997466.867905
998588.128167
999652.352698
999842.96128
1000017.49049
1000146.78376
1000238.88716
1000338.22511
1000382.60057
CPU times: user 2min 30s, sys: 80 ms, total: 2min 30s
Wall time: 2min 30s


In [184]:
gs.fscore.mi_estimator.entropy_estimator.sg.nodes(data=True)

[(frozenset({'ARTCO2', 'CO'}), {'f': 1.5763543036515359}),
 (frozenset({'DISCONNECT', 'LVEDVOLUME', 'PCWP'}), {'f': 1.3364205432453751}),
 (frozenset({'HRSAT'}), {'f': 0.75335949022306459}),
 (frozenset({'EXPCO2', 'PCWP'}), {'f': 1.3793444224493232}),
 (frozenset({'LVFAILURE'}), {'f': 0.19907430985233129}),
 (frozenset({'ARTCO2', 'HRSAT'}), {'f': 1.4238568477216866}),
 (frozenset({'BP', 'PULMEMBOLUS'}), {'f': 1.1150631048365758}),
 (frozenset({'CO'}), {'f': 0.89874580841592544}),
 (frozenset({'HR', 'MINVOL', 'VENTALV'}), {'f': 1.7113281283301314}),
 (frozenset({'HISTORY', 'VENTALV', 'VENTLUNG'}), {'f': 1.3347501946108049}),
 (frozenset({'PRESS', 'PVSAT', 'VENTALV'}), {'f': 2.0824744996641069}),
 (frozenset({'DISCONNECT', 'HREKG', 'HRSAT'}), {'f': 1.3239594044904286}),
 (frozenset({'MINVOLSET', 'VENTALV'}), {'f': 1.2152058604863021}),
 (frozenset({'MINVOL', 'PAP', 'VENTALV'}), {'f': 1.6200649656904429}),
 (frozenset({'ANAPHYLAXIS', 'INSUFFANESTH'}), {'f': 0.38276954472320385}),
 (frozen

In [227]:
data.columns

Index([u'ANAPHYLAXIS', u'DISCONNECT', u'TPR', u'INSUFFANESTH', u'LVFAILURE',
       u'HISTORY', u'HYPOVOLEMIA', u'STROKEVOLUME', u'INTUBATION',
       u'LVEDVOLUME', u'PCWP', u'FIO2', u'ERRLOWOUTPUT', u'MINVOLSET',
       u'VENTMACH', u'VENTTUBE', u'PULMEMBOLUS', u'SHUNT', u'ERRCAUTER',
       u'KINKEDTUBE', u'PRESS', u'CVP', u'VENTLUNG', u'MINVOL', u'VENTALV',
       u'ARTCO2', u'PVSAT', u'SAO2', u'CATECHOL', u'HR', u'HRBP', u'CO', u'BP',
       u'HRSAT', u'HREKG', u'EXPCO2', u'PAP'],
      dtype='object')

In [358]:
vec = [False] * len(data.columns)
#vec[list(data.columns).index('CVP')] = True
vec[list(data.columns).index('LVEDVOLUME')] = True
vec[8] = True
vec[9] = True
vec[11] = True
vec[20] = True

In [359]:
%%time
gs.fscore.lower_bound(list(data.columns).index('CVP'), vec)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 9.14 ms


-6428.9801372677375

In [360]:
gs.fscore.upper_bound(list(data.columns).index('CVP'), vec)

94625.472491508262

In [361]:
%%time
gs.fscore(list(data.columns).index('CVP'), vec)

CPU times: user 152 ms, sys: 0 ns, total: 152 ms
Wall time: 150 ms


44146.487426811793

In [364]:
len(gs.fscore.mi_estimator.entropy_estimator.sg.nodes())

1631