In [2]:
%%capture
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
%%capture
import sys
sys.path.append('/home/wrwt/Programming/pygraphmodels')
import graphmodels as gm

In [4]:
import warnings
# Suppress every warning whose originating module name matches 'matplotlib'
# or 'IPython', to keep the notebook output readable.
warnings.filterwarnings('ignore', module='matplotlib')
warnings.filterwarnings('ignore', module='IPython')

In [5]:
%%capture
import theano
import theano.tensor as T
from theano.tensor import nnet

### Simple neural network implementation

In [24]:
# Layer sizes of the single-hidden-layer network.
n_in = 2
n_hid = 2
n_out = 1

# Symbolic parameters: W0/b0 feed the hidden layer, W1/b1 produce the scalar output.
W0 = T.dmatrix('W0')
b0 = T.dvector('b0')
W1 = T.dvector('W1')
b1 = T.dscalar('b1')

# Symbolic data: X holds one sample per row, y holds one target per sample.
X = T.dmatrix('X')
y = T.dvector('y')

# Forward pass: sigmoid hidden layer followed by a linear output unit.
hidden = nnet.sigmoid(X.dot(W0) + b0)
out = hidden.dot(W1) + b1
# lambda_ is only referenced by the disabled regularised loss on the next line.
# NOTE(review): that disabled expression penalises W0 twice and never W1.
lambda_ = 1e-5
#loss = T.mean((out - y) ** 2) + lambda_ * (T.sum(W0 ** 2) + T.sum(W0 ** 2))
# Active loss: plain sum of squared errors, no weight decay.
loss = T.sum((out - y) ** 2)
# Compiled loss. Note the argument order (X, y, W0, W1, b0, b1); the
# `loss_deriv` function compiled later in this cell takes b0 before W1.
loss_f = theano.function(inputs=[X, y, W0, W1, b0, b1], outputs=loss)

def get_loss(theta, X, y):
    """Evaluate the sum-of-squares loss for a flat parameter vector.

    `theta` is laid out as [W0 (n_in*n_hid), W1 (n_hid*n_out), b0 (n_hid), b1],
    the same layout used by `get_loss_deriv` and `get_predict`.

    Parameters
    ----------
    theta : 1-D numpy array holding all network parameters.
    X : sample matrix passed straight to the compiled `loss_f`.
    y : target vector passed straight to the compiled `loss_f`.

    Returns
    -------
    The value produced by `loss_f` (a scalar loss).
    """
    # Offsets of the consecutive parameter groups inside `theta`.
    w0_end = n_in * n_hid
    w1_end = w0_end + n_hid * n_out
    b0_end = w1_end + n_hid

    W0 = theta[:w0_end].reshape((n_in, n_hid))
    W1 = theta[w0_end:w1_end]
    b0 = theta[w1_end:b0_end]
    b1 = theta[b0_end]
    # The loss is a scalar, so it is returned as-is instead of being indexed
    # with [0] (which fails on a 0-d result) and merely printed.
    return loss_f(X, y, W0, W1, b0, b1)

# Gradient of the loss with respect to every parameter, flattened and
# concatenated in the same order as the flat `theta` vector: W0, W1, b0, b1.
# b1 is a scalar, so its gradient is wrapped in a list to make it 1-D.
deriv = T.concatenate([T.grad(loss, wrt=W0).flatten(),
                       T.grad(loss, wrt=W1).flatten(),
                       T.grad(loss, wrt=b0).flatten(),
                       [T.grad(loss, wrt=b1)]])

# Compiled forward pass and compiled (loss, gradient) pair.
# Both take the parameters in the order W0, b0, W1, b1.
predict = theano.function(inputs=[X, W0, b0, W1, b1], outputs=out, name='predict')
loss_deriv = theano.function(inputs=[X, y, W0, b0, W1, b1], outputs=[loss, deriv], name='loss_deriv')

def get_loss_deriv(theta, X, y):
    """Return the loss and its gradient for a flat parameter vector.

    `theta` is laid out as [W0 (n_in*n_hid), W1 (n_hid*n_out), b0 (n_hid), b1].
    The result is whatever the compiled `loss_deriv` returns: a two-element
    sequence (loss, gradient), which is the form `scipy.optimize.minimize`
    expects when called with ``jac=True``.
    """
    # Offsets of the consecutive parameter groups inside `theta`.
    w0_end = n_in * n_hid
    w1_end = w0_end + n_hid * n_out
    b0_end = w1_end + n_hid

    W0 = theta[:w0_end].reshape((n_in, n_hid))
    W1 = theta[w0_end:w1_end]
    b0 = theta[w1_end:b0_end]
    b1 = theta[b0_end]
    # `loss_deriv` expects b0 before W1.
    return loss_deriv(X, y, W0, b0, W1, b1)

def initialize(size):
    """Return a 1-D array of `size` values drawn uniformly from [0, 1)."""
    shape = (size,)
    return np.random.random_sample(shape)

def learn(X, y):
    """Fit the network to (X, y) with L-BFGS-B and return the optimal flat parameters.

    The optimisation starts from a random point, prints the full optimiser
    result and returns its `x` entry.
    """
    n_params = n_in * n_hid + n_hid * n_out + n_hid + n_out
    start = initialize(n_params)

    from scipy.optimize import minimize

    def objective(params):
        # Returns (loss, gradient), as required by jac=True.
        return get_loss_deriv(params, X, y)

    res = minimize(objective, start, jac=True, method='L-BFGS-B')
    print(res)
    return res['x']

def get_predict(theta, X):
    """Run the compiled forward pass on X using the flat parameter vector `theta`.

    `theta` is laid out as [W0 (n_in*n_hid), W1 (n_hid*n_out), b0 (n_hid), b1].
    """
    # Boundaries between the consecutive parameter groups.
    first_cut = n_in * n_hid
    second_cut = first_cut + n_hid * n_out
    third_cut = second_cut + n_hid

    hidden_weights = theta[:first_cut].reshape((n_in, n_hid))
    output_weights = theta[first_cut:second_cut]
    hidden_bias = theta[second_cut:third_cut]
    output_bias = theta[third_cut]
    return predict(X, hidden_weights, hidden_bias, output_weights, output_bias)

In [235]:
class Neurofunction:
    """Single-hidden-layer regression network built on Theano.

    An `n_in`-dimensional input is passed through `n_hid` sigmoid units and a
    linear output unit.  All parameters are kept in one flat vector `theta`
    laid out as ``[W0 (n_in*n_hid), W1 (n_hid), b0 (n_hid), b1]`` so that they
    can be optimised with ``scipy.optimize.minimize``.

    Typical use: ``nf = Neurofunction(2, 4); nf.fit(X, y); nf(X_new)``.
    """

    def __init__(self, n_in, n_hid, lambda_=1e-4):
        """Build the symbolic graph and compile the Theano functions.

        Parameters
        ----------
        n_in : number of input features.
        n_hid : number of hidden sigmoid units.
        lambda_ : weight of the L2 penalty on W0 and W1 in the training loss.
        """
        self.n_in = n_in
        self.n_hid = n_hid
        # n_in*n_hid (W0) + n_hid (W1) + n_hid (b0) + 1 (b1)
        self.n_params = (self.n_in + 2) * self.n_hid + 1

        self.W0 = T.dmatrix('W0')
        self.b0 = T.dvector('b0')
        self.W1 = T.dvector('W1')
        self.b1 = T.dscalar('b1')

        # X is a batch (one sample per row); x is a single input vector used
        # when optimising over the inputs in `maximize`.
        self.X = T.dmatrix('X')
        self.x = T.dvector('x')
        self.y = T.dvector('y')

        self.hidden = nnet.sigmoid(self.X.dot(self.W0) + self.b0)
        self.xhidden = nnet.sigmoid(self.x.dot(self.W0) + self.b0)
        self.out = self.hidden.dot(self.W1) + self.b1
        self.xout = self.xhidden.dot(self.W1) + self.b1

        # Mean squared error plus L2 weight decay on both weight groups.
        self.loss = (T.mean((self.out - self.y) ** 2)
                     + lambda_ * (T.sum(self.W0 ** 2) + T.sum(self.W1 ** 2)))

        # Gradient w.r.t. the parameters, flattened in the same order as `theta`.
        self.theta_grad = T.concatenate([
            T.grad(self.loss, wrt=self.W0).flatten(),
            T.grad(self.loss, wrt=self.W1).flatten(),
            T.grad(self.loss, wrt=self.b0).flatten(),
            [T.grad(self.loss, wrt=self.b1)],
        ])
        self.loss_theta_grad_f = theano.function(
            inputs=[self.X, self.y, self.W0, self.W1, self.b0, self.b1],
            outputs=[self.loss, self.theta_grad])
        self.predict_f = theano.function(
            inputs=[self.X, self.W0, self.W1, self.b0, self.b1],
            outputs=self.out, name='predict')

        # Negated output and its gradient w.r.t. the input: minimising this
        # objective maximises the network output.
        self.x_grad = T.grad(-self.xout, wrt=self.x)
        self.loss_x_grad_f = theano.function(
            inputs=[self.x, self.W0, self.W1, self.b0, self.b1],
            outputs=[-self.xout, self.x_grad])

    def _parse_params(self, theta):
        """Split the flat vector `theta` into the tuple (W0, W1, b0, b1)."""
        n_in = self.n_in
        n_hid = self.n_hid
        w0_end = n_in * n_hid
        w1_end = w0_end + n_hid
        b0_end = w1_end + n_hid
        W0 = theta[:w0_end].reshape((n_in, n_hid))
        W1 = theta[w0_end:w1_end]
        b0 = theta[w1_end:b0_end]
        b1 = theta[b0_end]
        return W0, W1, b0, b1

    def _fitted_params(self):
        """Return the parsed parameters of a fitted model, or fail with a clear message."""
        theta = getattr(self, 'theta', None)
        if theta is None:
            # AttributeError is kept so the exception type matches what an
            # unfitted instance raised before; only the message is clearer.
            raise AttributeError("Neurofunction has no fitted parameters; call fit() first")
        return self._parse_params(theta)

    def fit(self, X, y):
        """Fit the parameters to (X, y) with L-BFGS-B from a random start.

        The optimum is stored in `self.theta`; nothing is returned.
        """
        # Imported here so the method does not rely on `scipy.optimize`
        # having been loaded as a side effect of some other import.
        from scipy.optimize import minimize

        theta0 = np.random.rand(self.n_params)

        def loss_grad(theta):
            # Returns [loss, gradient], the form expected with jac=True.
            return self.loss_theta_grad_f(X, y, *self._parse_params(theta))

        res = minimize(loss_grad, theta0, method='L-BFGS-B', jac=True)
        self.theta = res['x']

    def __call__(self, X):
        """Predict the output for one input vector or a batch of input rows."""
        params = self._fitted_params()
        return self.predict_f(np.atleast_2d(X), *params)

    def maximize(self, assignment=None):
        """Maximise the network output over the unassigned input coordinates.

        Parameters
        ----------
        assignment : sequence of length `n_in`, or None.  An entry that is
            None marks a free coordinate; any other entry fixes that
            coordinate to the given value.  None (the default) leaves every
            coordinate free.

        Returns
        -------
        1-D array with the optimised values of the free coordinates, in
        increasing index order.  Each free coordinate is constrained to
        [0, 1].  The array is empty when every coordinate is assigned.

        Raises
        ------
        ValueError if `assignment` does not have `n_in` entries.
        AttributeError if the model has not been fitted.
        """
        from scipy.optimize import minimize

        if assignment is None:
            assignment = [None] * self.n_in
        if len(assignment) != self.n_in:
            raise ValueError("assignment must have %d entries, got %d"
                             % (self.n_in, len(assignment)))

        free_idx = [i for i, a in enumerate(assignment) if a is None]
        if not free_idx:
            # Nothing left to optimise.
            return np.empty(0)

        # The parameters do not change during the search, so parse them once.
        params = self._fitted_params()
        # Full input vector with the fixed coordinates filled in; the free
        # slots are overwritten on every evaluation.
        template = np.array([0.0 if a is None else a for a in assignment], dtype=float)
        x0 = np.random.rand(len(free_idx))

        def loss_grad(x_free):
            x_full = template.copy()
            x_full[free_idx] = x_free
            loss, grad = self.loss_x_grad_f(x_full, *params)
            # Only the gradient entries of the free coordinates are relevant.
            return loss, grad[free_idx]

        res = minimize(loss_grad, x0, method='L-BFGS-B', jac=True,
                       bounds=[(0, 1)] * len(free_idx))
        return res['x']

In [238]:
# XOR truth table used as a toy regression problem: the target is 1 when
# exactly one of the two inputs is 1.
inputs = np.array([[0,1],[1,0],[1,1],[0,0]]).reshape(4,2) #training data X
exp_y = np.array([1, 1, 0, 0]) #training data Y
# Two inputs, four hidden units, default regularisation strength.
nf = Neurofunction(2, 4)
# Fitting starts from a random point and no seed is set, so results vary between runs.
nf.fit(inputs, exp_y)

In [237]:
#Training done! Let's test it out
nf([[0, 0], [0, 1], [1, 0], [1, 1]])

array([ 1.7838094 ,  0.99906714,  0.99911104,  0.0018411 ])

In [233]:
nf.maximize(assignment=[None, 0.])

array([ 0.])

In [234]:
nf([0, 0.32])

array([ 1.56099748])

### Using the neural network on a directed graphical model (DGM)

In [195]:
def generate_subset(target, factor, kmin=1, kmax=4, size=1, dataset=None):
    """
    Generate `size` random subsets of candidate parents for node `target`
    and score each subset by its discrete mutual information with the target.

    Parameters
    ----------
    target : name of the node whose parents are sampled; must be one of
        `factor.arguments`.
    factor : object exposing `.arguments`; every argument except `target`
        is a candidate parent.
    kmin, kmax : inclusive bounds on the subset size.  Both are clipped to the
        number of candidate parents, because subsets are drawn without
        replacement.  `kmin=0` is allowed; an empty subset gets score 0.
    size : number of subsets to generate.
    dataset : samples to score against, indexed by a list of column names.
        When None, the notebook-level variable `data` is used.

    Returns
    -------
    (indicators, scores) : `indicators` has one row per subset and one column
        per candidate parent, holding 1.0 where the parent is in the subset
        and 0.0 otherwise; `scores` holds the matching mutual-information values.
    """
    arguments = list(factor.arguments)
    arguments.remove(target)
    arguments = np.asarray(arguments)

    # A subset cannot contain more parents than there are candidates;
    # without clipping, np.random.choice(replace=False) would raise.
    n_candidates = len(arguments)
    kmax = min(kmax, n_candidates)
    kmin = min(kmin, n_candidates)

    n = np.random.randint(low=kmin, high=kmax + 1, size=size)
    result = []
    score = []
    for cn in n:
        current = np.random.choice(arguments, size=cn, replace=False)
        chosen = set(current)
        result.append([1.0 if arg in chosen else 0.0 for arg in arguments])
        if cn == 0:
            score.append(0)
        else:
            # Resolve the fallback lazily so the global `data` is only
            # required when no dataset was passed and a score is needed.
            samples = data if dataset is None else dataset
            score.append(gm.information.discrete_mutual_information(samples[[target]],
                                                                    samples[current]))
    return np.vstack(result), np.hstack(score)

In [196]:
# Build a three-node directed model with edges a -> a_xor_b and b -> a_xor_b.
arguments = ['a', 'b', 'a_xor_b']
dgm = gm.DGM()
dgm.add_nodes_from(arguments)
dgm.add_edges_from([('a', 'a_xor_b'), ('b', 'a_xor_b')])
# Every factor is created with the full variable list plus the names it covers.
# NOTE(review): the table axes presumably follow the order of `arguments`
# (a, b, a_xor_b) — confirm against gm.TableFactor.
# 'a': table of shape (2, 1, 1) with 0.5 in both entries.
dgm.node['a']['cpd'] = gm.TableFactor(arguments, ['a'])
dgm.node['a']['cpd'].table = np.array([[[0.5]], [[0.5]]])
# 'b': table of shape (1, 2, 1) with 0.5 in both entries.
dgm.node['b']['cpd'] = gm.TableFactor(arguments, ['b'])
dgm.node['b']['cpd'].table = np.array([[[0.5], [0.5]]])
# 'a_xor_b': table of shape (2, 2, 2).  Under the axis order assumed above,
# the entry is 0.9 where the last index equals (first index XOR second index)
# and 0.1 otherwise, i.e. a noisy XOR.
dgm.node['a_xor_b']['cpd'] = gm.TableFactor(arguments, ['a', 'b', 'a_xor_b'])
dgm.node['a_xor_b']['cpd'].table = np.array([[[0.9, 0.1], [0.1, 0.9]], [[0.1, 0.9], [0.9, 0.1]]])
dgm.draw()

In [197]:
# Draw 10000 samples from the model.  Later cells index the result with lists
# of node names (e.g. data[['a', 'b']]).
# NOTE(review): no random seed is set in this notebook, so presumably the
# sample differs between runs — confirm how dgm.rvs draws its randomness.
data = dgm.rvs(size=10000)

In [198]:
fact = dgm.cpd('a_xor_b')

In [199]:
# Candidate parents of the target: every argument of its factor except the target itself.
target = 'a_xor_b'
args = list(fact.arguments)
args.remove(target)
print(args)
print(len(args))

['a', 'b']
2


In [200]:
# Mutual information between the target and each candidate parent taken on
# its own, in the same order as `args`.
mi_array = np.array([
    gm.information.discrete_mutual_information(data[[target]], data[[parent]])
    for parent in args
])


def score(subset):
    """Additive score of a parent subset.

    `subset` is an indicator vector aligned with `args`; the result is the
    sum of the single-parent mutual informations of the selected parents.
    """
    per_parent_mi = mi_array
    return np.dot(subset, per_parent_mi)

In [201]:
# 100 random parent subsets of size 0 to 2 with their mutual-information scores.
X_train, y_train = generate_subset(target, fact, kmin=0, kmax=2, size=100)

In [218]:
# Fit a 2-input, 4-hidden-unit network mapping subset indicator vectors to
# their scores, with a weaker L2 penalty than the class default of 1e-4.
nf = Neurofunction(2, 4, lambda_=1e-5)
nf.fit(X_train, y_train)

In [219]:
nf([[0, 0], [0, 1], [1, 0], [1, 1]])

array([ -1.57331175e-04,   6.49270535e-04,   6.19116850e-04,
         3.71196144e-01])

In [221]:
print(gm.information.discrete_mutual_information(data[['a_xor_b']], 
                                                      data[['a', 'b']]))

0.371574610978
