In [21]:
import numpy as np
from pymdp.envs import TMazeEnv

import matplotlib.pyplot as plt

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Build the T-maze environment. The two reward_probs values show up as the
# 0.98 / 0.02 entries of the reward likelihood A[1] printed further down.
env = TMazeEnv(reward_probs=[0.98, 0.02])
# A: observation likelihoods, one array per modality (location, reward, cue)
A = env.get_likelihood_dist()
# B: transition distributions, one array per hidden-state factor (location, context)
B = env.get_transition_dist()

`A_gp[i][k, j, l]`: in context `l`, for modality `i`, if the agent is at location `j` (e.g. after taking action `j`), what is the probability of observing outcome `k`? Note that the observation index comes first.

In [25]:
print(len(A))  # One likelihood array per observation modality: location, reward, cue

3


In [17]:
# Likelihood for the location modality: p(observed location | location, context)
print(A[0][:,:,0])  # Axes: observed location, location state; context (reward condition) fixed to index 0

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [26]:
# Likelihood for the reward modality: p(reward outcome | location, context)
print(A[1][:,:,0])  # Axes: reward outcome, location state; context (reward condition) fixed to index 0


[[1.   0.   0.   1.  ]
 [0.   0.98 0.02 0.  ]
 [0.   0.02 0.98 0.  ]]


In [59]:
# Likelihood for the cue modality: p(cue | location, context)
print(A[2][:,:,0])  # Axes: cue outcome, location state; context (reward condition) fixed to index 0

[[0.5 0.5 0.5 1. ]
 [0.5 0.5 0.5 0. ]]


In [31]:
print(len(B))  # One transition array per hidden-state factor: location, context

2


In [39]:
B[0].shape  # p(next location | current location, action); axes: next location, current location, action

(4, 4, 4)

In [44]:
# p(next location = center | current location, action = "go to center"),
# one entry per current location.
B[0][0, :, 0]  # => all ones: the agent is certain to end up in the center wherever it is

array([1., 1., 1., 1.])

In [41]:
B[1].shape  # p(next context | current context, dummy action); the last axis has a single (dummy) action

(2, 2, 1)

In [27]:
# Beliefs over the states: p(location), p(context)
# Prior beliefs over the hidden states, one vector per factor: p(location), p(context)
start_location_belief = np.array([1, 0, 0, 0])   # Certain that it starts in the center
start_context_belief = np.array([0.5, 0.5])      # No idea which reward condition holds
D = [start_location_belief, start_context_belief]

# Prior preferences over the observations, one vector per modality:
# \tilde p(location), \tilde p(reward), \tilde p(cue)
location_preference = np.array([0., 0., 0., 0.])  # Indifferent to where it is
reward_preference = np.array([0., 3., -3.])       # Prefers reward (+3) to no reward (0), and dislikes punishment (-3)
cue_preference = np.array([0., 0.])               # Indifferent to which cue it observes
C = [location_preference, reward_preference, cue_preference]

In [28]:
obs = env.reset()  # Reset the environment and get the initial observation
obs  # One outcome index per modality, in the order: location, reward, cue

[0, 0, 1]

The location (first entry of `obs`) is one of CENTER, RIGHT ARM, LEFT ARM, or CUE LOCATION (indices 0–3, in that order).

`A_gp[i][k, j, l]`:
in context `l`, for modality `i`, if the agent is at location `j` (e.g. after taking action `j`), what is the probability of observing outcome `k`? Note that the observation index comes first.

In [60]:
interactable = 0  # Only the location factor can be acted on; the context cannot
all_actions = np.arange(4, dtype=int)

# q(location | action): row `action` holds the predicted distribution over next
# locations, obtained by pushing the prior location belief through the
# transition matrix of that action.
location_transitions = B[interactable]
location_belief = D[interactable]
qlocation_pi = np.zeros((4, 4))
for action in all_actions:
    qlocation_pi[action, :] = location_transitions[:, :, action].dot(location_belief)
 
         

In [72]:
# Predicted reward observations under each action, q(reward | action).
# 1) Marginalise the reward likelihood over the context belief D[1]:
#    (reward, location, context) . (context,) -> (reward, location)
# 2) Mix over the predicted locations. qlocation_pi is indexed [action, location],
#    so it has to be transposed to (location, action) for the matrix product to
#    sum over locations; the result is indexed [reward, action].
#    (Without the transpose the result is only right when qlocation_pi is
#    symmetric, e.g. the identity obtained when starting from the center.)
reward_given_location = A[1].dot(D[1])
qreward = reward_given_location.dot(qlocation_pi.T)
print(qreward)

[[1.  0.  0.  1. ]
 [0.  0.5 0.5 0. ]
 [0.  0.5 0.5 0. ]]


In [80]:
C[1]  # Preferences over the outcomes of the reward modality

array([ 0.,  3., -3.])

In [84]:
def softmax(dist):
    """
    Softmax of `dist`, normalised along axis 0.

    For a 1-D array this is the usual softmax; for a 2-D array every column
    is normalised independently.

    Parameters
    ----------
    dist : numpy.ndarray
        Unnormalised values (e.g. log-probabilities or preferences).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `dist` whose entries sum to 1 along axis 0.
    """
    # Subtracting the maximum along axis 0 leaves the result unchanged but
    # keeps the exponentials from overflowing.
    exponentials = np.exp(dist - dist.max(axis=0))
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

In [86]:
# Normalise the reward preferences into a probability distribution over reward outcomes
C1 = softmax(C[1])
C1

array([0.04731416, 0.95033021, 0.00235563])

In [91]:
# Pragmatic value ("expected reward"): for each column of qreward (one per action),
# the expectation of the log-preference log(C1) under the predicted reward outcomes.
qreward.T.dot(np.log(C1))

array([-3.05094576, -3.05094576, -3.05094576, -3.05094576])