# Continuous Actions and Continuous Observations

In the previous two notebooks we developed active inference agents for environments with continuous action spaces and for environments with continuous observation spaces. Starting from the minimal environment and the minimal agent, where both the action space and the observation space were discrete, the changes required for each of these agent/environment properties are non-overlapping. This makes it straightforward to combine them and create an active inference agent that can interact with environments where both of these spaces are continuous.

We start by modifying the minimal environment to emit continuous-valued observations and to accept continuous-valued actions. Then, we modify the components of the minimal agent that currently exploit the discreteness of the observation and action spaces.

#### Housekeeping (run once per kernel restart)

In [None]:
# Change the working directory to the parent of the current one.
# NOTE(review): presumably this lets the project modules imported in later
# cells resolve from the repository root - confirm against the repo layout.
# Run this cell only once per kernel restart: the path is relative, so every
# additional run moves one more level up.
import os
os.chdir('..')
print(os.getcwd())

# Imports

In [None]:
import importlib
import itertools

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.markers import CARETUP, CARETDOWN
import pandas as pd
from scipy.stats import beta
import seaborn as sns
import torch

# Continuous Action and Observation Environment

Because here we simply merge modifications from the two previous notebooks we just present the final solution without analysing individual component changes again.

## Full environment specification

In [None]:
# environment
class ContinuousAOEnv(object):
  """ Wrap-around 1D state space with single food source.
  
  The probability of sensing food at locations near the food source decays 
  exponentially with increasing distance.
  
  state (int): 1 of N discrete locations in 1D space.
  observation (float): proportion of times food detected in finite sample,
    drawn from a per-state Beta distribution.
  actions (float): intention to move left or right; clipped to a_lims
    (default [-2, 2]) and converted into a noisy integer step.
  """
  def __init__(self, 
               N = 16, # how many discrete locations can the agent reside in
               s_0 = 0, # where does the agent start each episode?
               s_food = 0, # where is the food?
               sigma_move = 0.75, # Gaussian stdev around continuous move
               o_sample_size=10, # observation Beta distribution parameter.
               a_lims = (-2, 2), # maximum step in either direction.
               p_o_max = 0.9, # maximum probability of sensing food
               o_decay = 0.2 # decay rate of observing distant food source
               ):
    
    self.o_decay = o_decay
    self.var_move = sigma_move**2
    self.o_sample_size = o_sample_size
    self.p_o_max = p_o_max
    self.s_0 = s_0
    self.s_food = s_food
    self.s_N = N
    # NOTE(review): looks like a leftover from the discrete-observation
    # environment ({False, True}); observations here are continuous and this
    # attribute is not read anywhere in this class. Kept for compatibility.
    self.o_N = 2
    # Store a private copy so that mutating one environment's limits can
    # never leak into the default value or into the caller's sequence.
    self.a_lims = list(a_lims)

    # Environment dynamics are governed by two probability distributions
    # 1. state transition probability p(s'|s, a)
    # 2. emission/ observation probability p(o|s)
    #
    # With continuous-valued actions, we can no longer represent (1.) with a
    # single conditional probability table. However, we can generate one
    # table of size |S| x |S| for each continuous action value:
    # self.p_s1_given_s_a(a=a) returns p[s, s1] for given a; slice of Matrix B
    self.d_s = self._signed_state_distances()

    # We pre-compute the conditional emission random variables (2.) here so
    # agents can access the true dynamics if required.
    self.p_o_given_s = self.emission_probability() # Matrix A
    
    self.s_t = None # state at current timestep

  def _signed_state_distances(self):
    """ computes signed shortest wrap-around distances between all states.

    Returns:
    d[s, s1] of size (s_N, s_N): the signed step with the smallest magnitude
    that leads from state s to state s1 on the ring.
    """
    s = np.arange(self.s_N)
    other, this = np.meshgrid(s, s)
    d = other - this
    # candidates that go around the ring in either direction
    d1 = other - this + self.s_N
    d2 = other - this - self.s_N
    d[np.abs(d) > np.abs(d1)] = d1[np.abs(d) > np.abs(d1)]
    d[np.abs(d) > np.abs(d2)] = d2[np.abs(d) > np.abs(d2)]
    return d
  
  def _p_a_discrete_given_a(self, a):
    """ probability distribution of a discrete action (step) given a 
    continuous action intent.

    Returns:
    a_discrete: candidate integer steps -(s_N-1), ..., (s_N-1)
    p_a: probability of each candidate step; zero outside a_lims

    Raises:
    ValueError if no candidate step has positive probability, e.g. because
    a_lims contains no integer.
    """
    a = np.clip(a, self.a_lims[0], self.a_lims[1])
    a_discrete = np.arange(2*self.s_N-1) - self.s_N + 1
    # unnormalised Gaussian around the (clipped) intent
    p_a = np.exp(-0.5 * (a_discrete-a)**2 / self.var_move)
    p_a[a_discrete > self.a_lims[1]] = 0
    p_a[a_discrete < self.a_lims[0]] = 0
    total = p_a.sum()
    # `not total > 0` also catches NaN, which would otherwise propagate
    # silently into the transition matrix.
    if not total > 0:
      raise ValueError(
          "no admissible discrete step for a=%r with a_lims=%r"
          % (a, self.a_lims))
    p_a = p_a/total
    return a_discrete, p_a
  
  def p_s1_given_s_a(self, a):
    """ computes transition probability p(s'| s, a) for specific a
    
    Note: this is provided for convenience in the agent; it is not used within
    the environment simulation. Step probabilities are folded onto the ring
    (modulo s_N) exactly like step() does, so every row sums to one even if
    a_lims allows steps longer than half the ring.

    Returns:
    p[s, s1] of size (s_N, s_N)
    """
    a_d, p_a = self._p_a_discrete_given_a(a=a)
    # probability of each net displacement 0, ..., s_N-1 on the ring
    p_shift = np.bincount(a_d % self.s_N, weights=p_a, minlength=self.s_N)
    s = np.arange(self.s_N)
    # displacement needed to get from s (rows) to s1 (columns)
    return p_shift[(s[None, :] - s[:, None]) % self.s_N]

  def emission_probability(self):
    """ initialises conditional random variables p(o|s). 
    
    Returns:
    p[s] of size (s_N) with one scipy.stats.rv_continuous per state
    """
    s = np.arange(self.s_N)
    # distance from food source
    d = np.minimum(np.abs(s - self.s_food), 
                   np.minimum(
                   np.abs(s - self.s_N - self.s_food), 
                   np.abs(s + self.s_N - self.s_food)))
  
    # exponentially decaying concentration ~ probability of detection
    mean = self.p_o_max * np.exp(-self.o_decay * d)
    # continuous relaxation: proportion of food detected in finite sample
    sample_size = self.o_sample_size
    return np.array([beta(a=m*sample_size, b=(1-m)*sample_size) for m in mean])

  def reset(self):
    """ moves the agent to the start state s_0 and returns an observation. """
    self.s_t = self.s_0
    return self.sample_o()

  def step(self, a):
    """ applies continuous action intent a and returns the next observation.

    The intent is converted into a sampled integer step and the state wraps 
    around modulo s_N. If the environment has not been reset yet, a warning 
    is printed and reset() is called first.
    """
    if (self.s_t is None):
      print("Warning: reset environment before first action.")
      self.reset()
      
    a_discrete = self.sample_a(a)
    self.s_t = (self.s_t + a_discrete) % self.s_N
    return self.sample_o()

  def sample_o(self):
    """ samples an observation from p(o|s) for the current state. """
    return self.p_o_given_s[self.s_t].rvs()
  
  def sample_a(self, a):
    """ samples a discrete step for continuous action intent a. """
    a_d, p_a = self._p_a_discrete_given_a(a=a)
    return np.random.choice(a_d, p=p_a)

## Random Agent Behavior

To test the environment we simulate a random agent's interactions with it. Here, the random agent samples actions uniformly from the environment's action limits `env.a_lims`, which are set to `[-3, 3]` in the cell below.

In [None]:
import continuous_action_and_observation_environment as caoe
importlib.reload(caoe)  # pick up edits to the module without a kernel restart

env = caoe.ContinuousAOEnv(
    N=16,               # number of states
    s_food=0,           # location of the food source
    sigma_move=0.75,    # Gaussian noise around continuous move
    o_sample_size=100,  # variance of observation decreases with increasing sample size.
    a_lims=[-3, 3],     # maximum number of steps in either direction
)

n_steps = 100

# Histories of states, observations and actions; states and observations
# hold one more entry than actions because they include the initial reset.
o = env.reset()
ss = [env.s_t]
oo = [o]
aa = []

for i in range(n_steps):
    # random agent: uniform action intent within the environment's limits
    a = np.random.uniform(low=env.a_lims[0], high=env.a_lims[1])
    o = env.step(a)
    aa.append(a)
    ss.append(env.s_t)
    oo.append(o)

We inspect the sequence of states, actions and emissions during this interaction.

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(16, 12))

# Top panel: state trajectory, food location and the intended moves.
ax[0].plot(ss, label='agent state $s_t$')
ax[0].plot(np.ones_like(ss) * env.s_food,
           'r--', label='food source', linewidth=1)
# ss holds one more entry than aa; zip pairs each action with the state it
# was taken from.
for i, (s_i, a_i) in enumerate(zip(ss, aa)):
    ax[0].plot([i, i], [s_i, s_i + a_i],
               color='orange',
               linewidth=0.5,
               marker=CARETUP if a_i > 0 else CARETDOWN,
               label='action' if i == 0 else None)  # single legend entry

ax[0].set_xlabel('timestep t')
ax[0].set_ylabel('state s')
ax[0].legend(loc=1)

# Bottom panel: continuous observations emitted by the environment.
ax[1].plot(np.array(oo))
ax[1].set_xlabel('timestep t')
ax[1].set_ylabel('observation o')

# Continuous Action and Observation Agent

Because here we simply merge modifications from the two previous notebooks we just present the final solution without analysing individual component changes again.

In [None]:
def softmax(x):
  """ Softmax of a numpy array, taken over all of its elements.

  The maximum is subtracted before exponentiating so that large logits do not
  overflow.
  """
  weights = np.exp(x - x.max())
  total = weights.sum()
  return weights / total

def kl(a, b):
    """ Discrete KL-divergence KL[a || b].

    Entries where a == 0 contribute nothing to the sum (the convention
    0 * log 0 = 0), so a belief whose probabilities underflow to exactly zero
    no longer turns the result into NaN.

    Args:
        a: probabilities of the first distribution.
        b: probabilities of the second distribution, broadcastable to a.

    Returns:
        The divergence as a numpy float; inf if b is zero where a is not.
    """
    a, b = np.broadcast_arrays(np.asarray(a, dtype=float),
                               np.asarray(b, dtype=float))
    support = a > 0
    return (a[support] * (np.log(a[support]) - np.log(b[support]))).sum()

class ContinuousAOAgent:
    """ Active inference agent for continuous actions and observations.

    The belief over the environment's discrete states is stored in `self.b`
    as the parameters of a softmax (logits). `reset()` must be called before
    the first `step()` because it creates the belief and the state
    preference.
    """
    
    def __init__(self, 
                 env,
                 target_state, 
                 shape_target=False, # smooth preference distribution using pairwise state distances
                 n_plans=128, # number of plans rolled out during action selection
                 k=2, # planning horizon
                 n_o_samples=10, # observation samples for information gain
                 use_info_gain=True, # score actions by info gain
                 use_pragmatic_value=True, # score actions by pragmatic value
                 select_max_pi=False, # sample plan (False), select max negEFE (True).
                 n_steps_o=20, # optimization steps after new observation
                 n_steps_a=20, # optimization steps after new action
                 lr_o=4., # learning rate of optimization after new observation
                 lr_a=4.): # learning rate of optimization after new action
        
        self.env = env
        self.target_state = target_state
        self.shape_target = shape_target
        self.n_plans = n_plans
        self.k = k
        self.n_o_samples = n_o_samples
        self.use_info_gain = use_info_gain
        self.use_pragmatic_value = use_pragmatic_value
        self.select_max_pi = select_max_pi
        self.n_steps_o = n_steps_o
        self.n_steps_a = n_steps_a
        self.lr_a = lr_a
        self.lr_o = lr_o
        
    def reset(self):
        """ Initialise the state preference and a uniform state belief. """
        # initialize state preference
        if self.shape_target:
            # preference decreases linearly with distance from the target
            self.b_star = np.ones(shape=self.env.s_N) - \
                          np.abs(self.env.d_s[self.target_state])
        else:
            # sharply peaked preference at the target state
            self.b_star = np.eye(self.env.s_N)[self.target_state] * 10
        self.log_p_c = np.log(softmax(self.b_star))
        # initialize state prior as uniform (all logits equal)
        self.b = np.zeros(self.env.s_N)
        
    def step(self, o, debug=False):
        """ Update the belief with observation o and select the next action.

        Returns:
            The selected action, or with debug=True the tuple
            (a, ll_o, ll_a, max_a) produced by `_step_debug`.
        """
        if debug:
            return self._step_debug(o)
        
        self.b = self._update_belief(theta_prev=self.b, o=o)
        # pop first action of selected plan
        a = self._select_action(theta_start=self.b)[0]
        self.b = self._update_belief_a(theta_prev=self.b, a=a)
        return a
    
    def _step_debug(self, o):
        """ Same as `step` but also returns diagnostics.

        Returns:
            a: first action of the selected plan.
            ll_o: loss per optimization step of the observation update.
            ll_a: loss per optimization step of the action update.
            max_a: first action of the plan with the highest probability.
        """
        self.b, ll_o = self._update_belief(theta_prev=self.b, o=o, debug=True)
        a, plans, p_pi, _ = self._select_action(theta_start=self.b, debug=True)
        max_a = plans[np.argmax(p_pi)][0]
        a = a[0]
        self.b, ll_a = self._update_belief_a(theta_prev=self.b, a=a, debug=True)
        return a, ll_o, ll_a, max_a
    
    def _update_belief_a(self, theta_prev, a, debug=False):
        """ Predict the belief after taking action a.

        The predicted state distribution is obtained by pushing the current
        belief through the environment's transition matrix for a; new logits
        are then fitted to it by gradient descent on the cross-entropy.

        Returns:
            The fitted logits (numpy array); with debug=True additionally
            the loss recorded at every optimization step.
        """
        # prior assumed to be expressed as parameters of the softmax (logits)
        theta = torch.tensor(theta_prev)
        q = torch.nn.Softmax(dim=0)(theta)

        # this is the prior for the distribution at time t
        q1 = torch.matmul(q, torch.tensor(self.env.p_s1_given_s_a(a=a)))

        # initialize parameters of updated belief to uniform
        theta1 = torch.zeros_like(theta, requires_grad=True)
        loss = torch.nn.CrossEntropyLoss() # expects logits and target distribution.
        optimizer = torch.optim.SGD([theta1], lr=self.lr_a)
        if debug:
            ll = np.zeros(self.n_steps_a)

        for i in range(self.n_steps_a):
            l = loss(theta1, q1)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            if debug:
                ll[i] = l.detach().numpy()

        theta1 = theta1.detach().numpy()
        if debug:
            return theta1, ll

        return theta1
    
    def _update_belief(self, theta_prev, o, debug=False):
        """ Update the belief given a new observation o.

        Starting from the current logits, minimises the free energy
        KL[ q(s) || p(s, o) ] by gradient descent, where p(s, o) is the
        product of the emission density p(o|s) and the current belief.

        Returns:
            The updated logits (numpy array); with debug=True additionally
            the loss recorded at every optimization step.
        """
        theta = torch.tensor(theta_prev)

        # make p(s) from b
        q = torch.nn.Softmax(dim=0)
        p_o_given_s = torch.tensor([p.pdf(o) for p in self.env.p_o_given_s])
        p = p_o_given_s * q(theta) # p(o|s)p(s)
        log_p = torch.log(p)

        # initialize updated belief with current belief
        theta1 = torch.tensor(theta_prev, requires_grad=True)

        # estimate loss
        def forward():
            q1 = q(theta1)
            # free energy: KL[ q(s) || p(s, o) ]
            fe = torch.sum(q1 * (torch.log(q1) - log_p))
            return fe

        optimizer = torch.optim.SGD([theta1], lr=self.lr_o)
        if debug:
            # the loss trace is only needed for diagnostics
            ll = np.zeros(self.n_steps_o)
        for i in range(self.n_steps_o):
            l = forward()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            if debug:
                ll[i] = l.detach().numpy()

        theta1 = theta1.detach().numpy()
        if debug:
            return theta1, ll

        return theta1

    def _select_action(self, theta_start, debug=False):
        """ Sample candidate plans, score them and pick one.

        Plans are sequences of k actions drawn uniformly from the
        environment's action limits. Each plan is scored by its mean negative
        expected free energy over the rollout.

        Returns:
            The selected plan (list of k actions). With debug=True the tuple
            (plan, plans, p_pi, info_gains), where p_pi is the probability of
            following each plan and info_gains holds the per-observation
            information gains of the first step of the last evaluated plan.
        """
        # sampling
        a_lims = self.env.a_lims
        plans = np.random.uniform(low=a_lims[0], high=a_lims[1], 
                                  size=(self.n_plans, self.k)).tolist()
        
        # evaluate negative expected free energy of all plans
        nefes = []
        info_gains = [] # stays defined even if no plan is evaluated
        for pi in plans:
            if debug:
                step_nefes, info_gains = self._rollout_step(theta_start, pi, 
                                                            debug=True)
            else:
                step_nefes = self._rollout_step(theta_start, pi)
            
            nefe = np.array(step_nefes).mean() # expected value over steps
            nefes.append(nefe)

        # compute probability of following each plan
        p_pi = softmax(np.array(nefes)).tolist()
        if self.select_max_pi:
            a = plans[np.argmax(nefes)]
        else:
            a = plans[np.random.choice(len(plans), p=p_pi)]

        if debug:
            return a, plans, p_pi, info_gains

        return a
      
    def _rollout_step(self, theta, pi, debug=False):
        """ Recursively score plan pi starting from the belief logits theta.

        Returns:
            A list with the negative expected free energy of every step of
            pi. With debug=True the tuple (that list, information gain of
            each sampled observation at the first step).
        """
        if len(pi) == 0:
            # keep the return shape consistent with the debug flag so that
            # callers can always unpack two values when debugging
            return ([], []) if debug else []

        a, pi_rest = pi[0], pi[1:]
        # Where will I be after taking action a?
        theta1 = self._update_belief_a(theta, a=a) 
        q = softmax(theta1)
        # Do I like being there?
        pragmatic = np.dot(q, self.log_p_c)
        # What might I observe after taking action a? (marginalize p(o, s) over s)
        ss = np.random.choice(range(self.env.s_N), p=q, size=self.n_o_samples)
        oo = [rv.rvs() for rv in self.env.p_o_given_s[ss]]
        # Do I learn about s from observing o?
        q_o = [softmax(self._update_belief(theta1, o=o)) for o in oo]
        d_o = [kl(q_o_i, q) for q_o_i in q_o] # info gain for each observation
        info_gain = np.mean(d_o) # expected value of info gain
        # negative expected free energy for this timestep
        nefe = self.use_pragmatic_value * pragmatic + \
               self.use_info_gain * info_gain
        
        # nefe for remainder of policy rollout
        nefe_rest = self._rollout_step(theta1, pi_rest)
        # concatenate expected free energy across future time steps
        if debug:
            return [nefe] + nefe_rest, d_o

        return [nefe] + nefe_rest

The code below iterates over all steps involved in the interaction between the environment and the active inference agent. In each interaction step, the agent updates its belief about the current state given a new observation and selects an action to minimise expected free energy. It then updates its belief assuming the selected action was taken and starts anew by updating its belief based on the next observation.

In [None]:
# Run the active inference agent in the environment for 64 steps, recording
# states, beliefs and actions, and optionally plotting the loss curves of
# every belief update.
import importlib
import continuous_action_and_observation_environment as caoe
import continuous_action_and_observation_agent as caoa
# reload so that edits to the modules are picked up without a kernel restart
importlib.reload(caoe)
importlib.reload(caoa)

target_state = 4
k = 4 # planning horizon; each of the n_plans sampled plans is rolled out for k steps

# runtime increases linearly with optimization steps during belief update
n_steps_o = 20 # optimization steps updating belief after observation
n_steps_a = 10 # optimization steps updating belief after action
lr_o = 4. # learning rate updating belief after observation
lr_a = 4. # learning rate updating belief after action

render_losses = True

env = caoe.ContinuousAOEnv(N=16, # number of states
                           s_food=0, # location of the food source
                           s_0=10, # starting location 
                           o_sample_size=3) # observation Beta distribution parameter.

# visualise emission probability: 1000 observation samples per state
samples = [env.p_o_given_s[s].rvs(size=1000) for s in range(env.s_N)]
df = pd.DataFrame(np.array(samples).T) # one column per state
sns.violinplot(df, cut=0, width=2.5)
plt.xlabel('state s')
plt.ylabel('p(o|s)')
plt.title('Continuous environment emission probability')

agent = caoa.ContinuousAOAgent(env=env, 
                               target_state=target_state,
                               shape_target=False,
                               k=k, 
                               n_plans=128,
                               n_o_samples=1, # observation samples for information gain
                               use_info_gain=True,
                               use_pragmatic_value=True,
                               select_max_pi=True,
                               n_steps_o=n_steps_o, 
                               n_steps_a=n_steps_a, 
                               lr_a=lr_a, 
                               lr_o=lr_o)

o = env.reset() # set state to starting state
agent.reset() # initialize belief state and target state distribution

# histories; ss and bb include the initial state/belief, so they end up one
# entry longer than aa
ss = [env.s_t]
bb = [agent.b]
aa = []
if render_losses:
    # one panel per kind of belief update; every step adds one curve
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    ax[0].set_title('updates from actions')
    ax[0].set_ylabel('loss')
    ax[0].set_xlabel('optimization step')
    ax[1].set_title('updates from observations')
    ax[1].set_ylabel('loss')
    ax[1].set_xlabel('optimization step')
    
for i in range(64):
    # debug=True additionally returns the loss traces of both belief updates
    # and the first action of the most probable plan
    a, ll_o, ll_a, max_a = agent.step(o, debug=True)
    # bb[-1] is the belief recorded before this step's updates
    print(f"step {i}, s: {env.s_t}, max b:{bb[-1].argmax()}, o: {o:.2f}, top a: {max_a}, a: {a}")
    if render_losses:
        ax[0].plot(ll_a)
        ax[1].plot(ll_o)
    
    o = env.step(a)
    
    ss.append(env.s_t)
    bb.append(agent.b)
    aa.append(a)


from matplotlib.markers import CARETUP, CARETDOWN
aa = np.array(aa)
ss = np.array(ss)

# Belief history as an image (states on the vertical axis, time on the
# horizontal axis) with states, actions, target and food drawn on top.
fig, ax = plt.subplots(figsize=(16, 6))
plt.imshow(np.array(bb).T, label='belief')

# ss holds one more entry than aa; zip pairs each action with the state it
# was taken from. The image's vertical axis points downwards, hence a
# positive action is drawn with a downward caret.
for i, (s_i, a_i) in enumerate(zip(ss, aa)):
    plt.plot([i, i], [s_i, s_i + a_i],
             color='orange',
             linewidth=0.5,
             marker=CARETDOWN if a_i > 0 else CARETUP,
             label='action' if i == 0 else None)  # single legend entry

x_span = [0, len(ss) - 1]
plt.plot(ss, label='state')
plt.plot(x_span, [target_state] * 2, label='target')
plt.plot(x_span, [env.s_food] * 2, 'w--', label='food')
plt.legend()