In [1]:
import os
import math
import random

import numpy as np
import gymnasium as gym

import gym_env

In [2]:
# Build the 15x15 maze environment (the "simple-15x15" id is presumably
# registered by the `gym_env` import above -- TODO confirm) and reset it so the
# agent is placed in its initial position.
env = gym.make("simple-15x15")
env.reset()

  logger.deprecation(


({'agent': array([0, 0]), 'target': array([14, 14])}, {'distance': 28.0})

In [3]:
# Pull the pieces of the environment that the notebook works with.

# Action set, plus one sampled action
actions = np.arange(env.action_space.n, dtype=int)
random_action = env.unwrapped.random_action()

# Start / goal locations
start_loc = env.unwrapped.start_loc
target_loc = env.unwrapped.target_loc
target_locs = [target_loc]

# Maze grid: total number of cells and length of the first axis
maze = env.unwrapped.maze
size, maze_len = maze.size, maze.shape[0]

In [4]:
# Show the maze grid. In the printed layout 'S' sits at the agent position and
# 'G' at the target position reported by env.reset(); cells equal to '1' are
# the ones labelled "BAR" when the value grid is built further down.
print(maze)

[['S' '0' '1' '0' '0' '0' '1' '0' '0' '1' '0' '0' '0' '0' '0']
 ['0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '1' '0' '0' '0' '0']
 ['0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '1' '0' '0' '0' '0']
 ['0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '1' '1' '0' '0' '0']
 ['0' '0' '1' '0' '0' '0' '0' '1' '1' '0' '0' '1' '0' '0' '0']
 ['0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0' '0' '1' '1' '1']
 ['0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1' '1' '0' '0']
 ['0' '0' '0' '1' '1' '1' '1' '0' '0' '0' '0' '1' '0' '0' '0']
 ['0' '0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '1' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '1' '1' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '1' '1' '1' '0' '1' '1' '1' '1' '0' '0' '0']
 ['0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0']
 ['0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '1' '0' '0' '1' '1']
 ['0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'G']]


In [10]:
# Allocate the arrays needed for the calculations (one entry per maze cell)
DR = np.zeros((size, size))
V = np.zeros(size)
one_hot = np.eye(size)

# Hyperparameters: learning rate, discount factor, convergence threshold
alpha, gamma = 0.3, 0.95
significant_improvement = 1e-4

In [11]:
def row_col_to_index(row, col, len):
    """
    Flatten a (row, col) grid position into a single array index.

    `len` is the number of cells per row, so consecutive rows are `len`
    entries apart (the name shadows the builtin only inside this function).
    """
    row_offset = row * len
    return row_offset + col

In [12]:
class SR_no_action():
    """Reinforcement-learning agent that learns the state-state successor
    representation (SR) without taking actions.

    The agent is handed transitions (s -> s_new) rather than choosing them,
    so the resulting SR matrix is in the service of prediction.

    Initialization parameters

    gamma: discount parameter
    alpha: learning rate for the value weights W
    p_sample: probability of sampling different options, only relevant for
        testing policy dependence (stored on the agent, not used by its methods)
    NUM_STATES: the number of states in the environment, used to size the
        matrices
    sr_alpha: learning rate for the element-wise SR update; defaults to .2,
        the value that used to be hard-coded in update_SR

    Ida Momennejad, 2019
    """

    def __init__(self, gamma, alpha, p_sample, NUM_STATES, sr_alpha=.2):
        self.gamma = gamma  # discount factor
        self.alpha = alpha  # learning rate (value weights W)
        self.sr_alpha = sr_alpha  # learning rate (SR matrix M)
        self.p_sample = p_sample  # p(sampling options)

        # Initialize M with the identity instead of zeros
        self.M = np.eye(NUM_STATES)  # M: state-state SR

        self.W = np.zeros(NUM_STATES)  # W: value weights, 1D
        self.onehot = np.eye(NUM_STATES)  # one-hot matrix, for updating M

        self.V = np.zeros(NUM_STATES)  # value function
        self.biggest_change = 0
        self.significant_improvement = 0.001  # convergence threshold
        # Defined up-front so the flag can be read before the first step();
        # previously it only came into existence inside check_converegnce().
        self.convergence = False
        # Policy is not tracked here: the agent is passively moved.
        self.epsilon = .1
        self.memory = []

    def step(self, s, s_new, reward):
        """Learn from one observed transition s -> s_new with the given reward.

        Records the transition, updates M and W, then refreshes
        biggest_change / convergence from the change in the value of s.
        No policy update or Dyna replay is performed here.
        """
        old_v = self.get_value()

        self.update_memory(s, s_new)
        self.update_SR(s, s_new)
        self.update_W(s, s_new, reward)

        self.update_biggest_change(old_v[s], s)

    def onehot_row(self, successor_s):
        """Return a fresh one-hot vector (length = number of states) for successor_s."""
        row = np.zeros(len(self.W))
        row[successor_s] = 1
        return row

    def update_SR(self, s, s_new):
        """TD update of the single SR entry M[s, s_new].

        Learning is by element, as opposed to by row: only the s_new component
        of the TD error
            onehot(s_new) + gamma * M[s_new] - M[s]
        is applied, so only that component is computed.
        """
        td_error = 1.0 + self.gamma * self.M[s_new, s_new] - self.M[s, s_new]
        self.M[s, s_new] += self.sr_alpha * td_error

    def update_W(self, s, s_new, reward):
        """Update the value weight vector.

        It computes the normalized feature vector * reward prediction error.
        Here the reward function would be sufficient. The same, but R is
        easier. We use W in the PLoS Comp Biol 2017 paper, to account for BG
        weights allowing dopamine similarities between MF and MB learning.
        """
        # Future notes (27 Feb 2019): in the paper both get updated with every
        # transition; better to do batch updates. W is updated every step, but
        # M every couple of steps with dyna, like feature learning.
        # All rules are correct, but in practice for TD learning on features
        # it is a little weird to learn the feature vector with every step;
        # normally features are stable over the task.

        # Row of M for s, scaled by its squared norm
        norm_feature_rep = self.M[s] / (self.M[s] @ self.M[s].T)

        # Compute the values of s and s_new, then the prediction error
        V_snew = self.M[s_new] @ self.W
        V_s = self.M[s] @ self.W
        w_pe = (reward + self.gamma * V_snew - V_s).squeeze()

        # Update W with learning rate alpha
        self.W += self.alpha * w_pe * norm_feature_rep

    def get_value(self):
        """Combine the successor representation M and value weights W to
        determine the value of every state; stores it in self.V and returns it."""
        self.V = self.M @ self.W
        return self.V

    def update_memory(self, s, s_new):
        """Save the current state and the state visited in one step to memory.
        This is used in the Dyna version for replay."""
        self.memory.append([s, s_new])

    def update_biggest_change(self, old_v_m, s):
        """Compute the change in the value of s; if it is higher than the
        present max change, update biggest_change. Then refresh convergence."""
        V = self.get_value()
        self.biggest_change = max(self.biggest_change, np.abs(old_v_m - V[s]))
        self.check_converegnce()

    def check_converegnce(self):
        """Set self.convergence to True when biggest_change is below the
        significant_improvement threshold, False otherwise."""
        self.convergence = self.biggest_change < self.significant_improvement

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    check_convergence = check_converegnce

In [13]:
def SRclass_nathum_exp1(env, gamma, alpha, p_sample=None, verbose=0, max_episodes=None):

    ''' This function uses the reinforcement learning agent class
        SR_no_action to learn.
        Here the function takes the environment from Experiment 1 in our
        Nat Hum Beh paper & learns predictive representations with the
        specified learning rate and scale.

        Note: This is not SR dyna, nor SR-MB.
        This agent only learns the SR.

        Inputs:

        env: Gymnasium environment whose unwrapped form exposes `maze` and
             `random_action()` and whose observations carry an "agent"
             (row, col) entry
        gamma: discount factor passed to the agent
        alpha: learning rate passed to the agent
        p_sample: option-sampling probabilities passed to the agent
                  (defaults to [.5, .5])
        verbose: progress messages are printed only when this equals 2
        max_episodes: optional cap on the number of training episodes;
                      None (default) trains until convergence

        Outputs:

        M: SR matrix
        W: value weights W
        memory: memory of episodes
        episodes: # episodes it takes to reach convergence (or max_episodes
                  if the cap was hit first)

        Ida Momennejad, NYC, 2019'''

    if p_sample is None:
        p_sample = [.5, .5]

    # Read the grid geometry from the environment itself rather than from
    # notebook globals; the flat index stride is the number of columns.
    maze = env.unwrapped.maze
    n_cols = maze.shape[1]

    SR_agent = SR_no_action(gamma, alpha, p_sample, maze.size)
    episodes = 0

    while max_episodes is None or episodes < max_episodes:
        SR_agent.biggest_change = 0

        # Start every episode from a freshly reset environment so that the
        # recorded state matches where the agent actually is.
        obs, _ = env.reset()
        s = row_col_to_index(obs["agent"][0], obs["agent"][1], n_cols)

        done = False
        while not done:  # go through trajectory till the end
            # Get a random action
            a = env.unwrapped.random_action()
            obs, reward, terminated, truncated, _ = env.step(a)
            # An episode ends on either flag
            done = terminated or truncated
            s_new = row_col_to_index(obs["agent"][0], obs["agent"][1], n_cols)
            SR_agent.step(s, s_new, reward)

            s = s_new

        if verbose == 2:
            if episodes % verbose == 0:
                print(f'SR training episode #{episodes} Done.')
        episodes += 1

        if SR_agent.convergence:
            if verbose == 2:
                print(episodes, ' training episodes/iterations done')
            break

    return SR_agent.M, SR_agent.W, SR_agent.memory, episodes


In [14]:
# 2) Create the SR agent and let it learn the environment.
# Returns the SR matrix M, the value weights W, the list of visited
# transitions and the number of training episodes.
# NOTE(review): training only stops once the agent reports convergence, and the
# saved output of this cell is a KeyboardInterrupt -- confirm that M and W used
# below come from a completed run.
M, W, mem, total_episodes = SRclass_nathum_exp1(env, gamma, alpha)

KeyboardInterrupt: 

In [None]:
# Test it out

# Value of every state: SR matrix times the value weights
V = M@W

# Lay the values out on the maze grid; cells marked "1" get a label instead
v_maze = np.zeros_like(maze)
for row, col in np.ndindex(v_maze.shape):
    if maze[row, col] == "1":
        v_maze[row, col] = "BAR"
    else:
        flat_idx = row_col_to_index(row, col, maze_len)
        v_maze[row, col] = round(V[flat_idx], 7)

In [None]:
# Display the value grid ("BAR" marks the cells equal to "1" in the maze)
v_maze

array([['2.2158782', '0.232469', 'BAR', '0.00063', '-0.0003467',
        '3.21e-05', 'BAR', '-0.0014005', '0.0013277', 'BAR', '0.0',
        '0.0', '0.0', '0.0', '0.0'],
       ['0.2372448', '0.2847028', '0.0020548', 'BAR', '-0.0001539',
        '0.00017', '-8.03e-05', 'BAR', '0.0001798', '-0.0003039', 'BAR',
        '0.0', '0.0', '0.0', '0.0'],
       ['0.132729', '-0.0006006', '-0.0007308', 'BAR', '0.000126',
        '7.41e-05', '-4.2e-05', 'BAR', '-1.77e-05', '0.0005734', 'BAR',
        '0.0', '0.0', '0.0', '0.0'],
       ['0.0008737', '0.0049573', 'BAR', '0.0001837', '6.73e-05',
        '-0.0003644', '0.0005158', 'BAR', '0.0002517', '0.0009825',
        'BAR', 'BAR', '0.0', '0.0', '0.0'],
       ['-0.0018272', '-0.0088824', 'BAR', '-0.0001209', '9.15e-05',
        '2.1e-06', '0.0001697', 'BAR', 'BAR', '-0.0007858', '0.0002091',
        'BAR', '0.0', '0.0', '0.0'],
       ['0.0020045', '0.0025835', 'BAR', '9.35e-05', '0.0002054',
        '0.0003045', '-0.0005983', 'BAR', '-0.0003616