In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import tensorboard


Configuration

In [2]:
# --- problem size ---------------------------------------------------------
# number of spatial dimensions
ndim = 2
# maximum number of time steps, which is also the length of each trajectory
T = 500
# number of trajectories generated for exploration with pi_explore
M = 250

# --- acceleration bounds ----------------------------------------------------
a_max = 100
a_min = -a_max

# --- distance between consecutive trajectory points -------------------------
d_max = 3 * a_max
d_min = 1

# --- velocity bounds --------------------------------------------------------
v_max = 5 * a_max
v_min = -v_max

# --- observation / termination ----------------------------------------------
# number of upcoming targets visible from the current time step
num_observables = 3
# largest distance the agent may have to its next target before the episode
# is cancelled
max_agent_distance = 100 * d_max
# intended as a normalization factor
max_distance_from_origin = T * d_max

# --- training ---------------------------------------------------------------
epochs = 50
batch_size = 32

Toy Environment Dynamics

In [3]:
a_v = 2.5  # constant weight applied to the acceleration in the velocity update
a_x = 2.5  # constant weight applied to the velocity in the position update


def velocity_next(velocity, acceleration):
    """Return the velocity after one step: ``velocity + a_v * acceleration``."""
    return velocity + a_v * acceleration


def position_next(position, velocity):
    """Return the position after one step: ``position + a_x * velocity``."""
    return position + a_x * velocity


def get_reward(position, target):
    """Return the reward: the negative Euclidean distance from position to target."""
    return -np.linalg.norm(position - target)


Action Space

In [4]:
# Current acceleration: one component per dimension, starting at zero.
a = np.zeros(ndim)


State Space

In [5]:
# A single array holds the whole agent state; x, v and y are views into it,
# so writing through them (e.g. ``x[:] = ...``) updates ``s`` as well.
s = np.zeros((num_observables + 2, ndim))   # rows: position, velocity, targets
x, v = s[0], s[1]                           # position row, velocity row
y = s[2:]                                   # next observable target(s)


State Model

In [6]:
# Variant 1 (recurrent): initial state + sequence of actions -> state(s).
# Expected input layout: [batch, timesteps, feature].
f_s_rnn = keras.Sequential()
f_s_rnn.add(keras.layers.GRU(32, return_sequences=True))
f_s_rnn.add(keras.layers.Dense(s.size))
f_s_rnn.compile(optimizer='adam', loss='mse')

# Variant 2 (feed-forward): (state, action) -> next state.
# The input is the flattened state with the action stacked underneath it,
# i.e. s.size + a.size values. The output layer is linear (a final tanh is
# left disabled).
f_s = keras.Sequential()
f_s.add(keras.layers.Dense(32, input_shape=(s.size + a.size,)))
f_s.add(keras.layers.Activation('relu'))
f_s.add(keras.layers.Dense(32))
f_s.add(keras.layers.Activation('relu'))
f_s.add(keras.layers.Dense(s.size))
f_s.compile(optimizer='adam', loss='mse')

Reward Model

In [7]:
# Reward model: flattened (state, action) -> scalar reward.
# Input width is s.size + a.size; the single output unit is linear (a final
# tanh is left disabled).
f_r = keras.Sequential()
f_r.add(keras.layers.Dense(32, input_shape=(s.size + a.size,)))
f_r.add(keras.layers.Activation('relu'))
f_r.add(keras.layers.Dense(32))
f_r.add(keras.layers.Activation('relu'))
f_r.add(keras.layers.Dense(1))
f_r.compile(optimizer='adam', loss='mse')

Policy Model


In [8]:
# Policy model: flattened state -> action.
# One linear output unit per action component (a final tanh is left disabled).
pi = keras.Sequential()
pi.add(keras.layers.Dense(32, input_shape=(s.size,)))
pi.add(keras.layers.Activation('relu'))
pi.add(keras.layers.Dense(32))
pi.add(keras.layers.Activation('relu'))
pi.add(keras.layers.Dense(a.size))
pi.compile(optimizer='adam', loss='mse')


Random exploration policy

In [9]:
def pi_explore(state):
    """Random exploration policy: draw one integer acceleration per dimension.

    ``state`` is accepted so the function has the same call shape as a real
    policy, but it is not used. Each component is drawn uniformly from the
    closed range ``[a_min, a_max]``.
    """
    # np.random.randint treats ``high`` as exclusive; ``a_max + 1`` makes the
    # maximum acceleration reachable and keeps the range symmetric around 0.
    return np.random.randint(low=a_min, high=a_max + 1, size=ndim)

Observe Environment

In [None]:
def _random_trajectories(count, length, dims):
    """Return ``count`` random target trajectories of shape (count, length, dims).

    Every trajectory starts at the origin; each following point adds an
    integer step drawn per dimension from ``[d_min, d_max)`` (``high`` is
    exclusive in ``np.random.randint``).
    """
    # The builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy releases.
    paths = np.zeros(shape=(count, length, dims), dtype=float)
    if length > 1:
        steps = np.random.randint(low=d_min, high=d_max, size=(count, length - 1, dims))
        paths[:, 1:] = np.cumsum(steps, axis=1)
    return paths


# Flat per-step records (all episodes concatenated in order) ...
record_r = []
record_a = []
record_s = []
# ... and the same steps grouped per episode (episodes may differ in length).
record_sequence_a = []
record_sequence_s = []

# generate M random trajectories of length T (plus a second, independent set)
trajectories = _random_trajectories(M, T, ndim)
validation_trajectories = _random_trajectories(M, T, ndim)

num_episodes = 15  # explore each trajectory a few times
for _ in range(num_episodes):

    # for each trajectory
    for m in range(trajectories.shape[0]):

        # reset agent's state (x, v and y are views into s)
        s[:] = 0

        tmp_sequence_s = []
        tmp_sequence_a = []

        # for up to maximum time steps T
        for t in range(T):

            # observe the next target(s); near the end of the trajectory fewer
            # than num_observables targets remain, so only the leading rows of
            # y are overwritten
            y[:min(num_observables, T - t)] = trajectories[m, t : min(t + num_observables, T)]

            # get action
            a = pi_explore(s)

            # Candidate next state. The position update uses the *new*
            # velocity, matching the update order used in the
            # "Compare real and learned environment" cell.
            v_ = velocity_next(v, a)
            x_ = position_next(x, v_)

            # reward of the candidate position, so that it belongs to the
            # state that is recorded below
            r = get_reward(x_, y[0])

            # Cancel the episode if the agent gets too far away from its
            # target. This must leave the loop: retrying the same step with
            # ``continue`` never advances ``t`` and can spin forever.
            if np.linalg.norm(x_ - y[0]) > max_agent_distance:
                break

            # commit the new state
            v[:] = v_
            x[:] = x_

            # store observation
            record_s.append(s.copy())
            record_a.append(a.copy())
            record_r.append(r)
            tmp_sequence_s.append(s.copy())
            tmp_sequence_a.append(a.copy())

        record_sequence_s.append(np.array(tmp_sequence_s))
        record_sequence_a.append(np.array(tmp_sequence_a))

record_s = np.array(record_s)
record_a = np.array(record_a)
record_r = np.array(record_r)
# record_sequence_s / record_sequence_a stay lists of arrays: episodes that
# were cancelled early are shorter than T, so they do not stack into one array.

In [None]:
# Shape of the action sequence recorded for the first explored episode.
np.shape(record_sequence_a[0])

Normalize Records

In [None]:
# Work on a copy so record_s keeps the raw values.
# record_s stacks one state array per recorded step, so its axes are
# (sample, state row, dimension) with state rows [position, velocity, targets...].
# The views below therefore select along axis 1 (the state row), not axis 0
# (the sample).
s_norm = record_s.copy()
x_norm = s_norm[:, 0, :]                            # position of every sample
v_norm = s_norm[:, 1, :]                            # velocity of every sample
y_norm = s_norm[:, 2:, :]                           # target(s) of every sample

# normalize state and make the targets relative to the agent's position
# (the in-place writes go through the views into s_norm):
v_norm[:] = v_norm / v_max                          # scaled by the maximum velocity
# x_norm gets a length-1 axis so it broadcasts over the target rows
y_norm[:] = (y_norm - x_norm[:, np.newaxis, :]) / max_agent_distance
x_norm[:] = 0                                       # agent is always at the center

# normalize action:
a_norm = record_a / a_max                           # scaled by the maximum acceleration

# normalize reward:
r_norm = record_r / -max_agent_distance             # scaled by the negative maximum distance


Train Reward Model

In [None]:
# Build one flat input row per sample: the state rows with the action
# appended as an extra row, then flattened.
num_samples = s_norm.shape[0]
action_rows = a_norm[:, np.newaxis]
state_action = np.concatenate([s_norm, action_rows], axis=1).reshape(num_samples, -1)

# Fit the reward model on (state, action) -> normalized reward.
f_r.fit(state_action, r_norm, batch_size=batch_size, epochs=epochs)

Train State Model

In [None]:
# Train the recurrent state model: a sequence of actions goes in, the
# sequence of states comes out. The GRU needs 3-D input
# [batch, timesteps, feature], so the flat normalized records are cut back
# into per-episode sequences first.

# Each episode appended len(record_sequence_a[i]) consecutive rows to the flat
# record arrays, which gives the episode boundaries inside a_norm / s_norm.
episode_lengths = np.array([len(seq) for seq in record_sequence_a], dtype=int)
if episode_lengths.sum() != a_norm.shape[0]:
    raise ValueError("episode sequences and flat records are out of sync; re-run the exploration cell")
episode_starts = np.cumsum(episode_lengths) - episode_lengths

has_steps = episode_lengths > 0
if not has_steps.any():
    raise ValueError("no recorded episode steps to train on")

# All sequences in one batch array must have the same length: use the first
# `timesequence` steps of every episode, where `timesequence` is the shortest
# recorded episode (at most T). Later steps of longer episodes are not used.
timesequence = int(min(T, episode_lengths[has_steps].min()))
input_dim = a_norm.shape[-1]

# inputs:  (episodes, timesequence, input_dim)
state_action2 = np.stack(
    [a_norm[start:start + timesequence] for start in episode_starts[has_steps]]
)
# targets: (episodes, timesequence, flattened state)
state_targets = np.stack(
    [s_norm[start:start + timesequence].reshape(timesequence, -1) for start in episode_starts[has_steps]]
)
assert state_action2.shape[1:] == (timesequence, input_dim)

f_s_rnn.fit(state_action2, state_targets, batch_size, epochs)

Compare real and learned environment

In [None]:
# Roll the real environment forward on every trajectory with the random policy.
# NOTE(review): nothing is recorded and no learned model is queried in this
# loop yet, so it does not produce a comparison so far.
for m, trajectory in enumerate(trajectories):

    # start every rollout from the all-zero state
    s.fill(0)

    # at most T steps per rollout
    for t in range(T):

        # observe: copy the upcoming targets into y; close to the end of the
        # trajectory the window is shorter and only the leading rows are set
        upcoming = trajectory[t : t + num_observables]
        y[:len(upcoming)] = upcoming

        # pick an action
        a = pi_explore(s)

        # advance the state: velocity first, then position from the new velocity
        v[:] = velocity_next(v, a)
        x[:] = position_next(x, v)

        # reward of the new position with respect to the current target
        r = get_reward(x, y[0])

        # stop this rollout once the agent is too far away from its target
        distance_to_target = np.linalg.norm(x - y[0])
        if distance_to_target > max_agent_distance:
            break
