1) Import Dependencies

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import tensorboard


2) Configuration

In [3]:
# --- problem size ---
ndim = 2    # dimensionality of positions, velocities and accelerations
T = 500     # maximum time steps, which is also the length of each trajectory
M = 250     # number of generated trajectories using pi_explore

# --- observation ---
num_observables = 3     # how many targets are observable from the current time step

# --- value ranges (all derived from the acceleration limit) ---
a_min, a_max = -100, 100                # minimum / maximum acceleration
v_min, v_max = 5 * a_min, 5 * a_max     # minimum / maximum velocity
d_min, d_max = 1, 3 * a_max             # min / max distance between trajectory points

3) Environment Dynamics

$\begin{aligned}
v_{t+1} &= v_t + a_va_t\\
x_{t+1} &= x_t + a_xv_t\\
r_t &= -(x_t-\mathbf{x^*})^T(x_t-\mathbf{x^*})
\end{aligned}$

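with constants $a_v$ and $a_x$. Note that the implementation below updates the velocity first and then advances the position with the updated velocity $v_{t+1}$; the reward is evaluated on the updated position.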

In [3]:
# Constant step weights of the dynamics.
a_v = 2  # scales the acceleration in the velocity update
a_x = 2  # scales the velocity in the position update


def velocity_next(velocity, acceleration):
    """Return the velocity after one step: ``velocity + a_v * acceleration``."""
    return velocity + a_v * acceleration


def position_next(position, velocity):
    """Return the position after one step: ``position + a_x * velocity``."""
    return position + a_x * velocity


def get_reward(position, target):
    """Return the negated dot product of the position error with its transpose.

    For 1-D vectors ``np.transpose`` is a no-op, so the result is the negative
    squared Euclidean distance between ``position`` and ``target``.
    """
    error = position - target
    return -np.dot(error, np.transpose(error))


4) Action Space

Acceleration value for each dimension:

$a_t \in \mathbb{R}^D$

In [4]:
# Current acceleration (the action): one zero-initialised component per dimension.
a = np.zeros((ndim,))


5) State Space

$s_t$ ...state consists of:

- $x_t \in \mathbb{R}^D$ ...position vector
- $v_t \in \mathbb{R}^D$ ...velocity vector
- $\mathbf{{x_t}^*} \in \mathbb{R}^D$ ...target vector(s)

$s_t = (x_t, v_t, \mathbf{{x_t}^*}, ..., \mathbf{{x_{t+num\_observables-1}}^*})$

In [5]:
# Current state: row 0 holds the position, row 1 the velocity and the remaining
# num_observables rows hold the next observable target(s).
s = np.zeros((2 + num_observables, ndim))
# x, v and y are views into s, so in-place writes such as ``x[:] = ...`` update s as well.
x, v, y = s[0], s[1], s[2:]


In [6]:
# Training trajectories: each one starts at the origin and advances per time step by a
# random integer offset drawn from [d_min, d_max) in every dimension.
# The builtin ``float`` replaces the alias ``np.float``, which was removed in NumPy 1.24.
trajectories = np.zeros(shape=(M, T, ndim), dtype=float)
# Draw all T - 1 steps of all trajectories at once; the running sum along the time axis
# yields the same random walk as adding one step at a time in a Python loop.
trajectories[:, 1:] = np.cumsum(
    np.random.randint(low=d_min, high=d_max, size=(M, T - 1, ndim)),
    axis=1,
)


In [13]:
# Validation trajectories: generated like the training trajectories, i.e. each one starts
# at the origin and advances per time step by a random integer offset from [d_min, d_max).
# The builtin ``float`` replaces the alias ``np.float``, which was removed in NumPy 1.24.
validation_trajectories = np.zeros(shape=(M, T, ndim), dtype=float)
# Draw all T - 1 steps of all trajectories at once; the running sum along the time axis
# yields the same random walk as adding one step at a time in a Python loop.
validation_trajectories[:, 1:] = np.cumsum(
    np.random.randint(low=d_min, high=d_max, size=(M, T - 1, ndim)),
    axis=1,
)


In [7]:
def pi_explore(state):
    """Exploration policy: return a random integer acceleration for every dimension.

    ``state`` is ignored. Each of the ``ndim`` components is drawn from
    ``[a_min, a_max)`` (the upper bound is exclusive).
    """
    return np.random.randint(a_min, a_max, size=ndim)


In [8]:
# Roll out the exploration policy on every trajectory and record one
# (state, action, reward) sample per time step.
record_r = []
record_a = []
record_s = []

num_episodes = 1 # observe each trajectory only once
for _ in range(num_episodes):

    # Trajectories are processed one at a time (no mini-batches).
    for m in range(M):

        # reset the agent's state; x, v and y are views into s and are reset with it
        s[:] = 0

        for t in range(T):

            # Observe the upcoming targets. Slicing clamps at the end of the trajectory,
            # so fewer than num_observables targets may be returned there; the remaining
            # rows of y then keep the values from the previous time step.
            upcoming = trajectories[m, t : t + num_observables]
            y[:len(upcoming)] = upcoming

            # choose the action for the current state
            a = pi_explore(s)

            # Snapshot the state the action was chosen in BEFORE the update overwrites it,
            # so that (record_s[i], record_a[i]) is a proper (s_t, a_t) pair and
            # record_s[i + 1] is the state that this action leads to.
            s_before = s.copy()

            # perform action (updates s in place through the views v and x)
            v[:] = velocity_next(v, a)
            x[:] = position_next(x, v)

            # reward of the updated position with respect to the current target
            r = get_reward(x, y[0])

            # store observation
            record_s.append(s_before)
            record_a.append(a.copy())
            record_r.append(r)

record_s = np.array(record_s)
record_a = np.array(record_a)
record_r = np.array(record_r)


In [9]:
# clone state:
# Clone the recorded states so that normalising does not modify record_s.
# Shape: (num_samples, 2 + num_observables, ndim).
s_norm = record_s.copy()
# Views into s_norm. Axis 0 enumerates the samples, so position / velocity / targets
# have to be selected on axis 1 (indexing axis 0 would pick whole samples instead).
x_norm = s_norm[:, 0, :]                  # position
v_norm = s_norm[:, 1, :]                  # velocity
y_norm = s_norm[:, 2:, :]                 # target(s)

# normalize state:
v_norm[:] = v_norm / v_max
# targets become relative to the agent's position (broadcast over the target rows)
y_norm[:] = (y_norm - x_norm[:, np.newaxis, :]) / v_max
# the agent's position is always 0/0; zero it last because the line above still needs it
x_norm[:] = 0

# normalize action:
a_norm = record_a / a_max    # normed to maximum acceleration

# normalize reward?:
# Open question kept from the original: the reward is a negative squared distance,
# while v_max is a velocity bound - normed to max_distance or not normed at all?
r_norm = record_r / v_max


In [10]:
# Policy network: flattened state -> action.
# Two hidden layers of 32 units with ReLU, followed by a linear output layer.
pi = keras.Sequential()
pi.add(keras.layers.Dense(32, input_shape=(s.size,)))
pi.add(keras.layers.Activation('relu'))
pi.add(keras.layers.Dense(32))
pi.add(keras.layers.Activation('relu'))
pi.add(keras.layers.Dense(a.size))  # one output per action component, no output activation
pi.compile(optimizer='adam', loss='mse')



In [11]:
# State model: flattened (state, action) -> flattened state.
# The input has one feature per entry of s plus one per entry of a.
f_s_layers = [
    keras.layers.Dense(32, input_shape=(s.size + a.size,)),
    keras.layers.Activation('relu'),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dense(s.size),  # linear output, one value per state entry
]
f_s = keras.Sequential(f_s_layers)
f_s.compile(optimizer='adam', loss='mse')


In [12]:
# Reward model: flattened (state, action) -> scalar reward.
# The input has one feature per entry of s plus one per entry of a.
f_r = keras.Sequential()
for layer in (
    keras.layers.Dense(32, input_shape=(s.size + a.size,)),
    keras.layers.Activation('relu'),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dense(1),  # single linear output: the reward
):
    f_r.add(layer)
f_r.compile(optimizer='adam', loss='mse')


In [34]:
epochs = 250
batch_size = 512
# Append each action as an extra row to its state and flatten every sample into one
# feature vector; these are the inputs of the reward model.
state_with_action = np.concatenate((s_norm, a_norm[:, np.newaxis]), axis=1)
state_action = state_with_action.reshape(len(s_norm), -1)
f_r.fit(state_action, r_norm, batch_size=batch_size, epochs=epochs)


In [35]:
epochs = 10
batch_size = 32
# Flattened (state, action) inputs and flattened states for every recorded step.
inputs_all = np.concatenate([s_norm, np.expand_dims(a_norm, 1)], axis=1).reshape(s_norm.shape[0], -1)
states_all = s_norm.reshape(s_norm.shape[0], -1)
# Step i is paired with step i + 1 as its successor. Every rollout contributes exactly T
# consecutive records, so a pair whose successor index is a multiple of T would link the
# last step of one rollout to the first step of the next one; such pairs are dropped.
same_rollout = np.arange(1, s_norm.shape[0]) % T != 0
state_action2 = inputs_all[:-1][same_rollout]
f_s.fit(state_action2, states_all[1:][same_rollout], batch_size, epochs)


In [None]:
# Training settings intended for the policy network.
epochs = 100
batch_size = 32
# TODO(review): unfinished cell - fit() is called without inputs or targets, and the
# epochs / batch_size defined above are not passed to it.
pi.fit()


In [28]:
# Scratch check: np.vstack joins along axis 0, which requires all other axes to match;
# s_norm and the expanded a_norm differ on axis 1, hence the ValueError shown below.
np.vstack([s_norm, np.expand_dims(a_norm, 1)])

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 5 and the array at index 1 has size 1

In [30]:
# Scratch check: shape of the actions after inserting a singleton axis at position 1.
a_norm[:, np.newaxis].shape

(125000, 1, 2)

In [34]:
# Scratch check: shape after appending each action as an extra row to its state.
np.concatenate((s_norm, a_norm[:, np.newaxis]), axis=1).shape

(125000, 6, 2)

In [41]:
# Scratch check: total number of entries in the state array.
np.size(s)



10