In [3]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import tensorboard


In [4]:
# --- problem size ---------------------------------------------------------
ndim = 2                 # number of spatial dimensions
T = 500                  # time steps per trajectory (= trajectory length)
M = 250                  # number of trajectories generated for pi_explore rollouts
num_observables = 3      # how many target points are visible in the state

# --- acceleration / velocity limits (symmetric around zero) ----------------
a_max = 100              # maximum acceleration
a_min = -a_max           # minimum acceleration
v_max = 5 * a_max        # maximum velocity
v_min = -v_max           # minimum velocity

# --- spacing of consecutive trajectory points ------------------------------
d_min, d_max = 1, 3 * a_max   # min / max distance between trajectory points


In [5]:
a_v = 2  # constant gain applied to the acceleration in the velocity update
a_x = 2  # constant gain applied to the velocity in the position update


def velocity_next(velocity, acceleration):
    """Return the velocity after one step: ``velocity + a_v * acceleration``."""
    return velocity + a_v * acceleration


def position_next(position, velocity):
    """Return the position after one step: ``position + a_x * velocity``."""
    return position + a_x * velocity


def get_reward(position, target):
    """Return minus the dot product of ``position - target`` with itself.

    For 1-D vectors this is the negative squared Euclidean distance between
    ``position`` and ``target``; it is 0 when they coincide and negative otherwise.
    """
    offset = position - target
    return -np.dot(offset, np.transpose(offset))


In [6]:
# Current acceleration (the action): one component per dimension, initially zero.
a = np.zeros((ndim,))


In [7]:
# Current state, one row per quantity and ndim columns:
#   row 0    -> position
#   row 1    -> velocity
#   rows 2.. -> the num_observables target point(s)
s = np.zeros((num_observables + 2, ndim))

# Basic slicing yields views, so in-place writes to x, v or y also update s.
x, v, y = s[0], s[1], s[2:]


In [8]:
def pi_explore(state):
    """Exploration policy: draw a uniformly random integer acceleration.

    ``state`` is accepted so the call signature matches a state-dependent
    policy, but it is not used. Each of the ``ndim`` components is sampled
    independently from ``[a_min, a_max)``; ``np.random.randint`` excludes the
    upper bound, so ``a_max`` itself is never returned.
    """
    return np.random.randint(low=a_min, high=a_max, size=ndim, dtype='l')


In [9]:
# Policy network pi: state -> action.
# The state is a 2-D array (rows x ndim). A Dense layer only acts on the last axis
# of its input, so without flattening the network would emit one output vector per
# state row instead of a single action. Flatten first so the whole state maps to
# one vector with a.size components.
# The tanh output lies in (-1, 1).
pi = keras.Sequential([
    keras.layers.Flatten(input_shape=s.shape),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dense(a.size),
    keras.layers.Activation('tanh')
])


In [21]:
# State model f_s: (state, action) -> vector with s.size components.
# The input is the state with the action stacked underneath as one extra row.
# Dense layers only act on the last axis, so the stacked input is flattened first;
# otherwise the model would produce one s.size-vector per input row.
# NOTE(review): tanh bounds every output to (-1, 1); the regression targets would
# have to be scaled into that range - confirm the intended normalisation.
f_s = keras.Sequential([
    keras.layers.Flatten(input_shape=np.vstack([s, a]).shape),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dense(s.size),
    keras.layers.Activation('tanh')
])
f_s.compile(optimizer='adam', loss='mse')


In [24]:
# Reward model f_r: (state, action) -> a single scalar per sample.
# The input is the state with the action stacked underneath as one extra row.
# Dense layers only act on the last axis, so the stacked input is flattened first;
# otherwise the model would produce one scalar per input row.
# NOTE(review): tanh bounds the prediction to (-1, 1), whereas get_reward returns
# values that are <= 0 and unbounded below; the targets would have to be scaled
# into that range - confirm the intended normalisation.
f_r = keras.Sequential([
    keras.layers.Flatten(input_shape=np.vstack([s, a]).shape),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dense(1),
    keras.layers.Activation('tanh')
])
f_r.compile(optimizer='adam', loss='mse')


In [23]:
# Random reference trajectories, shape (M, T, ndim). Each trajectory starts at the
# origin and, at every later time step, advances along each dimension by a random
# integer drawn from [d_min, d_max) (the upper bound is exclusive).
# The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
trajectories = np.zeros(shape=(M, T, ndim), dtype=float)

# Draw all increments in one call and accumulate them along the time axis instead
# of issuing one randint call per (trajectory, step) pair.
steps = np.random.randint(low=d_min, high=d_max, size=(M, max(T - 1, 0), ndim), dtype='l')
trajectories[:, 1:] = np.cumsum(steps, axis=1)



In [18]:
# Roll out the exploration policy along every trajectory and record, for each step,
# the state after the update, the action taken and the reward received.
num_episodes = 1
record_r = []
record_a = []
record_s = []
for _ in range(num_episodes):

    # One trajectory at a time (no mini-batches, because trajectories may end at
    # different times).
    for m in range(M):

        # Reset position and velocity in place so that x and v stay views into s.
        x[:] = 0
        v[:] = 0

        for t in range(T):

            # Expose the next (up to) num_observables trajectory points as targets.
            # Near the end of the trajectory fewer points remain; rows of y beyond
            # tmp are not rewritten and keep the value from an earlier step.
            tmp = min(num_observables, T - t)
            y[:tmp] = trajectories[m, t : t + tmp]

            # get action
            a = pi_explore(s)

            # perform update (writes through the views into s)
            v[:] = velocity_next(v, a)
            x[:] = position_next(x, v)

            # Reward with respect to the first visible target.
            r = get_reward(x, y[0])

            # s is mutated in place on every step, so store a snapshot. Appending
            # s itself would leave every recorded entry pointing at the same array.
            record_s.append(s.copy())
            record_a.append(a)
            record_r.append(r)

record_s = np.array(record_s)
record_a = np.array(record_a)
record_r = np.array(record_r)

In [None]:
# Placeholder for normalised states: no scaling is applied yet, so this name is
# just another reference to record_s (same object, not a copy).
record_s_norm = record_s

In [None]:

epochs = 10
batch_size = 32

# Train on the recorded dataset, not on the last-step s, a and r.
# Each input is a recorded state with its action appended as one extra row,
# mirroring the np.vstack([s, a]) layout that f_r's input_shape was built from.
f_r_inputs = np.concatenate([record_s, record_a[:, np.newaxis, :]], axis=1)

# One scalar reward per sample. Add one singleton axis per non-batch output axis
# of f_r so the targets line up with (or broadcast against) the predictions.
f_r_targets = record_r.reshape((-1,) + (1,) * (len(f_r.output_shape) - 1))

f_r.fit(f_r_inputs, f_r_targets, batch_size=batch_size, epochs=epochs)

In [None]:
# pi = None
