In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import tensorboard


In [2]:
ndim            = 2         # number of dimensions
T               = 500       # maximum time steps := length of each trajectory
M               = 250       # number of generated trajectories from observations of pi_explore
a_max           = 100       # maximum acceleration
d_max           = 3 * a_max # max distance between trajectory points
v_max           = 5 * a_max # maximum velocity
a_min           = -a_max    # minimum acceleration
v_min           = -v_max    # minimum velocity
d_min           = 1         # min distance between trajectory points
num_observables = 3         # observable targets


In [3]:
a_v = 2 # constant
a_x = 2 # constant
velocity_next   = lambda velocity, acceleration : velocity + a_v * acceleration
position_next   = lambda position, velocity     : position + a_x * velocity
get_reward      = lambda position, target       : -np.dot((position - target), np.transpose(position - target))


In [4]:
a = np.zeros(shape=ndim)        # current acceleration


In [5]:
s = np.zeros(shape=(2+num_observables, ndim))   # current state
x = s[0,:]                      # position
v = s[1,:]                      # velocity
y = s[2:,:]                     # target(s)


In [6]:
pi_explore = lambda state : np.random.randint(low=a_min, high=a_max, size=ndim, dtype='l')


In [7]:
pi = keras.Sequential([
    keras.layers.Dense(32, input_shape=s.shape),
    keras.layers.Activation('relu'),
    keras.layers.Dense(a.size)#,
    # keras.layers.Activation('tanh')
])
pi.compile(optimizer='adam', loss='mse')



In [44]:
f_s = keras.Sequential([
    keras.layers.Dense(32, input_shape=np.vstack([s, a]).reshape(-1).shape),
    keras.layers.Activation('relu'),
    keras.layers.Dense(s.size)#,
    # keras.layers.Activation('tanh')
])
f_s.compile(optimizer='adam', loss='mse')


In [37]:
f_r = keras.Sequential([
    keras.layers.Dense(32, input_shape=np.vstack([s, a]).reshape(-1).shape),
    keras.layers.Activation('relu'),
    keras.layers.Dense(1)#,
    # keras.layers.Activation('tanh')
])
f_r.compile(optimizer='adam', loss='mse')


In [10]:
trajectories = np.zeros(shape=(M, T, ndim), dtype=np.float)
for m in range(M):
    for t in range(1, T):
        trajectories[m, t] = trajectories[m, t-1] + np.random.randint(low=d_min, high=d_max, size=ndim, dtype='l')



In [11]:
num_episodes = 1
record_r = []
record_a = []
record_s = []
for _ in range(num_episodes):

    # for each trajectory (or perhaps create mini batches # => no minibatches, because trajectories may end at different times?)
    for m in range(M):

        # reset:
        x[:] = np.zeros_like(x)
        v[:] = np.zeros_like(v)

        # for each step:
        for t in range(T):

            # get state
            tmp = min(num_observables, T - t)
            y[:tmp] = trajectories[m, t : min(t + num_observables, T)]

            # get action
            a = pi_explore(s)

            # perform update
            v[:] = velocity_next(v, a)
            x[:] = position_next(x, v)

            # get reward
            r = get_reward(x, y[0])
            record_s.append(s)
            record_a.append(a)
            record_r.append(r)

record_s = np.array(record_s)
record_a = np.array(record_a)
record_r = np.array(record_r)


In [12]:
s_norm = record_s.copy()    # x, v, y: agent's position is always 0/0, target is normed to max_distance
x_norm = s_norm[0,:]                      # position
v_norm = s_norm[1,:]                      # velocity
y_norm = s_norm[2:,:]                     # target(s)

v_norm[:] = v_norm / v_max
y_norm[:] = (y_norm - x_norm) / v_max
x_norm[:] = np.zeros_like(x_norm)

a_norm = record_a / a_max    # normed to maximum acceleration
r_norm = record_r / v_max    # normed to max_distance or not normed at all?


In [40]:
epochs = 10
batch_size = 32
f_r.fit(np.concatenate([s_norm, np.expand_dims(a_norm, 1)], axis=1).reshape(s_norm.shape[0], -1), r_norm, batch_size, epochs)


Train on 125000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/1

<tensorflow.python.keras.callbacks.History at 0x28ca2a1a8c8>

In [45]:
epochs = 10
batch_size = 32
f_s.fit(np.concatenate([s_norm, np.expand_dims(a_norm, 1)], axis=1).reshape(s_norm.shape[0], -1)[:-1], s_norm.reshape(s_norm.shape[0], -1)[1:], batch_size, epochs)



Train on 124999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/1

<tensorflow.python.keras.callbacks.History at 0x28ca2deb308>

In [47]:
validation_trajectories = np.zeros(shape=(M, T, ndim), dtype=np.float)
for m in range(M):
    for t in range(1, T):
        validation_trajectories[m, t] = validation_trajectories[m, t-1] + np.random.randint(low=d_min, high=d_max, size=ndim, dtype='l')


In [None]:
epochs = 100
batch_size = 32
pi.fit()


In [28]:
np.vstack([s_norm, np.expand_dims(a_norm, 1)])

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 5 and the array at index 1 has size 1

In [30]:
np.expand_dims(a_norm, 1).shape

(125000, 1, 2)

In [34]:
np.concatenate([s_norm, np.expand_dims(a_norm, 1)], axis=1).shape

(125000, 6, 2)

In [41]:
s.size



10