In [413]:
import tensorflow as tf
import numpy as np
import math

In [414]:
class simple_env():
    def __init__(self, step_size=13,targets=[166,-88],max_steps=100):
        self.pos=np.random.uniform(-10,10,len(targets))
        self.max_steps=max_steps
        self.step_size=step_size
        self.targets=targets
        self.max_num=1000
        self.min_num=-1000
        self.done=False
        self.def_size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.counter_stop=0
        self.obsevation_space=len(targets)
        self.action_space=len(targets)
        
    def reset(self):
        self.pos=np.random.uniform(-10,10,len(self.targets))
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(len(self.targets))]
        self.counter_stop=0
        return self.pos

    def step(self,action):
        self.counter_stop+=1
        if self.counter_stop>=self.max_steps:
            self.done=True
        self.last_pos=np.copy(self.pos) 
        self.last_size_pos=np.copy(self.size_pos)
        self.reward=np.zeros(self.pos.shape)
        for i in range(len(action)):
            self.pos[i]=self.pos[i]+(action[i]*self.step_size)
            self.size_pos[i]=abs(self.targets[i]-self.pos[i])
            difference= self.last_size_pos[i]-self.size_pos[i]
            self.reward[i]=difference/self.def_size_pos[i]
        return self.pos, self.reward, self.done

In [415]:
def get_actor(obs_space, action_space):
    last_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
    inputs = tf.keras.layers.Input(shape=(obs_space,))
    out = tf.keras.layers.Dense(4, activation="relu")(inputs)
    outputs = tf.keras.layers.Dense(action_space, activation="tanh", kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [416]:
def policy(obs, model, lower_bound=-1, upper_bound=1):
    sampled_actions=model(obs)
    sampled_actions = sampled_actions.numpy()
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    return legal_action[0]

In [417]:
def play_one_step(env, obs, model, loss_fn):
    obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
    with tf.GradientTape() as tape:        
        obs=tf.expand_dims(tf.convert_to_tensor([12,-15]), 0)
        action = policy(obs,model)
        next_obs, reward, done = env.step(action)
        logits = model(obs)
        y_target=tf.reduce_mean(tf.Variable(reward))
        loss= tf.reduce_mean(loss_fn(y_target,logits))
    grads = tape.gradient(loss, model.trainable_variables)
    return next_obs, reward, done, grads, action

In [418]:
def play_episodes(env, num_steps, model, loss_fn):
    all_rewarsd=[]
    all_grads=[]
    all_obs=[]
    all_action=[]
    obs = env.reset()
    for step in range(num_steps):
        obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
        all_rewarsd.append(np.copy(reward))
        all_grads.append(np.copy(grads))
        all_obs.append(np.copy(obs))
        all_action.append(np.copy(action))
    return all_rewarsd, all_grads, all_obs, all_action

In [419]:
def discount_rewards(rewards, discount):
    for i in range(len(rewards)):
        rewards[i]=np.mean(rewards[i])*pow(discount,i)
    return rewards

In [420]:
n_iteration_learn = 10
n_max_steps = 3
discount = 0.95
learning_rate = 0.01

In [421]:
env=simple_env()
first_pos=env.reset()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()

In [422]:
model.summary()

Model: "model_123"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_124 (InputLayer)      [(None, 2)]               0         
                                                                 
 dense_246 (Dense)           (None, 4)                 12        
                                                                 
 dense_247 (Dense)           (None, 2)                 10        
                                                                 
Total params: 22
Trainable params: 22
Non-trainable params: 0
_________________________________________________________________


In [423]:
for var in model.trainable_variables:
  print(var, "\n")

<tf.Variable 'dense_246/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.92634916,  0.4802642 , -0.09108782, -0.10725307],
       [-0.5171292 , -0.42497873,  0.9786489 ,  0.75194454]],
      dtype=float32)> 

<tf.Variable 'dense_246/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)> 

<tf.Variable 'dense_247/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[-0.03640034, -0.08363064],
       [-0.00559258,  0.0086256 ],
       [-0.00283928, -0.06190112],
       [-0.09818655,  0.0180835 ]], dtype=float32)> 

<tf.Variable 'dense_247/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)> 



In [424]:
print(model.layers[0].weights)
print(model.layers[1].weights)
print(model.layers[2].weights)

[]
[<tf.Variable 'dense_246/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.92634916,  0.4802642 , -0.09108782, -0.10725307],
       [-0.5171292 , -0.42497873,  0.9786489 ,  0.75194454]],
      dtype=float32)>, <tf.Variable 'dense_246/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]
[<tf.Variable 'dense_247/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[-0.03640034, -0.08363064],
       [-0.00559258,  0.0086256 ],
       [-0.00283928, -0.06190112],
       [-0.09818655,  0.0180835 ]], dtype=float32)>, <tf.Variable 'dense_247/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]


In [425]:
env=simple_env()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()
obs = env.reset()
# obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
with tf.GradientTape() as tape:        
    action = policy(obs,model)
    next_obs, reward, done = env.step(action)
    # print(obs)
    # print(next_obs)   
    # print(action)
    print(reward)
    logits = model(obs)
    y_target=tf.reduce_mean(tf.Variable(reward))
    loss= tf.reduce_mean(loss_fn(y_target,logits))
grads = tape.gradient(loss, model.trainable_variables)
# print(grads)
# for i in range(len(model.trainable_variables)):
#     rd=np.mean(reward)
#     grads[i]=tf.math.multiply(grads[i],rd)

optimizer.apply_gradients(zip(grads, model.trainable_variables))
print("*********************************************")
print(grads)
print("*********************************************")
for var in model.trainable_variables:
  print(var, "\n")

[-0.03866755 -0.02428906]
*********************************************
[<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[-0.09595227,  0.00214505, -0.0445703 , -0.02829772],
       [-0.20836425,  0.00465807, -0.09678622, -0.06144964]],
      dtype=float32)>, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.03973014, -0.00088818,  0.01845485,  0.011717  ], dtype=float32)>, <tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-1.3789753 ,  0.73227215],
       [-1.6629305 ,  0.8830598 ],
       [-1.1900872 ,  0.6319676 ],
       [-0.43286043,  0.22986028]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.35314867,  0.18753123], dtype=float32)>]
*********************************************
<tf.Variable 'dense_248/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[ 0.38746992, -0.2959872 , -0.09320949, -0.19253984],
       [-0.90837944, -0.77615845, -0.58503896, -0.13044623]],
      dtype=float32)> 

<tf.Variable 'dense_248/bias:0' shape=(4,) dtyp

In [426]:
buffer=[]
for iteration in range(n_iteration_learn):
    all_rewards, all_grads, all_obs, all_action = play_episodes(env,n_max_steps,model,loss_fn)
    all_final_rewards = discount_rewards(all_rewards,discount)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        for step, final_reward in enumerate(all_final_rewards):
            var = []
            for i in range(len(model.trainable_variables)):
                all_grads[step][i]=tf.math.multiply(all_grads[step][i],final_reward)
            var.append(all_grads[step][var_index])
        all_mean_grads.append(tf.reduce_mean(var,axis=0))
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    episode_reward=tf.reduce_sum(all_final_rewards)
    buffer.append([all_final_rewards,all_obs,all_action])
    print("Episode * {} * Reward is ==> {}".format(iteration, episode_reward))

Episode * 0 * Reward is ==> -0.2601094804883344
Episode * 1 * Reward is ==> -0.2294534170270457
Episode * 2 * Reward is ==> -0.19903095967354714
Episode * 3 * Reward is ==> -0.1697663796479697
Episode * 4 * Reward is ==> -0.14255458775525834
Episode * 5 * Reward is ==> -0.11782296894119003
Episode * 6 * Reward is ==> -0.09565820072729901
Episode * 7 * Reward is ==> -0.07595914336135792
Episode * 8 * Reward is ==> -0.058527491309627526
Episode * 9 * Reward is ==> -0.04312252607873244
