In [399]:
import tensorflow as tf
import numpy as np
import math

In [400]:
class simple_env():
    def __init__(self, step_size=13,targets=[166,-88],max_steps=100):
        self.pos=np.random.uniform(-10,10,len(targets))
        self.max_steps=max_steps
        self.step_size=step_size
        self.targets=targets
        self.max_num=1000
        self.min_num=-1000
        self.done=False
        self.def_size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.counter_stop=0
        self.obsevation_space=len(targets)
        self.action_space=len(targets)
        
    def reset(self):
        self.pos=np.random.uniform(-10,10,len(self.targets))
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(len(self.targets))]
        self.counter_stop=0
        return self.pos

    def step(self,action):
        self.counter_stop+=1
        if self.counter_stop>=self.max_steps:
            self.done=True
        self.last_pos=np.copy(self.pos) 
        self.last_size_pos=np.copy(self.size_pos)
        self.reward=np.zeros(self.pos.shape)
        for i in range(len(action)):
            self.pos[i]=self.pos[i]+(action[i]*self.step_size)
            self.size_pos[i]=abs(self.targets[i]-self.pos[i])
            difference= self.last_size_pos[i]-self.size_pos[i]
            self.reward[i]=difference/self.def_size_pos[i]
        return self.pos, self.reward, self.done

In [401]:
def get_actor(obs_space, action_space):
    last_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
    inputs = tf.keras.layers.Input(shape=(obs_space,))
    out = tf.keras.layers.Dense(4, activation="relu")(inputs)
    outputs = tf.keras.layers.Dense(action_space, activation="tanh", kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [402]:
def policy(obs, model, lower_bound=-1, upper_bound=1):
    sampled_actions=model(obs)
    sampled_actions = sampled_actions.numpy()
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    return legal_action[0]

In [403]:
def play_one_step(env, obs, model, loss_fn):
    obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
    with tf.GradientTape() as tape:        
        obs=tf.expand_dims(tf.convert_to_tensor([12,-15]), 0)
        action = policy(obs,model)
        next_obs, reward, done = env.step(action)
        logits = model(obs)
        y_target=tf.reduce_mean(tf.Variable(reward))
        loss= tf.reduce_mean(loss_fn(y_target,logits))
    grads = tape.gradient(loss, model.trainable_variables)
    return next_obs, reward, done, grads, action

In [404]:
def play_episodes(env, num_steps, model, loss_fn):
    all_rewarsd=[]
    all_grads=[]
    all_obs=[]
    all_action=[]
    obs = env.reset()
    for step in range(num_steps):
        obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
        all_rewarsd.append(np.copy(reward))
        all_grads.append(np.copy(grads))
        all_obs.append(np.copy(obs))
        all_action.append(np.copy(action))
    return all_rewarsd, all_grads, all_obs, all_action

In [405]:
def discount_rewards(rewards, discount):
    for i in range(len(rewards)):
        rewards[i]=np.mean(rewards[i])*pow(discount,i)
    return rewards

In [406]:
n_iteration_learn = 10
n_max_steps = 3
discount = 0.95
learning_rate = 0.01

In [407]:
env=simple_env()
first_pos=env.reset()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()

In [408]:
model.summary()

Model: "model_121"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_122 (InputLayer)      [(None, 2)]               0         
                                                                 
 dense_242 (Dense)           (None, 4)                 12        
                                                                 
 dense_243 (Dense)           (None, 2)                 10        
                                                                 
Total params: 22
Trainable params: 22
Non-trainable params: 0
_________________________________________________________________


In [409]:
for var in model.trainable_variables:
  print(var, "\n")

<tf.Variable 'dense_242/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[ 0.16038346, -0.9170389 ,  0.6178055 , -0.67084146],
       [ 0.5983014 , -0.20601225, -0.77527   ,  0.92489314]],
      dtype=float32)> 

<tf.Variable 'dense_242/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)> 

<tf.Variable 'dense_243/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[-0.01579966, -0.03933489],
       [-0.01232827,  0.04514391],
       [-0.00623178,  0.08163605],
       [ 0.03753508,  0.09060925]], dtype=float32)> 

<tf.Variable 'dense_243/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)> 



In [410]:
print(model.layers[0].weights)
print(model.layers[1].weights)
print(model.layers[2].weights)

[]
[<tf.Variable 'dense_242/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[ 0.16038346, -0.9170389 ,  0.6178055 , -0.67084146],
       [ 0.5983014 , -0.20601225, -0.77527   ,  0.92489314]],
      dtype=float32)>, <tf.Variable 'dense_242/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]
[<tf.Variable 'dense_243/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[-0.01579966, -0.03933489],
       [-0.01232827,  0.04514391],
       [-0.00623178,  0.08163605],
       [ 0.03753508,  0.09060925]], dtype=float32)>, <tf.Variable 'dense_243/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]


In [411]:
env=simple_env()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()
obs = env.reset()
# obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
with tf.GradientTape() as tape:        
    action = policy(obs,model)
    next_obs, reward, done = env.step(action)
    # print(obs)
    # print(next_obs)   
    # print(action)
    print(reward)
    logits = model(obs)
    y_target=tf.reduce_mean(tf.Variable(reward))
    loss= tf.reduce_mean(loss_fn(y_target,logits))
grads = tape.gradient(loss, model.trainable_variables)
# print(grads)
for i in range(len(model.trainable_variables)):
    rd=np.mean(reward)
    grads[i]=tf.math.multiply(grads[i],rd)

optimizer.apply_gradients(zip(grads, model.trainable_variables))
print("*********************************************")
print(grads)
print("*********************************************")
for var in model.trainable_variables:
  print(var, "\n")

[0.00375837 0.05316206]
*********************************************
[<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[-0.00054769, -0.00082595,  0.        ,  0.        ],
       [ 0.00047007,  0.00070889, -0.        , -0.        ]],
      dtype=float32)>, <tf.Tensor: shape=(4,), dtype=float32, numpy=
array([ 9.9502504e-05,  1.5005590e-04, -0.0000000e+00, -0.0000000e+00],
      dtype=float32)>, <tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-1.0181728e-04, -2.9923315e-03],
       [-9.3562187e-05, -2.7497204e-03],
       [-0.0000000e+00, -0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.00010286, -0.00302301], dtype=float32)>]
*********************************************
<tf.Variable 'dense_244/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[ 0.47558242,  0.58713174,  0.87569547, -0.8393936 ],
       [ 0.74212915,  0.8550659 , -0.08607602, -0.9991238 ]],
      dtype=float32)> 

<tf.Va

In [412]:
buffer=[]
for iteration in range(n_iteration_learn):
    all_rewards, all_grads, all_obs, all_action = play_episodes(env,n_max_steps,model,loss_fn)
    all_final_rewards = discount_rewards(all_rewards,discount)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        for step, final_reward in enumerate(all_final_rewards):
            var = []
            for i in range(len(model.trainable_variables)):
                all_grads[step][i]=tf.math.multiply(all_grads[step][i],final_reward)
            var.append(all_grads[step][var_index])
        all_mean_grads.append(tf.reduce_mean(var,axis=0))
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    episode_reward=tf.reduce_sum(all_final_rewards)
    buffer.append([all_final_rewards,all_obs,all_action])
    print("Episode * {} * Reward is ==> {}".format(iteration, episode_reward))

Episode * 0 * Reward is ==> -0.12075142097020947
Episode * 1 * Reward is ==> -0.12312596904843134
Episode * 2 * Reward is ==> -0.12464709266497748
Episode * 3 * Reward is ==> -0.1253388210654547
Episode * 4 * Reward is ==> -0.12551554452853386
Episode * 5 * Reward is ==> -0.12546609742130424
Episode * 6 * Reward is ==> -0.12536486314890688
Episode * 7 * Reward is ==> -0.12528376702188398
Episode * 8 * Reward is ==> -0.1252372348654574
Episode * 9 * Reward is ==> -0.125218069090006
