In [115]:
import tensorflow as tf
import numpy as np
import math

In [116]:
class simple_env():
    def __init__(self, step_size=13,targets=[166,-155],max_steps=100):
        self.pos=np.random.uniform(-10,10,len(targets))
        self.max_steps=max_steps
        self.step_size=step_size
        self.targets=targets
        self.max_num=1000
        self.min_num=-1000
        self.done=False
        self.def_size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.counter_stop=0
        self.obsevation_space=len(targets)
        self.action_space=len(targets)
        
    def reset(self):
        self.pos=np.random.uniform(-10,10,len(self.targets))
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(len(self.targets))]
        self.counter_stop=0
        return self.pos

    def step(self,action):
        self.counter_stop+=1
        if self.counter_stop>=self.max_steps:
            self.done=True
        self.last_pos=np.copy(self.pos) 
        self.last_size_pos=np.copy(self.size_pos)
        self.reward=np.zeros(self.pos.shape)
        for i in range(len(action)):
            self.pos[i]=self.pos[i]+(action[i]*self.step_size)
            self.size_pos[i]=abs(self.targets[i]-self.pos[i])
            difference= self.last_size_pos[i]-self.size_pos[i]
            self.reward[i]=difference/self.def_size_pos[i]
        return self.pos, self.reward, self.done

In [117]:
def get_actor(obs_space, action_space):
    # last_init=kernel_initializer= tf.keras.initializers.glorot_uniform()
    last_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
    inputs = tf.keras.layers.Input(shape=(obs_space,))
    out = tf.keras.layers.Dense(4, activation="relu")(inputs)
    out = tf.keras.layers.Dense(4, activation="relu")(inputs)
    outputs = tf.keras.layers.Dense(action_space, activation="tanh", kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [118]:
def policy(obs, model, lower_bound=-1, upper_bound=1):
    logits=model(obs)
    sampled_actions = logits.numpy()
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    # legal_action = sampled_actions
    return logits, legal_action[0]

In [119]:
def play_one_step(env, obs, model, loss_fn):
    obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
    with tf.GradientTape() as tape:        
        logits, action = policy(obs, model)
        next_obs, reward, done = env.step(action)
        y_target=tf.reduce_mean(tf.Variable(reward))
        loss= tf.reduce_mean(loss_fn(y_target,logits))
    grads = tape.gradient(loss, model.trainable_variables)
    return next_obs, reward, done, grads, action, loss

In [120]:
def play_episodes(env, num_steps, model, loss_fn):
    all_rewarsd=[]
    all_grads=[]
    all_obs=[]
    all_action=[]
    all_obs_prev=[]
    all_losses=[]
    obs = env.reset()
    for step in range(num_steps):
        all_obs_prev.append(np.copy(obs))
        obs, reward, dode, grads, action, loss = play_one_step(env,obs,model,loss_fn)
        all_rewarsd.append(np.copy(reward))
        all_grads.append(np.copy(grads))
        all_obs.append(np.copy(obs))
        all_action.append(np.copy(action))
        all_losses.append(np.copy(loss))
    return all_rewarsd, all_grads, all_obs, all_action,all_obs_prev, all_losses

In [121]:
def discount_rewards(rewards, discount):
    for i in range(len(rewards)):
        rewards[i]=np.mean(rewards[i])*pow(discount,i)
    return rewards

In [122]:
n_iteration_learn = 10
n_max_steps = 10
discount = 0.95
learning_rate = 0.01

In [123]:
env=simple_env()
first_pos=env.reset()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()

In [124]:
model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 2)]               0         
                                                                 
 dense_25 (Dense)            (None, 4)                 12        
                                                                 
 dense_26 (Dense)            (None, 2)                 10        
                                                                 
Total params: 22
Trainable params: 22
Non-trainable params: 0
_________________________________________________________________


In [125]:
for var in model.trainable_variables:
  print(var, "\n")

<tf.Variable 'dense_25/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.5131767 ,  0.06237054, -0.69414616, -0.8847337 ],
       [-0.5437341 ,  0.7508378 , -0.7935047 ,  0.24884486]],
      dtype=float32)> 

<tf.Variable 'dense_25/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)> 

<tf.Variable 'dense_26/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[ 0.09577108,  0.06470997],
       [-0.05173025,  0.06546802],
       [ 0.04180687, -0.04479575],
       [ 0.03161781, -0.05337641]], dtype=float32)> 

<tf.Variable 'dense_26/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)> 



In [126]:
print(model.layers[0].weights)
print(model.layers[1].weights)
print(model.layers[2].weights)

[]
[<tf.Variable 'dense_25/kernel:0' shape=(2, 4) dtype=float32, numpy=
array([[-0.5131767 ,  0.06237054, -0.69414616, -0.8847337 ],
       [-0.5437341 ,  0.7508378 , -0.7935047 ,  0.24884486]],
      dtype=float32)>, <tf.Variable 'dense_25/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]
[<tf.Variable 'dense_26/kernel:0' shape=(4, 2) dtype=float32, numpy=
array([[ 0.09577108,  0.06470997],
       [-0.05173025,  0.06546802],
       [ 0.04180687, -0.04479575],
       [ 0.03161781, -0.05337641]], dtype=float32)>, <tf.Variable 'dense_26/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]


In [127]:
# env=simple_env()
# model = get_actor(env.obsevation_space,env.action_space)
# optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# loss_fn = tf.keras.losses.MeanSquaredError()
# obs = env.reset()
# # obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
# obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
# with tf.GradientTape() as tape:        
#     action = policy(obs,model)
#     next_obs, reward, done = env.step(action)
#     # print(obs)
#     # print(next_obs)   
#     # print(action)
#     print(reward)
#     logits = model(obs)
#     y_target=tf.reduce_mean(tf.Variable(reward))
#     loss= tf.reduce_mean(loss_fn(y_target,logits))
# grads = tape.gradient(loss, model.trainable_variables)
# # print(grads)
# # for i in range(len(model.trainable_variables)):
# #     rd=np.mean(reward)
# #     grads[i]=tf.math.multiply(grads[i],rd)

# optimizer.apply_gradients(zip(grads, model.trainable_variables))
# print("*********************************************")
# print(grads)
# print("*********************************************")
# for var in model.trainable_variables:
#   print(var, "\n")

In [128]:
buffer=[]
for iteration in range(n_iteration_learn):
    all_rewards, all_grads, all_obs, all_action, all_obs_prev, all_losses = play_episodes(env,n_max_steps,model,loss_fn)
    all_final_rewards = discount_rewards(all_rewards,discount)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        var = []
        for step, final_reward in enumerate(all_final_rewards):            
            for i in range(len(model.trainable_variables)):
                all_grads[step][i]=tf.math.multiply(all_grads[step][i],final_reward)
            var.append(all_grads[step][var_index])
        all_mean_grads.append(tf.reduce_mean(var,axis=0))
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    episode_reward=tf.reduce_sum(all_final_rewards)
    buffer.append([all_final_rewards,all_obs,all_action])
    print("Episode * {} * Reward is ==> {}".format(iteration, episode_reward))

print("end")

for var in model.trainable_variables:
  print(var, "\n")


print(env.pos)

Episode * 0 * Reward is ==> 0.035648167430368735
Episode * 1 * Reward is ==> 0.022746634507700182
Episode * 2 * Reward is ==> -0.06857185578493476
Episode * 3 * Reward is ==> 0.03306106318444632
Episode * 4 * Reward is ==> 0.014610207784938775
Episode * 5 * Reward is ==> 0.013220083396232684
Episode * 6 * Reward is ==> -0.5001934146315604
Episode * 7 * Reward is ==> 0.008983750291591848
Episode * 8 * Reward is ==> -0.0036250848426381804
Episode * 9 * Reward is ==> 0.014631918246855589
Episode * 10 * Reward is ==> -0.3825555201129033
Episode * 11 * Reward is ==> -0.5185346505351036
Episode * 12 * Reward is ==> 0.006447602954804326
Episode * 13 * Reward is ==> -0.6050961371171932
Episode * 14 * Reward is ==> -0.43616125914093695
Episode * 15 * Reward is ==> -0.5620204793749071
Episode * 16 * Reward is ==> -0.5973259100582606
Episode * 17 * Reward is ==> 0.03874488623312579
Episode * 18 * Reward is ==> -0.00931912990303825
Episode * 19 * Reward is ==> -0.6211298314039662
Episode * 20 * Re

KeyboardInterrupt: 