In [84]:
import tensorflow as tf
import numpy as np
import math

In [85]:
class simple_env():
    """Toy environment in which a point is moved along each axis toward a target.

    The state is the current position (one coordinate per target). An action
    is a sequence with one value per coordinate; each value is multiplied by
    ``step_size`` and added to the matching coordinate. The per-coordinate
    reward is the reduction of the absolute distance to the target achieved
    by the step, divided by the distance measured when the environment was
    constructed.

    Args:
        step_size: Multiplier applied to every action component.
        targets: Target coordinate for each dimension.
        max_steps: Number of ``step`` calls after which ``done`` becomes True.
    """

    def __init__(self, step_size=13,targets=[166,-155],max_steps=100):
        # Copy the targets so neither the shared default list nor the
        # caller's list is aliased by this instance.
        self.targets=list(targets)
        dims=len(self.targets)
        self.pos=np.random.uniform(-10,10,dims)
        self.max_steps=max_steps
        self.step_size=step_size
        self.max_num=1000
        self.min_num=-1000
        self.done=False
        # Distance to each target at construction time; used as the reward
        # normaliser in step().
        # NOTE(review): this is not refreshed by reset() - confirm that
        # normalising by the construction-time distance is intended.
        self.def_size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(dims)]
        # Current distance to each target, updated on every step.
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(dims)]
        self.counter_stop=0
        # Misspelled name kept because existing callers read it.
        self.obsevation_space=dims
        # Correctly spelled alias for new code.
        self.observation_space=dims
        self.action_space=dims

    def reset(self):
        """Start a new episode and return the new random position.

        Draws a new position uniformly in [-10, 10) per dimension, recomputes
        the current distances, and clears both the step counter and ``done``.
        """
        self.pos=np.random.uniform(-10,10,len(self.targets))
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(len(self.targets))]
        self.counter_stop=0
        # Without this the flag stays True forever once max_steps was reached.
        self.done=False
        return self.pos

    def step(self,action):
        """Apply ``action`` and return ``(position, reward, done)``.

        Args:
            action: Sequence with one value per coordinate to move.

        Returns:
            A tuple ``(pos, reward, done)``. ``pos`` is the environment's own
            position array (updated in place, not a copy), ``reward`` is a new
            array shaped like ``pos``, and ``done`` is True once the step
            counter has reached ``max_steps``.
        """
        self.counter_stop+=1
        if self.counter_stop>=self.max_steps:
            self.done=True
        # Snapshots of the state before the move.
        self.last_pos=np.copy(self.pos)
        self.last_size_pos=np.copy(self.size_pos)
        self.reward=np.zeros(self.pos.shape)
        for i in range(len(action)):
            self.pos[i]=self.pos[i]+(action[i]*self.step_size)
            self.size_pos[i]=abs(self.targets[i]-self.pos[i])
            # Positive when the step moved the point closer to the target.
            difference= self.last_size_pos[i]-self.size_pos[i]
            self.reward[i]=difference/self.def_size_pos[i]
        return self.pos, self.reward, self.done

In [86]:
def get_actor(obs_space, action_space):
    """Build the actor network mapping an observation to a bounded action.

    The network has two sigmoid hidden layers of 5 units each, followed by a
    ``tanh`` output layer with ``action_space`` units.

    Args:
        obs_space: Number of observation features (size of the input vector).
        action_space: Number of action components (size of the output vector).

    Returns:
        A ``tf.keras.Model`` taking inputs of shape ``(batch, obs_space)`` and
        producing outputs of shape ``(batch, action_space)``.
    """
    # Output-layer kernel is drawn uniformly from [-0.1, 0.1].
    # (Alternative previously considered: tf.keras.initializers.glorot_uniform().)
    last_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
    inputs = tf.keras.layers.Input(shape=(obs_space,))
    out = tf.keras.layers.Dense(5, activation=tf.keras.activations.sigmoid)(inputs)
    # Chain on the previous layer's output; feeding `inputs` here again would
    # silently drop the first hidden layer from the model.
    out = tf.keras.layers.Dense(5, activation=tf.keras.activations.sigmoid)(out)
    outputs = tf.keras.layers.Dense(action_space, activation="tanh", kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [87]:
def policy(obs, model, lower_bound=-1, upper_bound=1):
    """Run ``model`` on ``obs`` and return its raw output plus a clipped action.

    Args:
        obs: Batched observation passed straight to ``model``.
        model: Callable whose result exposes a ``.numpy()`` method.
        lower_bound: Smallest value allowed in the returned action.
        upper_bound: Largest value allowed in the returned action.

    Returns:
        A tuple ``(raw_output, action)``: the unmodified model output, and the
        first row of that output converted to NumPy and clipped to
        ``[lower_bound, upper_bound]``.
    """
    raw_output = model(obs)
    bounded = np.clip(raw_output.numpy(), lower_bound, upper_bound)
    # Only the first batch entry is used as the action.
    return raw_output, bounded[0]

In [88]:
def play_one_step(env, obs, model, loss_fn):
    """Take one environment step and compute the gradients of its loss.

    The observation is batched, the model output is used as the action, and
    the loss is ``loss_fn`` evaluated between the mean step reward (as the
    target) and the model output.

    Args:
        env: Environment whose ``step(action)`` returns ``(obs, reward, done)``.
        obs: Current, un-batched observation.
        model: Keras model used by ``policy``.
        loss_fn: Callable ``loss_fn(y_true, y_pred)``.

    Returns:
        A tuple ``(next_obs, reward, done, grads, action, loss)`` where
        ``grads`` holds one gradient per entry of ``model.trainable_variables``.
    """
    # Add the leading batch dimension expected by the model.
    obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
    with tf.GradientTape() as tape:
        logits, action = policy(obs, model)
        next_obs, reward, done = env.step(action)
        # The reward is a fixed target, so wrap it in a constant. A
        # tf.Variable here would allocate a new variable on every step and be
        # auto-watched by the tape for no benefit.
        y_target=tf.reduce_mean(tf.constant(reward))
        loss= tf.reduce_mean(loss_fn(y_target,logits))
    grads = tape.gradient(loss, model.trainable_variables)
    return next_obs, reward, done, grads, action, loss

In [89]:
def play_episodes(env, num_steps, model, loss_fn):
    """Reset ``env`` and play ``num_steps`` steps, recording everything.

    The ``done`` flag returned by each step is not consulted: the loop always
    runs for exactly ``num_steps`` iterations.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``.
        num_steps: Number of steps to play.
        model: Keras model producing the actions.
        loss_fn: Loss callable forwarded to ``play_one_step``.

    Returns:
        A tuple of six lists with one entry per step, in this order:
        rewards, gradients, observations after the step, actions,
        observations before the step, losses. Each gradients entry is a list
        with one tensor per trainable variable.
    """
    rewards_log=[]
    grads_log=[]
    obs_log=[]
    action_log=[]
    prev_obs_log=[]
    loss_log=[]
    obs = env.reset()
    for _ in range(num_steps):
        # Copy: the environment may hand back its own mutable position array.
        prev_obs_log.append(np.copy(obs))
        obs, reward, _done, grads, action, loss = play_one_step(env,obs,model,loss_fn)
        rewards_log.append(np.copy(reward))
        # Keep the per-variable gradients as a plain list. The tensors have
        # different shapes, so packing them into one NumPy array is ragged.
        grads_log.append(list(grads))
        obs_log.append(np.copy(obs))
        action_log.append(np.copy(action))
        loss_log.append(np.copy(loss))
    return rewards_log, grads_log, obs_log, action_log, prev_obs_log, loss_log

In [90]:
def discount_rewards(rewards, discount):
    """Collapse each step's reward to a scalar and scale it by ``discount**i``.

    Args:
        rewards: Sequence of per-step rewards; each entry may be a scalar or
            an array, and is averaged with ``np.mean``.
        discount: Base of the per-step weight; step ``i`` is weighted by
            ``discount ** i``.

    Returns:
        A new list with one weighted scalar per step. The input sequence is
        left unmodified.
    """
    # Build a new list instead of overwriting the caller's entries.
    return [np.mean(step_reward)*pow(discount,i) for i, step_reward in enumerate(rewards)]

In [91]:
# Training configuration.
n_iteration_learn, n_max_steps = 10, 5  # training iterations, steps played per iteration
discount = 0.99                         # base of the per-step reward weight
learning_rate = 1e-5                    # Adam learning rate
step_size = 10                          # environment action multiplier

In [92]:
# Build the one-dimensional environment (single target at 100), the actor
# network sized from it, and the optimizer / loss used for training.
env = simple_env(targets=[100], step_size=step_size)
first_pos = env.reset()  # position returned by the initial reset
model = get_actor(obs_space=env.obsevation_space, action_space=env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()

In [93]:
# model.summary()

In [94]:
# for var in model.trainable_variables:
#   print(var, "\n")

In [95]:
# print(model.layers[0].weights)
# print(model.layers[1].weights)
# print(model.layers[2].weights)

In [96]:
# env=simple_env()
# model = get_actor(env.obsevation_space,env.action_space)
# optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# loss_fn = tf.keras.losses.MeanSquaredError()
# obs = env.reset()
# # obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
# obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
# with tf.GradientTape() as tape:        
#     action = policy(obs,model)
#     next_obs, reward, done = env.step(action)
#     # print(obs)
#     # print(next_obs)   
#     # print(action)
#     print(reward)
#     logits = model(obs)
#     y_target=tf.reduce_mean(tf.Variable(reward))
#     loss= tf.reduce_mean(loss_fn(y_target,logits))
# grads = tape.gradient(loss, model.trainable_variables)
# # print(grads)
# # for i in range(len(model.trainable_variables)):
# #     rd=np.mean(reward)
# #     grads[i]=tf.math.multiply(grads[i],rd)

# optimizer.apply_gradients(zip(grads, model.trainable_variables))
# print("*********************************************")
# print(grads)
# print("*********************************************")
# for var in model.trainable_variables:
#   print(var, "\n")

In [98]:
# Training loop: play a short episode, weight each step's gradients by that
# step's discounted reward, average over the steps and apply the result.
buffer=[]
for iteration in range(n_iteration_learn):
    all_rewards, all_grads, all_obs, all_action, all_obs_prev, all_losses = play_episodes(env,n_max_steps,model,loss_fn)
    all_final_rewards = discount_rewards(all_rewards,discount)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        # Weight each step's gradient for this variable by the step's reward
        # exactly once. Rescaling every gradient in place on each var_index
        # pass would scale variable k by reward ** (k + 1).
        weighted_grads = [
            tf.math.multiply(all_grads[step][var_index], final_reward)
            for step, final_reward in enumerate(all_final_rewards)
        ]
        all_mean_grads.append(tf.reduce_mean(weighted_grads,axis=0))
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    episode_reward=tf.reduce_sum(all_final_rewards)
    # Keep the trajectory of every iteration for later inspection.
    buffer.append([all_final_rewards,all_obs,all_action])
    print("Episode * {} * Reward is ==> {}".format(iteration, episode_reward))

print("end")

# for var in model.trainable_variables:
#   print(var, "\n")


print(env.pos)

