In [14]:
import tensorflow as tf
import numpy as np
import math

In [15]:
class simple_env():
    def __init__(self, step_size=13,targets=[166,-88],max_steps=100):
        self.pos=np.random.uniform(-10,10,len(targets))
        self.max_steps=max_steps
        self.step_size=step_size
        self.targets=targets
        self.max_num=1000
        self.min_num=-1000
        self.done=False
        self.def_size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.size_pos=[abs(targets[i]-self.pos[i]) for i in range(len(targets))]
        self.counter_stop=0
        self.obsevation_space=len(targets)
        self.action_space=len(targets)
        
    def reset(self):
        self.pos=np.random.uniform(-10,10,len(self.targets))
        self.size_pos=[abs(self.targets[i]-self.pos[i]) for i in range(len(self.targets))]
        self.counter_stop=0
        return self.pos

    def step(self,action):
        self.counter_stop+=1
        if self.counter_stop>=self.max_steps:
            self.done=True
        self.last_pos=np.copy(self.pos) 
        self.last_size_pos=np.copy(self.size_pos)
        self.reward=np.zeros(self.pos.shape)
        for i in range(len(action)):
            self.pos[i]=self.pos[i]+(action[i]*self.step_size)
            self.size_pos[i]=abs(self.targets[i]-self.pos[i])
            difference= self.last_size_pos[i]-self.size_pos[i]
            self.reward[i]=difference/self.def_size_pos[i]
        return self.pos, self.reward, self.done

In [16]:
def get_actor(obs_space, action_space):
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-0.01, maxval=0.01)

    inputs = tf.keras.layers.Input(shape=(obs_space,))
    out = tf.keras.layers.Dense(10, activation="relu")(inputs)
    out = tf.keras.layers.Dense(10, activation="relu")(out)
    outputs = tf.keras.layers.Dense(action_space, activation="tanh", kernel_initializer=last_init)(out)

    # # Our upper bound is 2.0 for Pendulum.
    # outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

In [17]:
def policy(obs, model, lower_bound=-1, upper_bound=1):
    sampled_actions=model(obs)
    sampled_actions = sampled_actions.numpy()
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    return legal_action[0]

In [18]:
# env=simple_env()
# model = get_actor(env.obsevation_space,env.action_space)
# print(env.pos)
# action = policy([25,-112],model)
# print(action)

In [19]:
def play_one_step(env, obs, model, loss_fn):
    obs=tf.expand_dims(tf.convert_to_tensor(obs), 0)
    with tf.GradientTape() as tape:        
        action = policy(obs,model)
        next_obs, reward, done = env.step(action)
        logits = model(obs)
        y_target=tf.reduce_mean(tf.Variable(reward))
        loss= tf.reduce_mean(loss_fn(y_target,logits))
    grads = tape.gradient(loss, model.trainable_variables)
    return next_obs, reward, done, grads, action

In [20]:
# env=simple_env()
# first_pos=env.reset()
# model = get_actor(env.obsevation_space,env.action_space)
# loss_fn=tf.keras.losses.MeanSquaredError()
# print(play_one_step(env,first_pos,model,loss_fn))


In [21]:
def play_episodes(env, num_steps, model, loss_fn):
    all_rewarsd=[]
    all_grads=[]
    all_obs=[]
    all_action=[]
    obs = env.reset()
    for step in range(num_steps):
        obs, reward, dode, grads, action= play_one_step(env,obs,model,loss_fn)
        all_rewarsd.append(np.copy(reward))
        all_grads.append(np.copy(grads))
        all_obs.append(np.copy(obs))
        all_action.append(np.copy(action))
    return all_rewarsd, all_grads, all_obs, all_action

In [22]:
def discount_rewards(rewards, discount):
    for i in range(len(rewards)):
        rewards[i]=np.mean(rewards[i])*pow(discount,i)
    return rewards

In [23]:
# discount_rewards([[2,4],-4,12],0.95)

In [24]:
n_iteration_learn = 10
n_max_steps = 5
discount = 0.95
learning_rate = 1

In [25]:
env=simple_env()
first_pos=env.reset()
model = get_actor(env.obsevation_space,env.action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.MeanSquaredError()

In [26]:
buffer=[]
for iteration in range(n_iteration_learn):
    all_rewards, all_grads, all_obs, all_action = play_episodes(env,n_max_steps,model,loss_fn)
    all_final_rewards = discount_rewards(all_rewards,discount)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        for step, final_reward in enumerate(all_final_rewards):
            var = []
            for i in range(len(model.trainable_variables)):
                all_grads[step][i]=tf.math.multiply(all_grads[step][i],final_reward)
            var.append(all_grads[step][var_index])
        all_mean_grads.append(tf.reduce_mean(var,axis=0))
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
    episode_reward=tf.reduce_sum(all_final_rewards)
    buffer.append([all_final_rewards,all_obs,all_action])
    print("Episode * {} * Reward is ==> {}".format(iteration, episode_reward))


Episode * 0 * Reward is ==> -0.005216534825255978
Episode * 1 * Reward is ==> 0.0005662935428870063
Episode * 2 * Reward is ==> -0.012756238379365221
Episode * 3 * Reward is ==> -0.02867735687740393
Episode * 4 * Reward is ==> -0.012746746300070213
Episode * 5 * Reward is ==> -0.21464134819012698
Episode * 6 * Reward is ==> -0.5070200133451533
Episode * 7 * Reward is ==> -0.5070200133451533
Episode * 8 * Reward is ==> -0.5070200133451536
Episode * 9 * Reward is ==> -0.5070200133451535


In [27]:
np.matmul( [[1, 2, 3, 4],[1, 2, 3, 4]], 2)

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

In [None]:
env = gym.make("CartPole-v1")
env.seed(42)

for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn)
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_rate)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

env.close()