In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import gym
from tensorflow.keras.models import load_model

In [2]:
env = gym.make("LunarLanderContinuous-v2")
state_low = env.observation_space.low
state_high = env.observation_space.high
action_low = env.action_space.low 
action_high = env.action_space.high
print(state_low)
print(state_high)
print(action_low)
print(action_high)

[-inf -inf -inf -inf -inf -inf -inf -inf]
[inf inf inf inf inf inf inf inf]
[-1. -1.]
[1. 1.]


In [3]:
len(env.action_space.high)

2

In [4]:
class RBuffer():
    def __init__(self, maxsize, statedim, naction):
        self.cnt = 0
        self.maxsize = maxsize
        self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
        self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
        self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.done_memory = np.zeros((maxsize,), dtype= np.bool)

    def storexp(self, state, next_state, action, done, reward):
        index = self.cnt % self.maxsize
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.next_state_memory[index] = next_state
        self.done_memory[index] = 1 - int(done)
        self.cnt += 1

    def sample(self, batch_size):
        max_mem = min(self.cnt, self.maxsize)
        batch = np.random.choice(max_mem, batch_size, replace= False)  
        states = self.state_memory[batch]
        next_states = self.next_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        dones = self.done_memory[batch]
        return states, next_states, rewards, actions, dones

In [5]:
class Critic(tf.keras.Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.f1 = tf.keras.layers.Dense(512, activation='relu')
        self.f2 = tf.keras.layers.Dense(512, activation='relu')
        self.v =  tf.keras.layers.Dense(1, activation=None)

    def call(self, inputstate, action):
        x = self.f1(tf.concat([inputstate, action], axis=1))
        x = self.f2(x)
        x = self.v(x)
        return x


class Actor(tf.keras.Model):
    def __init__(self, no_action):
        super(Actor, self).__init__()    
        self.f1 = tf.keras.layers.Dense(512, activation='relu')
        self.f2 = tf.keras.layers.Dense(512, activation='relu')
        self.mu =  tf.keras.layers.Dense(no_action, activation='tanh')

    def call(self, state):
        x = self.f1(state)
        x = self.f2(x)
        x = self.mu(x)  
        return x

In [6]:
class Agent():
    def __init__(self, n_action= len(env.action_space.high)):
        self.actor_main = Actor(n_action)
        self.actor_target = Actor(n_action)
        self.critic_main = Critic()
        self.critic_target = Critic()
        self.batch_size = 64
        self.n_actions = len(env.action_space.high)
        self.a_opt = tf.keras.optimizers.Adam(1e-4)
        # self.actor_target = tf.keras.optimizers.Adam(.001)
        self.c_opt = tf.keras.optimizers.Adam(1e-4)
        # self.critic_target = tf.keras.optimizers.Adam(.002)
        self.memory = RBuffer(1_00_000, env.observation_space.shape, len(env.action_space.high))
        self.trainstep = 0
        self.replace = 5
        self.gamma = 0.99
        self.min_action = env.action_space.low[0]
        self.max_action = env.action_space.high[0]
        self.tau = 0.005
        self.actor_target.compile(optimizer=self.a_opt)
        self.critic_target.compile(optimizer=self.c_opt)


    def act(self, state, evaluate=False):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor_main(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)

        actions = self.max_action * (tf.clip_by_value(actions, self.min_action, self.max_action))
        #print(actions)
        return actions[0]


    def savexp(self,state, next_state, action, done, reward):
        self.memory.storexp(state, next_state, action, done, reward)

    def update_target(self, tau=None):

        if tau is None:
            tau = self.tau

        weights1 = []
        targets1 = self.actor_target.weights
        for i, weight in enumerate(self.actor_main.weights):
            weights1.append(weight * tau + targets1[i]*(1-tau))
        self.actor_target.set_weights(weights1)

        weights2 = []
        targets2 = self.critic_target.weights
        for i, weight in enumerate(self.critic_main.weights):
            weights2.append(weight * tau + targets2[i]*(1-tau))
        self.critic_target.set_weights(weights2)


    def train(self):
        if self.memory.cnt < self.batch_size:
            return 


        states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(states, dtype= tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype= tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
        actions = tf.convert_to_tensor(actions, dtype= tf.float32)
        #dones = tf.convert_to_tensor(dones, dtype= tf.bool)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:

            target_actions = self.actor_target(next_states)
            target_next_state_values = tf.squeeze(self.critic_target(next_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic_main(states, actions), 1)
            target_values = rewards + self.gamma * target_next_state_values * dones
            critic_loss = tf.keras.losses.MSE(target_values, critic_value)

            new_policy_actions = self.actor_main(states)
            actor_loss = -self.critic_main(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        grads1 = tape1.gradient(actor_loss, self.actor_main.trainable_variables)
        grads2 = tape2.gradient(critic_loss, self.critic_main.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.actor_main.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic_main.trainable_variables))

        #if self.trainstep % self.replace == 0:
        self.update_target()

        self.trainstep +=1

In [None]:
with tf.device('GPU:0'):
    tf.random.set_seed(336699)
    agent = Agent(2)
    episods = 2000
    ep_reward = []
    total_avgr = []
    target = False

    for s in range(episods):
        if target == True:
            break
        total_reward = 0 
        state = env.reset()
        done = False

        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.savexp(state, next_state, action, done, reward)
            agent.train()
            state = next_state
            total_reward += reward
            if done:
                ep_reward.append(total_reward)
                avg_reward = np.mean(ep_reward[-100:])
                total_avgr.append(avg_reward)
                print("total reward after {} steps is {} and avg reward is {}".format(s, total_reward, avg_reward))
                if int(avg_reward) == 200:
                    target = True

total reward after 0 steps is -182.18068361789182 and avg reward is -182.18068361789182
total reward after 1 steps is -34.65772060524867 and avg reward is -108.41920211157024
total reward after 2 steps is -677.5523053950737 and avg reward is -298.13023653940473
total reward after 3 steps is -1042.6865012443532 and avg reward is -484.26930271564186
total reward after 4 steps is -825.9514400091957 and avg reward is -552.6057301743526
total reward after 5 steps is -316.6279091949686 and avg reward is -513.2760933444553
total reward after 6 steps is -229.70907768839515 and avg reward is -472.7665196793038
total reward after 7 steps is -284.37523514949874 and avg reward is -449.2176091130782
total reward after 8 steps is -430.49598666180873 and avg reward is -447.1374288407149
total reward after 9 steps is -412.7043219516426 and avg reward is -443.69411815180763
total reward after 10 steps is -93.97266970437174 and avg reward is -411.90125920204076
total reward after 11 steps is -218.553440

total reward after 94 steps is -346.26746284585647 and avg reward is -204.73324921173995
total reward after 95 steps is -78.39901514359298 and avg reward is -203.41726760686342
total reward after 96 steps is 5.595738681367543 and avg reward is -201.26249434616
total reward after 97 steps is -93.47517344104718 and avg reward is -200.16262372467924
total reward after 98 steps is -8.535559506347202 and avg reward is -198.22699681338295
total reward after 99 steps is -127.86019728452146 and avg reward is -197.52332881809434
total reward after 100 steps is -63.91283168530851 and avg reward is -196.34065029876848
total reward after 101 steps is -83.19806684007324 and avg reward is -196.82605376111675
total reward after 102 steps is -64.31135489779375 and avg reward is -190.69364425614393
total reward after 103 steps is -103.47252269110948 and avg reward is -181.30150447061152
total reward after 104 steps is -111.63323014395907 and avg reward is -174.15832237195912
total reward after 105 step

total reward after 187 steps is -2.641418792649945 and avg reward is -64.83036894758763
total reward after 188 steps is -61.36017949016992 and avg reward is -64.95114682255824
total reward after 189 steps is -33.078831226780984 and avg reward is -63.18182900656122
total reward after 190 steps is -11.646625224034159 and avg reward is -62.41766922785564
total reward after 191 steps is -48.3550453872664 and avg reward is -60.60256848623862
total reward after 192 steps is -129.75027940789698 and avg reward is -62.6306989249852
total reward after 193 steps is -46.69388774589336 and avg reward is -63.08889103063766
total reward after 194 steps is -36.45474885697627 and avg reward is -59.99076389074886
total reward after 195 steps is -61.71439650335576 and avg reward is -59.823917704346485
total reward after 196 steps is -16.11496229437578 and avg reward is -60.04102471410391
total reward after 197 steps is -46.93769647213443 and avg reward is -59.57564994441478
total reward after 198 steps i

In [None]:
ep = [i  for i in range(len(avg_rewards_list))]
plt.plot( range(len(avg_rewards_list)),avg_rewards_list,'b')
plt.title("Avg Test Aeward Vs Test Episods")
plt.xlabel("Test Episods")
plt.ylabel("Average Test Reward")
plt.grid(True)
plt.show()

In [None]:
total_reward = 0
state = env.reset()
while not done:
    action = agent.act(state, True)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward
    if done:
        print(total_reward)