In [1]:
import warnings
warnings.simplefilter('ignore')
import tensorflow as tf

In [2]:
print("Is GPU available?", tf.test.is_gpu_available())
print("TF version:", tf.__version__)
print("Keras version:", tf.keras.__version__)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available? True
TF version: 2.3.1
Keras version: 2.4.0


In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MSE

from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import random
import gym

In [4]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.f1 = Dense(512, activation='relu', input_shape=(10,))
        self.f2 = Dense(512, activation='relu')
        self.c1 = Concatenate(axis=-1)
        self.v =  Dense(1, activation=None)

    def call(self, state, action):
        x = Concatenate([state, action])
        #x = tf.concat([state, action], axis=1)
        x = self.f1(x)
        x = self.f2(x)
        x = self.v(x)
        return x


class Actor(Model):
    def __init__(self, n_action):
        super(Actor, self).__init__()    
        self.f1 = Dense(512, activation='relu')
        self.f2 = Dense(512, activation='relu')
        self.mu = Dense(n_action, activation='tanh')

    def call(self, state):
        x = self.f1(state)
        x = self.f2(x)
        x = self.mu(x)  
        return x

In [5]:
class Agent():
    def __init__(self, n_actions, min_action, max_action):
        self.actor = Actor(n_actions)
        self.critic = Critic()
        self.actor_target = Actor(n_actions)
        self.critic_target = Critic()
        self.batch_size = 64
        self.n_actions = n_actions
        self.actor_opt = Adam(1e-4)
        self.critic_opt = Adam(1e-4)
        self.memory = deque(maxlen=100000)
        self.steps = 0
        self.replace = 5
        self.gamma = 0.99
        self.min_action = min_action
        self.max_action = max_action
        self.tau = 0.005
        self.actor_target.compile(optimizer=self.actor_opt)
        self.critic_target.compile(optimizer=self.critic_opt)
    
    def store(self, state, action, reward, n_state, done):
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), done]
        self.memory.append(pack)
    
    def take_data(self, batch_size):
        pack = random.sample(self.memory, batch_size)
        states = []
        actions = []
        rewards = []
        n_states = []
        dones = []
        for i in range(batch_size):
            states.append(pack[i][0])
            actions.append(pack[i][1])
            rewards.append(pack[i][2])
            n_states.append(pack[i][3])
            dones.append(pack[i][4])
        return states, actions, rewards, n_states, dones


    def act(self, state, evaluate=False):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)

        actions = self.max_action * (tf.clip_by_value(actions, self.min_action, self.max_action))
        return actions[0]
    
    def update_target(self):
        if tau is None:
            tau = self.tau

        weights1 = []
        targets1 = self.actor_target.weights
        for i, weight in enumerate(self.actor.weights):
            weights1.append(weight * tau + targets1[i]*(1-tau))
        self.actor_target.set_weights(weights1)

        weights2 = []
        targets2 = self.critic_target.weights
        for i, weight in enumerate(self.critic.weights):
            weights2.append(weight * tau + targets2[i]*(1-tau))
        self.critic_target.set_weights(weights2)
    
    def upgrade(self):
        if len(self.memory) < self.batch_size:
            return 

        states, actions, rewards, n_states, dones = self.take_data(self.batch_size)

        states = tf.convert_to_tensor(states, dtype= tf.float32)
        n_states = tf.convert_to_tensor(n_states, dtype= tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
        actions = tf.convert_to_tensor(actions, dtype= tf.float32)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:

            target_actions = self.actor_target(n_states)
            target_n_state = self.critic_target(n_states, target_actions)
            critic_value = self.critic(states, actions)
            target_values = rewards + (self.gamma * target_n_state * dones)
            critic_loss = MSE(target_values, critic_value)

            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        grads1 = tape1.gradient(actor_loss, self.actor.trainable_variables)
        grads2 = tape2.gradient(critic_loss, self.critic.trainable_variables)
        self.actor_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
        self.critic_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))

        self.steps +=1
        
        if self.trainstep % self.replace == 0:
            self.update_target()

In [6]:
env = gym.make('LunarLanderContinuous-v2')

print('Min_action:', env.action_space.low[0])
print('Max_action: ', env.action_space.high[0])

Min_action: -1.0
Max_action:  1.0


In [7]:
agent = Agent(2, -1, 1)

n_episodes = 2000
avg_hist = []
scores = []

for i in range(n_episodes):
    score = 0 
    state = env.reset()
    done = False

    while not done:
        action = agent.act(state)
        n_state, reward, done, _ = env.step(action)
        agent.store(state, action, reward, n_state, 1-int(done))
        agent.upgrade()
        state = n_state
        score += reward
    
    scores.append(score)
    avg_reward = int(np.mean(scores[-100:]))
    print(f'Episode: {i}  Score: {score}  AVG: {avg_reward}')
    
    if avg_reward >= 200:
        break

env.close()

AttributeError: 'Concatenate' object has no attribute 'shape'

In [None]:
plt.plot(avg_hist)
plt.grid()
plt.xlabel('Episodes')
plt.ylabel('Avg rewards')

In [None]:
env.observation_space.shape

In [None]:
help(Concatenate)

In [None]:
states, n_states, rewards, actions, dones = agent.take_data(64)

In [None]:
print(np.shape(states))
print(np.shape(actions))
print(np.shape(n_states))