In [1]:
import warnings
warnings.simplefilter('ignore')
import tensorflow as tf

In [2]:
# Environment report for reproducibility: GPU visibility plus library versions.
# `tf.test.is_gpu_available()` is deprecated; listing the physical GPU devices
# is the replacement named in TensorFlow's own deprecation notice.
print("Is GPU available?", bool(tf.config.list_physical_devices('GPU')))
print("TF version:", tf.__version__)
print("Keras version:", tf.keras.__version__)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available? True
TF version: 2.3.1
Keras version: 2.4.0


In [3]:
import random
from collections import deque

import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [4]:
class Critic(Model):
    """State-action value network: two 32-unit ReLU layers and a scalar head.

    Args:
        beta: Learning rate handed to this network's Adam optimizer.
        input_dims: Observation shape; stored on the instance for reference.
        n_actions: Number of action dimensions; stored for reference.
        name: Keras model name (readable afterwards as ``self.name``).
    """

    def __init__(self, beta, input_dims, n_actions, name):
        # `name` is a read-only property on Keras layers/models, so it must be
        # passed to the base constructor; `self.name = name` raises
        # AttributeError.
        super(Critic, self).__init__(name=name)
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.beta = beta

        # `input_shape` expects a shape tuple, not an int. It is omitted here
        # because a subclassed model builds its layers from the first call.
        self.fc1 = Dense(32, activation='relu')
        self.fc2 = Dense(32, activation='relu')
        self.q1 = Dense(1)

        # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
        self.optimizer = Adam(learning_rate=self.beta)

    def call(self, state, action=None):
        """Estimate the value of each (state, action) row in a batch.

        Accepts both ``critic(state, action)`` and ``critic([state, action])``.

        Args:
            state: Batch of observations, or a 2-item list/tuple
                ``[state, action]`` when ``action`` is omitted.
            action: Batch of actions whose rows line up with ``state``.

        Returns:
            Output of the final ``Dense(1)`` layer, one value per input row.

        Raises:
            ValueError: If ``action`` is omitted and ``state`` is not a
                2-item list/tuple.
        """
        if action is None:
            if not (isinstance(state, (list, tuple)) and len(state) == 2):
                raise ValueError(
                    "Critic expects (state, action) or a [state, action] pair"
                )
            state, action = state

        # Concatenate with TensorFlow rather than NumPy: np.concatenate turns
        # the inputs into plain arrays, so gradients could not flow back
        # through the action. Casting first avoids float32/float64 mismatches.
        features = tf.concat(
            [tf.cast(state, tf.float32), tf.cast(action, tf.float32)], axis=1
        )
        hidden = self.fc2(self.fc1(features))
        return self.q1(hidden)

In [5]:
class Actor(Model):
    """Deterministic policy network: two 32-unit ReLU layers and a tanh head.

    Args:
        alpha: Learning rate handed to this network's Adam optimizer.
        input_dims: Observation shape; stored on the instance for reference.
        n_actions: Width of the output layer (one unit per action dimension).
        name: Keras model name (readable afterwards as ``self.name``).
    """

    def __init__(self, alpha, input_dims, n_actions, name):
        # `name` is a read-only property on Keras layers/models, so it must be
        # passed to the base constructor; `self.name = name` raises
        # AttributeError.
        super(Actor, self).__init__(name=name)
        self.input_dims = input_dims
        self.alpha = alpha
        self.n_actions = n_actions

        self.fc1 = Dense(32, activation='relu')
        self.fc2 = Dense(32, activation='relu')
        self.mu = Dense(self.n_actions, activation='tanh')

        # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
        self.optimizer = Adam(learning_rate=self.alpha)

    def call(self, state):
        """Map a batch of observations to a batch of actions.

        Args:
            state: Batch of observations.

        Returns:
            Output of the tanh head, ``n_actions`` values per input row, each
            in [-1, 1].
        """
        hidden = self.fc1(state)
        hidden = self.fc2(hidden)
        # The output head must be applied; returning `hidden` would hand back
        # the 32-wide feature vector instead of an action.
        return self.mu(hidden)

In [2]:
class Agent():
    def __init__(
        self,
        alpha,
        beta,
        input_dims,
        tau,
        env,
        gamma=0.99,
        update_actor=2,
        warmup=1000,
        n_actions=2,
        max_size=1000000,
        batch_size=100,
        noise=0.1,
    ):
        """Set up hyper-parameters, replay memory and the six networks.

        Args:
            alpha: Passed to both ``Actor`` instances (online and target).
            beta: Passed to all four ``Critic`` instances.
            input_dims: Passed unchanged to every network constructor.
            tau: Stored as ``self.tau``.
            env: Object exposing ``action_space.high`` and
                ``action_space.low``; these become the action bounds.
            gamma: Stored as ``self.gamma``.
            update_actor: Stored as ``self.update_actor``.
            warmup: Stored as ``self.warmup``.
            n_actions: Stored and passed to every network constructor.
            max_size: ``maxlen`` of the replay deque.
            batch_size: Stored as ``self.batch_size``.
            noise: Stored as ``self.noise``.
        """
        # Scalar hyper-parameters.
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor = update_actor
        self.noise = noise

        # Action bounds as reported by the environment.
        action_space = env.action_space
        self.max_action = action_space.high
        self.min_action = action_space.low

        # Bounded replay memory: the deque drops the oldest entry when full.
        self.memory = deque(maxlen=max_size)

        # Step counters start from zero.
        self.learn_step_counter = 0
        self.time_step = 0

        # Online networks.
        self.actor = Actor(
            alpha=alpha, input_dims=input_dims, n_actions=n_actions, name='actor'
        )
        self.critic1 = Critic(
            beta=beta, input_dims=input_dims, n_actions=n_actions, name='critic1'
        )
        self.critic2 = Critic(
            beta=beta, input_dims=input_dims, n_actions=n_actions, name='critic2'
        )

        # Target networks, same constructor arguments under `t_` names.
        self.target_actor = Actor(
            alpha=alpha, input_dims=input_dims, n_actions=n_actions, name='t_actor'
        )
        self.target_critic1 = Critic(
            beta=beta, input_dims=input_dims, n_actions=n_actions, name='t_critic1'
        )
        self.target_critic2 = Critic(
            beta=beta, input_dims=input_dims, n_actions=n_actions, name='t_critic2'
        )

        # NOTE(review): update_network_param is defined outside this view;
        # tau=1 presumably copies the online weights into the targets - confirm.
        self.update_network_param(tau=1)
    
    def choose_action(self, obs):
        if semf.time_step < self.warmup:
            mu = tf.Tensor(np.random.normal(scale=self.noise, size=(self.n_actions,)))
        else:
            state = tf.Tensor(obs, dtype=tf.float32)
            mu = self.actor(state)
        mu_prime = mu + tf.Tensor(np.random.normal(scale=self.noise), dtype=tf.float32)
        
        mu_prime = tf.clip_by_value(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        return mu_prime.numpy()
    
    def store(self, state, action, reward, n_state, done):
        pack = [np.expand_dims(state, axis=0), action, reward, np.expand_dims(n_state, axis=0), done]
        self.memory.append(pack)
    
    def take_data(self, batch_size):
        pack = random.sample(self.memory, batch_size)
        states = []
        actions = []
        rewards = []
        n_states = []
        dones = []
        for i in range(batch_size):
            states.append(pack[i][0])
            actions.append(pack[i][1])
            rewards.append(pack[i][2])
            n_states.append(pack[i][3])
            dones.append(pack[i][4])
        return states, actions, rewards, n_states, dones
    
    def learn(self):
        if len(self.memory) >= self.batch_size:
        
            states, actions, rewards, n_states, dones = self.take_data(self.batch_size)
            
            target_actions = self.target_actor(np.array(n_states))
            target_actions += tf.clip_by_value(np.random.normal(scale=0.2), -0.5, 0.5)
            target_actions = tf.clip_by_value(target_actions, self.min_action[0], self.max_action[0])
            
            q1_ = self.target_critic1([n_states, target_actions])
            q2_ = self.target_critic2([n_states, target_actions])
            
            q1 = self.critic1([states, actions])
            q2 = self.critic2([states, actions])
            
            q1_[dones] = 0.0
            q2_[dones] = 0.0
            
            