In [1]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.2


In [2]:
!pip3 install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.26.1.tar.gz (719 kB)
[K     |████████████████████████████████| 719 kB 4.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 42.9 MB/s 
[?25hCollecting imageio>=2.14.1
  Downloading imageio-2.22.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 43.1 MB/s 
[?25hCollecting glfw
  Downloading glfw-2.5.5-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[K     |████████████████████████████████| 207 kB 56.5 MB/s 
Collecting pillow>=8.3.2
  Downloading Pillow-9.2.0-cp37-cp37m-manylinux_2_

In [18]:
import tensorflow as tf
import numpy as np


class ExperienceReplayBuffer:
    def __init__(self, state_dims, action_dims, max_size=1000000, batch_size=256):
        self._max_size = max_size
        self._batch_size = batch_size
        self._size = 0
        self._current_position = 0
        self._state_memory = np.zeros((self._max_size, *state_dims))
        self._state_prime_memory = np.zeros((self._max_size, *state_dims))
        self._action_memory = np.zeros((self._max_size, action_dims))
        self._reward_memory = np.zeros((self._max_size, 1))
        self._done_memory = np.zeros((self._max_size, 1), dtype=np.bool)

    def size(self):
        return self._size

    def add_transition(self, state, action, reward, state_, done):
        self._state_memory[self._current_position] = state
        self._state_prime_memory[self._current_position] = state_
        self._action_memory[self._current_position] = action
        self._reward_memory[self._current_position] = reward
        self._done_memory[self._current_position] = done
        # self.un_norm_r[self.current_position] = r
        # self.r = (self.un_norm_r - np.mean(self.un_norm_r)) / (np.std(self.un_norm_r) + 1e-10)
        if self._size < self._max_size:
            self._size += 1
        self._current_position = (self._current_position + 1) % self._max_size

    def sample_batch(self):
        batch_indices = np.random.choice(self._size, self._batch_size, replace=False)
        states = tf.convert_to_tensor(self._state_memory[batch_indices], dtype=tf.float32)
        states_prime = tf.convert_to_tensor(self._state_prime_memory[batch_indices], dtype=tf.float32)
        actions = tf.convert_to_tensor(self._action_memory[batch_indices], dtype=tf.float32)
        rewards = tf.convert_to_tensor(self._reward_memory[batch_indices], dtype=tf.float32)
        dones = tf.convert_to_tensor(self._done_memory[batch_indices], dtype=tf.float32)
        return states, actions, rewards, states_prime, dones


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Concatenate


def create_policy_network(learning_rate, state_dim, action_dim):
    inputs = keras.Input(shape=state_dim)
    x = Dense(256, activation=tf.nn.relu)(inputs)
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    out = Dense(action_dim, activation=tf.nn.tanh)(x)
    model = keras.Model(inputs=inputs, outputs=out)
    model.compile(optimizer=Adam(learning_rate=learning_rate))
    return model


def create_q_network(learning_rate, state_dim, action_dim):
    inputs_s = keras.Input(shape=state_dim)
    inputs_a = keras.Input(shape=action_dim)
    x = Concatenate()([inputs_s, inputs_a])
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    out = Dense(1, activation=None)(x)
    model = keras.Model(inputs=(inputs_s, inputs_a), outputs=out)
    model.compile(optimizer=Adam(learning_rate=learning_rate))
    return model


# from ExperienceReplayBuffer import ExperienceReplayBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


# input should be between (−1, 1)
def default_scaling(actions):
    return actions


# input should be between (−1, 1)
def multiplicative_scaling(actions, factors):
    return actions * factors


class Agent:
    def __init__(self, environment, state_dim, action_dim,
                 actor_network_generator, critic_network_generator, action_scaling=default_scaling,
                 learning_rate=0.0003, gamma=0.99, tau=0.005, policy_delay=2,
                 target_noise=0.1, exploration_noise=0.2, noise_clip=0.5, a_low=-1, a_high=1,
                 batch_size=256, max_replay_buffer_size=1000000):
        self._environment = environment
        self._action_dim = action_dim
        self._action_scaling = action_scaling
        self._gamma = gamma
        self._tau = tau
        self._policy_delay = policy_delay
        self._target_noise = target_noise
        self._exploration_noise = exploration_noise
        self._noise_clip = noise_clip
        self._a_low = a_low
        self._a_high = a_high
        self._batch_size = batch_size
        self._mse = tf.keras.losses.MeanSquaredError()
        self._reply_buffer = ExperienceReplayBuffer(state_dim, action_dim, max_replay_buffer_size, batch_size)
        self._actor = actor_network_generator(learning_rate)
        self._actor_t = actor_network_generator(learning_rate)
        self._critic_1 = critic_network_generator(learning_rate)
        self._critic_2 = critic_network_generator(learning_rate)
        self._critic_1_t = critic_network_generator(learning_rate)
        self._critic_2_t = critic_network_generator(learning_rate)
        self._wight_init()
        self.step_counter = 0

    def reply_buffer(self):
        return self._reply_buffer

    def environment(self):
        return self._environment

    def _wight_init(self):
        self._actor.set_weights(self._actor_t.weights)
        self._critic_1.set_weights(self._critic_1_t.weights)
        self._critic_2.set_weights(self._critic_2_t.weights)

    def update_target_weights(self):
        self._weight_update(self._actor_t, self._actor)
        self._weight_update(self._critic_1_t, self._critic_1)
        self._weight_update(self._critic_2_t, self._critic_2)

    def _weight_update(self, target_network, network):
        new_wights = []
        for w_t, w in zip(target_network.weights, network.weights):
            new_wights.append((1 - self._tau) * w_t + self._tau * w)
        target_network.set_weights(new_wights)

    def learn(self):
        states, actions, rewards, states_prime, dones = self._reply_buffer.sample_batch()
        self.train_step_critic(states, actions, rewards, states_prime, dones)
        if self.step_counter % self._policy_delay:
            self.train_step_actor(states)
            self.update_target_weights()
        self.step_counter += 1

    @tf.function
    def train_step_critic(self, states, actions, rewards, states_prime, dones):
        actions_prime = self.sample_actions_form_target_policy(states_prime)
        q1 = self._critic_1_t((states_prime, actions_prime))
        q2 = self._critic_2_t((states_prime, actions_prime))
        targets = rewards + self._gamma * (1 - dones) * tfm.minimum(q1, q2)
        self._critic_update(self._critic_1, states, actions, targets)
        self._critic_update(self._critic_2, states, actions, targets)

    def _critic_update(self, critic, states, actions, targets):
        with tf.GradientTape() as tape:
            q = critic((states, actions))
            loss = 0.5 * self._mse(targets, q)
        gradients = tape.gradient(loss, critic.trainable_variables)
        critic.optimizer.apply_gradients(zip(gradients, critic.trainable_variables))

    @tf.function
    def train_step_actor(self, states):
        with tf.GradientTape() as tape:
            actions_new = self._actor(states)
            q1 = self._critic_1((states, actions_new))
            q2 = self._critic_2((states, actions_new))
            loss = -tfm.reduce_mean(tfm.minimum(q1, q2))
        gradients = tape.gradient(loss, self._actor.trainable_variables)
        self._actor.optimizer.apply_gradients(zip(gradients, self._actor.trainable_variables))

    def _action_clipping(self, actions):
        return tf.clip_by_value(actions, self._a_low, self._a_high)

    def sample_actions_form_policy(self, state):
        actions = self._actor(state)
        # or noise from sampling form tfp normal distribution with a sigma vector to get different noise per action
        noise = tf.random.normal(actions.get_shape(), 0, self._exploration_noise)
        clip_actions = self._action_clipping(actions + noise)
        return clip_actions

    def sample_actions_form_target_policy(self, state):
        actions = self._actor_t(state)
        # or noise from sampling form tfp normal distribution with a sigma vector to get different noise per action
        noise = tf.clip_by_value(tf.random.normal(actions.get_shape(), 0, self._target_noise),
                                 -self._noise_clip, self._noise_clip)
        clip_actions = self._action_clipping(actions + noise)
        return clip_actions

    def act_deterministic(self, state):
        actions_prime = self._actor(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def act_stochastic(self, state):
        actions_prime = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def _act(self, actions):
        scaled_actions = self._action_scaling(actions)  # scaled actions from (-1, 1) according (to environment)
        observation_prime, reward, done, _, _ = self._environment.step(scaled_actions[0])
        return actions, observation_prime, reward, done

    def train(self, epochs, environment_steps=1, training_steps=1, pre_sampling_steps=0):
        print(f"Random exploration for {pre_sampling_steps} steps!")
        observation, _ = self._environment.reset()
        ret = 0
        for _ in range(max(pre_sampling_steps, self._batch_size)):
            actions = tf.random.uniform((self._action_dim,), minval=-1, maxval=1)
            actions = self._action_scaling(actions)
            observation_prime, reward, done, _, _ = self._environment.step(actions)
            ret += reward
            self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
            if done:
                print("print", ret)
                ret = 0
                observation, _ = self._environment.reset()
            else:
                observation = observation_prime
        print("print", ret)

        print("start training!")
        returns = []
        observation, _ = self._environment.reset()
        done = 0
        ret = 0
        epoch = 0
        steps = 0
        while True:
            i = 0
            while i < environment_steps or self._reply_buffer.size() < self._batch_size:
                if done:
                    observation, _ = self._environment.reset()
                    returns.append(ret)
                    print("epoch:", epoch, "steps:", steps, "return:", ret, "avg return:", np.average(returns[-50:]))
                    ret = 0
                    epoch += 1
                    if epoch >= epochs:
                        print("training finished!")
                        return
                actions, observation_prime, reward, done = self.act_stochastic(observation)
                self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
                observation = observation_prime
                steps += 1
                ret += reward
                i += 1
            for _ in range(training_steps):
                self.learn()


from functools import partial
import gym
import tensorflow as tf

# from GenericMLPs1D import create_policy_network, create_q_network
# from DDPGAgent import Agent, multiplicative_scaling

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    env = gym.make('InvertedPendulum-v4')
    print("state_dim=", env.observation_space.shape, "action_dim=", env.action_space.shape[0], "action_scaling:",
          env.action_space.high)

    agent = Agent(environment=env, state_dim=env.observation_space.shape, action_dim=env.action_space.shape[0],
                  action_scaling=partial(multiplicative_scaling, factors=env.action_space.high),
                  actor_network_generator=partial(create_policy_network, state_dim=env.observation_space.shape[0],
                                                  action_dim=env.action_space.shape[0]),
                  critic_network_generator=partial(create_q_network, state_dim=env.observation_space.shape[0],
                                                   action_dim=env.action_space.shape[0]))
    agent.train(10000)

state_dim= (4,) action_dim= 1 action_scaling: [3.]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app


Random exploration for 0 steps!
print 4.0
print 9.0
print 4.0
print 3.0
print 5.0
print 3.0
print 4.0
print 4.0
print 15.0
print 3.0
print 3.0
print 4.0
print 3.0
print 8.0
print 15.0
print 4.0
print 4.0
print 5.0
print 9.0
print 3.0
print 5.0
print 4.0
print 8.0
print 5.0
print 3.0
print 6.0
print 6.0
print 6.0
print 5.0
print 4.0
print 8.0
print 3.0
print 14.0
print 4.0
print 5.0
print 3.0
print 3.0
print 4.0
print 4.0
print 11.0
print 3.0
print 3.0
print 3.0
print 10.0
print 3.0
print 3.0
print 8.0
start training!
epoch: 0 steps: 27 return: 27.0 avg return: 27.0
epoch: 1 steps: 31 return: 4.0 avg return: 15.5
epoch: 2 steps: 35 return: 4.0 avg return: 11.666666666666666
epoch: 3 steps: 41 return: 6.0 avg return: 10.25
epoch: 4 steps: 44 return: 3.0 avg return: 8.8
epoch: 5 steps: 48 return: 4.0 avg return: 8.0
epoch: 6 steps: 52 return: 4.0 avg return: 7.428571428571429
epoch: 7 steps: 55 return: 3.0 avg return: 6.875
epoch: 8 steps: 58 return: 3.0 avg return: 6.444444444444445
epoc

KeyboardInterrupt: ignored