In [4]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.2


In [5]:
!pip3 install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.26.1.tar.gz (719 kB)
[K     |████████████████████████████████| 719 kB 34.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting imageio>=2.14.1
  Using cached imageio-2.22.0-py3-none-any.whl (3.4 MB)
Collecting mujoco==2.2.0
  Using cached mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
Collecting glfw
  Using cached glfw-2.5.5-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
Collecting pillow>=8.3.2
  Using cached Pillow-9.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.26.1-p

In [16]:
import tensorflow as tf
import numpy as np


class EpisodeBuffer:
    """Collects per-step transition data and exposes it as a shuffled, batched ``tf.data.Dataset``.

    Args:
        advantage_estimator: callable ``(rewards, values, dones) -> advantages``.
        calc_rewards_to_go: callable ``(rewards, values, advantages) -> returns``.
        batch_size: number of steps per batch in the produced dataset.
        shuffle_buffer_size: upper bound for the shuffle buffer of the produced dataset.
    """

    def __init__(self, advantage_estimator, calc_rewards_to_go, batch_size=256, shuffle_buffer_size=512):
        self._advantage_estimator = advantage_estimator
        self._calc_rewards_to_go = calc_rewards_to_go
        self._batch_size = batch_size
        self._shuffle_buffer_size = shuffle_buffer_size
        self._s = []  # states
        self._a = []  # actions
        self._r = []  # rewards
        self._v = []  # value estimates
        self._p = []  # log-probabilities recorded when the action was taken
        self._d = []  # done flags

    def add(self, s, a, r, v, p, d):
        """Append one step; every argument is converted to a float32 tensor."""
        self._s.append(tf.convert_to_tensor(s, dtype=tf.float32))
        self._a.append(tf.convert_to_tensor(a, dtype=tf.float32))
        self._r.append(tf.convert_to_tensor(r, dtype=tf.float32))
        self._v.append(tf.convert_to_tensor(v, dtype=tf.float32))
        self._p.append(tf.convert_to_tensor(p, dtype=tf.float32))
        self._d.append(tf.convert_to_tensor(d, dtype=tf.float32))

    def get_as_data_set(self):
        """Build the training dataset from everything added so far.

        Returns:
            A dataset yielding ``(state, action, reward, reward_to_go, advantage, log_prob)``
            batches of size ``batch_size``, shuffled across the stored steps.
        """
        adv = self._advantage_estimator(self._r, self._v, self._d)
        g = self._calc_rewards_to_go(self._r, self._v, adv)
        # The shuffle buffer never needs to exceed the number of stored steps, and
        # tf.data requires it to be at least 1. (np.minimum is a binary ufunc, so the
        # previous single-argument call raised a TypeError.)
        buffer_size = max(1, min(len(self._s), self._shuffle_buffer_size))
        dataset = tf.data.Dataset.from_tensor_slices((self._s, self._a, self._r, g, adv, self._p))
        return dataset.shuffle(buffer_size).batch(self._batch_size)


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense


def create_policy_network(learning_rate, state_dim, action_dim):
    """Build and compile the Gaussian policy MLP.

    Three ReLU hidden layers of 256 units feed two heads of width ``action_dim``:
    a linear one and a softplus-activated one, returned as a ``(mu, sigma)`` tuple
    of model outputs. The model is compiled with an Adam optimizer only (no loss).
    """
    state_input = keras.Input(shape=state_dim)
    hidden = state_input
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    mean_head = Dense(action_dim, activation=None)(hidden)
    std_head = Dense(action_dim, activation=tf.nn.softplus)(hidden)
    policy = keras.Model(inputs=state_input, outputs=(mean_head, std_head))
    policy.compile(optimizer=Adam(learning_rate=learning_rate))
    return policy


def create_value_network(learning_rate, state_dim):
    """Build and compile the state-value MLP.

    Three ReLU hidden layers of 256 units followed by a single linear output unit.
    The model is compiled with an Adam optimizer only (no loss).
    """
    state_input = keras.Input(shape=state_dim)
    hidden = state_input
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    value_head = Dense(1, activation=None)(hidden)
    critic = keras.Model(inputs=state_input, outputs=value_head)
    critic.compile(optimizer=Adam(learning_rate=learning_rate))
    return critic


# from EpisodeBuffer import EpisodeBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


class Agent:
    """Clipped-surrogate policy-gradient agent with a separate value network.

    Each training epoch samples one episode from the environment, turns it into a
    dataset via ``EpisodeBuffer`` and performs one pass of actor and critic updates.

    Args:
        environment: environment exposing ``reset()`` -> ``(obs, info)`` and
            ``step(action)`` -> ``(obs, reward, terminated, truncated, info)``.
        actor_network_generator: callable ``(learning_rate) -> model`` whose model
            returns ``(mu, sigma)`` and carries an ``optimizer``.
        critic_network_generator: callable ``(learning_rate) -> model`` whose model
            returns a state value and carries an ``optimizer``.
        batch_size: batch size handed to the episode buffer.
        epsilon: clipping range of the probability ratio (``1 +/- epsilon``).
        gae_lambda: lambda of the generalized advantage estimate.
        learning_rate: learning rate passed to both network generators.
        gamma: reward discount factor.
        kld_threshold: actor updates stop once the absolute mean log-ratio exceeds this.
    """

    def __init__(self, environment, actor_network_generator, critic_network_generator,
                 batch_size=256, epsilon=0.2, gae_lambda=0.95, learning_rate=0.0003, gamma=0.99, kld_threshold=0.05):
        self._environment = environment
        self._batch_size = batch_size
        self._epsilon = epsilon
        self._gae_lambda = gae_lambda
        self._gamma = gamma
        self._learning_rate = learning_rate
        self._mse = tf.keras.losses.MeanSquaredError()
        self._policy_network = actor_network_generator(learning_rate)
        self._value_network = critic_network_generator(learning_rate)
        self._kld_threshold = kld_threshold

    # generalized advantage estimate
    def estimate_advantage(self, rewards, values, dones):
        """Compute the generalized advantage estimate for a trajectory.

        Uses the reverse recursion ``A_t = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}``
        with ``delta_t = r_t + gamma * V_{t+1} * (1 - d_t) - V_t``. The value after the
        last stored step is taken as 0, so the final transition contributes as well
        (it was previously skipped, leaving its advantage at 0).

        Args:
            rewards: per-step rewards (sequence of scalars or equally shaped arrays/tensors).
            values: per-step value estimates, same layout as ``rewards``.
            dones: per-step done flags (1 ends the episode at that step).

        Returns:
            float32 ``np.ndarray`` with the same shape as ``rewards``.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        values = np.asarray(values, dtype=np.float32)
        dones = np.asarray(dones, dtype=np.float32)
        advantage = np.zeros_like(rewards)
        running = np.zeros(rewards.shape[1:], dtype=np.float32)
        next_value = np.zeros(rewards.shape[1:], dtype=np.float32)
        for t in reversed(range(len(rewards))):
            not_done = 1.0 - dones[t]
            delta = rewards[t] + self._gamma * next_value * not_done - values[t]
            # (1 - done) keeps advantages from leaking across an episode boundary
            running = delta + self._gamma * self._gae_lambda * not_done * running
            advantage[t] = running
            next_value = values[t]
        return advantage

    def calc_rewards_to_go(self, rewards, values, advantages):
        """Discounted reward-to-go ``G_t = r_t + gamma * G_{t+1}`` for every step.

        ``values`` and ``advantages`` are unused; they are accepted so this method is
        interchangeable with ``calc_rewards_to_go2``.

        Returns:
            float32 ``np.ndarray`` with the same shape as ``rewards``.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        g = np.zeros_like(rewards)
        running = np.zeros(rewards.shape[1:], dtype=np.float32)
        # single backward pass instead of re-summing the tail for every t
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self._gamma * running
            g[t] = running
        return g

    def calc_rewards_to_go2(self, rewards, values, advantages):
        """Alternative return target: advantage plus value estimate (``rewards`` is unused)."""
        return advantages + np.asarray(values)

    @tf.function
    def distribution_form_policy(self, state):
        """Return the Normal distribution parameterized by the policy network for ``state``."""
        mu, sigma = self._policy_network(state)
        return tfd.Normal(mu, sigma)

    @tf.function
    def sample_actions_form_policy(self, state):
        """Sample actions for ``state`` and return them with their summed log-probabilities."""
        distribution = self.distribution_form_policy(state)
        actions = distribution.sample()
        log_probs = self.log_probs_form_distribution(distribution, actions)
        return actions, log_probs

    @tf.function
    def log_probs_form_policy(self, state, actions):
        """Log-probability of ``actions`` under the current policy for ``state``."""
        distribution = self.distribution_form_policy(state)
        return self.log_probs_form_distribution(distribution, actions)

    def log_probs_form_distribution(self, distribution, actions):
        """Sum the per-dimension log-probabilities over the last axis (kept as size 1)."""
        log_probs = distribution.log_prob(actions)
        return tfm.reduce_sum(log_probs, axis=-1, keepdims=True)

    def act_deterministic(self, state):
        """Step the environment with the policy mean for ``state``.

        Returns:
            ``(actions, next_observation, reward, done)``.
        """
        # The policy network returns (mu, sigma); the mean is the deterministic action.
        # (This previously referenced a non-existent ``self._actor`` attribute.)
        mu, _ = self._policy_network(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(mu)

    def act_stochastic(self, state):
        """Step the environment with an action sampled from the policy.

        Returns:
            ``(actions, next_observation, reward, done, log_probs)``.
        """
        actions_prime, log_probs = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime) + (log_probs,)

    def _act(self, actions):
        """Apply the first action of the batch; ``done`` is ``terminated or truncated``."""
        observation_prime, reward, terminated, truncated, _ = self._environment.step(actions[0])
        return actions, observation_prime, reward, terminated or truncated

    def learn(self, episode):
        """Run one actor pass followed by one critic pass over the ``episode`` dataset."""
        self.train_step_actor(episode)
        self.train_step_critic(episode)

    @tf.function
    def train_step_actor(self, episode):
        """Update the policy with the clipped surrogate objective, batch by batch.

        Stops iterating over the dataset as soon as the absolute mean log-ratio between
        current and old policy exceeds ``kld_threshold``.
        """
        for s, a, _, _, adv, log_prob_old_policy in episode:
            with tf.GradientTape() as tape:
                log_prob_current_policy = self.log_probs_form_policy(s, a)
                # ratio of current to old policy probability: p/p_old = exp(log p - log p_old)
                p = tf.math.exp(log_prob_current_policy - log_prob_old_policy)
                clipped_p = tf.clip_by_value(p, 1 - self._epsilon, 1 + self._epsilon)
                loss = -tfm.reduce_mean(tfm.minimum(p * adv, clipped_p * adv))
            # mean log-ratio, used as an approximation of the Kullback-Leibler divergence
            kld = tf.math.reduce_mean(log_prob_current_policy - log_prob_old_policy)
            if tfm.abs(kld) > self._kld_threshold:  # early stopping if the estimate is too high
                break
            gradients = tape.gradient(loss, self._policy_network.trainable_variables)
            self._policy_network.optimizer.apply_gradients(zip(gradients, self._policy_network.trainable_variables))

    @tf.function
    def train_step_critic(self, episode):
        """Regress the value network onto the rewards-to-go with a mean-squared-error loss."""
        for s, _, _, r_sum, _, _ in episode:
            with tf.GradientTape() as tape:
                prev_v = self._value_network(s)
                loss = self._mse(r_sum, prev_v)
            gradients = tape.gradient(loss, self._value_network.trainable_variables)
            self._value_network.optimizer.apply_gradients(zip(gradients, self._value_network.trainable_variables))

    def sample_to_episode_buffer(self):
        """Roll out one episode with the stochastic policy.

        Returns:
            ``(buffer, ret)``: the filled ``EpisodeBuffer`` and the undiscounted episode return.
        """
        buffer = EpisodeBuffer(self.estimate_advantage, self.calc_rewards_to_go, self._batch_size)
        s, _ = self._environment.reset()
        done = False
        ret = 0
        while not done:
            a, s_p, r, done, p = self.act_stochastic(s)
            ret += r
            v = self._value_network(tf.convert_to_tensor([s], dtype=tf.float32))
            # Drop the leading batch axis (size 1). Squeezing axis 1 instead only works
            # when the action dimension happens to be 1.
            buffer.add(s, tf.squeeze(a, 0), [r], tf.squeeze(v, 0), tf.squeeze(p, 0), float(done))
            s = s_p
        return buffer, ret

    def train(self, epochs):
        """Sample and learn from one episode per epoch, printing the running return statistics."""
        print("start training!")
        rets = []
        for e in range(epochs):
            buffer, ret = self.sample_to_episode_buffer()
            rets.append(ret)
            print("epoch:", e, "return of episode:", ret, "avg 100:", np.average(rets[-100:]))
            episode = buffer.get_as_data_set()
            self.learn(episode)
        print("training finished!")


# from Agent import Agent
# from GenericMLPs1D import create_policy_network, create_value_network
import gym
from functools import partial

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    env = gym.make('InvertedPendulum-v4')
    try:
        print("state_dim=", env.observation_space.shape, "action_dim=", env.action_space.shape[0], "action_scaling:",
              env.action_space.high)
        # Both network factories receive the observation shape tuple so their inputs are declared the same way.
        agent = Agent(environment=env, batch_size=8,
                      actor_network_generator=partial(create_policy_network, state_dim=env.observation_space.shape,
                                                      action_dim=env.action_space.shape[0]),
                      critic_network_generator=partial(create_value_network, state_dim=env.observation_space.shape))
        agent.train(10000)
    finally:
        # Release the simulator even when training is interrupted (e.g. KeyboardInterrupt).
        env.close()

state_dim= (4,) action_dim= 1 action_scaling: [3.]
start training!
epoch: 0
return of episode: 25.0 avg 100: 25.0
epoch: 1
return of episode: 7.0 avg 100: 16.0
epoch: 2
return of episode: 13.0 avg 100: 15.0
epoch: 3
return of episode: 19.0 avg 100: 16.0




epoch: 4
return of episode: 5.0 avg 100: 13.8




epoch: 5
return of episode: 10.0 avg 100: 13.166666666666666




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
return of episode: 18.0 avg 100: 22.85
epoch: 3738
return of episode: 17.0 avg 100: 22.78
epoch: 3739
return of episode: 15.0 avg 100: 22.81
epoch: 3740
return of episode: 17.0 avg 100: 22.57
epoch: 3741
return of episode: 21.0 avg 100: 22.68
epoch: 3742
return of episode: 29.0 avg 100: 22.83
epoch: 3743
return of episode: 27.0 avg 100: 22.93
epoch: 3744
return of episode: 23.0 avg 100: 23.01
epoch: 3745
return of episode: 16.0 avg 100: 22.98
epoch: 3746
return of episode: 22.0 avg 100: 22.87
epoch: 3747
return of episode: 28.0 avg 100: 22.73
epoch: 3748
return of episode: 22.0 avg 100: 22.73
epoch: 3749
return of episode: 29.0 avg 100: 22.76
epoch: 3750
return of episode: 25.0 avg 100: 22.56
epoch: 3751
return of episode: 42.0 avg 100: 22.84
epoch: 3752
return of episode: 27.0 avg 100: 22.87
epoch: 3753
return of episode: 18.0 avg 100: 22.94
epoch: 3754
return of episode: 20.0 avg 100: 22.96
epoch: 3755
return of episode

KeyboardInterrupt: ignored