In [1]:
# -y answers pip's "Proceed (Y/n)?" prompt so the cell also works under Restart & Run All.
%pip uninstall -y gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.9/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.9/dist-packages/gym/*
Proceed (Y/n)? y
  Successfully uninstalled gym-0.25.2


In [2]:
# Pinned to the release this notebook was run with (reset() returns a tuple, step() returns 5 values);
# the quotes keep the shell from treating the [mujoco] extras as a glob pattern.
%pip install "gym[mujoco]==0.26.2"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting imageio>=2.14.1
  Downloading imageio-2.26.0-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mujoco==2.2
  Downloading mujoco-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting glfw
  Downloading glfw-2.5.6-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py3

In [5]:
import tensorflow as tf
import numpy as np


class EpisodeBuffer:
    """Collects the transitions of one rollout and turns them into training tensors.

    Every value passed to `add` is converted to a float32 tensor and appended to a
    per-field list. `get_episode` computes generalized advantage estimates (GAE)
    from the stored rewards, value estimates and done flags and returns the
    stacked data.
    """

    def __init__(self, gamma, gae_lambda):
        """Create an empty buffer.

        Args:
            gamma: discount factor applied to the next step's value estimate.
            gae_lambda: GAE decay; a TD residual k steps ahead is weighted by
                `(gamma * gae_lambda) ** k`.
        """
        self._gamma = gamma
        self._gae_lambda = gae_lambda
        self._s = []
        self._a = []
        self._r = []
        self._v = []
        self._p = []
        self._d = []

    def add(self, s, a, r, v, p, d):
        """Append one transition; every argument is stored as a float32 tensor.

        Args:
            s: value appended to the state list.
            a: value appended to the action list.
            r: value appended to the reward list.
            v: value appended to the value-estimate list.
            p: value appended to the (log-)probability list.
            d: done flag of the step (stored as 0.0 / 1.0).
        """
        self._s.append(tf.convert_to_tensor(s, dtype=tf.float32))
        self._a.append(tf.convert_to_tensor(a, dtype=tf.float32))
        self._r.append(tf.convert_to_tensor(r, dtype=tf.float32))
        self._v.append(tf.convert_to_tensor(v, dtype=tf.float32))
        self._p.append(tf.convert_to_tensor(p, dtype=tf.float32))
        self._d.append(tf.convert_to_tensor(d, dtype=tf.float32))

    def estimate_advantage(self, rewards, values, dones):
        """Generalized advantage estimate, computed in one backward pass.

        With the TD residual
        `delta_t = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]`
        the advantage follows the recursion
        `adv[t] = delta_t + gamma * gae_lambda * (1 - dones[t]) * adv[t + 1]`.

        Args:
            rewards: per-step rewards (sequence of scalars or of equally shaped rows).
            values: per-step value estimates, shaped like `rewards`.
            dones: per-step done flags (truthy when the step ended the episode).

        Returns:
            float32 `np.ndarray` shaped like `rewards`.

        The final step has no stored successor value. When it is flagged done no
        bootstrap is needed and its residual is `rewards[-1] - values[-1]`;
        otherwise (rollout cut off without termination) its advantage stays 0 and
        it contributes nothing to earlier steps.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        values = np.asarray(values, dtype=np.float32)
        dones = np.asarray(dones, dtype=np.float32)
        advantage = np.zeros_like(rewards, dtype=np.float32)
        steps = len(rewards)
        if steps == 0:
            return advantage

        decay = self._gamma * self._gae_lambda
        last = steps - 1
        tail = 0.0  # advantage of the step after the one currently processed
        if np.all(dones[last]):
            # terminal transition: the successor value is zero by definition
            tail = rewards[last] - values[last]
            advantage[last] = tail
        for t in range(last - 1, -1, -1):
            not_done = 1.0 - dones[t]
            delta = rewards[t] + self._gamma * values[t + 1] * not_done - values[t]
            # not_done also stops residuals from leaking across an episode boundary
            tail = delta + decay * not_done * tail
            advantage[t] = tail
        return advantage

    def get_episode(self):
        """Return the stored rollout as tensors.

        Returns:
            Tuple `(states, actions, return_targets, advantages, log_probs)` where
            `return_targets = advantages + values`.
        """
        adv = self.estimate_advantage(self._r, self._v, self._d)
        g = adv + np.asarray(self._v)
        return (tf.convert_to_tensor(self._s), tf.convert_to_tensor(self._a), tf.convert_to_tensor(g),
                tf.convert_to_tensor(adv), tf.convert_to_tensor(self._p))


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense


def create_policy_network(learning_rate, state_dim, action_dim):
    """Build and compile the policy MLP.

    Three ReLU hidden layers of 256 units feed two heads of width `action_dim`:
    a linear one and a softplus one (used by `Agent` as the mean and standard
    deviation of a Normal distribution).

    Args:
        learning_rate: learning rate of the Adam optimizer attached via `compile`.
        state_dim: input shape handed to `keras.Input`.
        action_dim: number of units in each output head.

    Returns:
        Compiled `keras.Model` with outputs `(mu, sigma)`.
    """
    observation = keras.Input(shape=state_dim)
    hidden = observation
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    mean_head = Dense(action_dim, activation=None)(hidden)
    # softplus keeps the second head positive
    std_head = Dense(action_dim, activation=tf.nn.softplus)(hidden)
    policy = keras.Model(inputs=observation, outputs=(mean_head, std_head))
    policy.compile(optimizer=Adam(learning_rate=learning_rate))
    return policy


def create_value_network(learning_rate, state_dim):
    """Build and compile the state-value MLP.

    Three ReLU hidden layers of 256 units followed by a single linear output unit.

    Args:
        learning_rate: learning rate of the Adam optimizer attached via `compile`.
        state_dim: input shape handed to `keras.Input`.

    Returns:
        Compiled `keras.Model` producing one value per input row.
    """
    observation = keras.Input(shape=state_dim)
    hidden = observation
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    value_head = Dense(1, activation=None)(hidden)
    critic = keras.Model(inputs=observation, outputs=value_head)
    critic.compile(optimizer=Adam(learning_rate=learning_rate))
    return critic


# from EpisodeBuffer import EpisodeBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


class Agent:
    """Actor-critic agent trained with a clipped-surrogate (PPO-style) policy loss.

    The policy network outputs `(mu, sigma)` of a per-dimension Normal distribution
    over actions; the value network is regressed (MSE) onto the return targets
    delivered by `EpisodeBuffer.get_episode`. Actor updates stop early once the
    approximated KL divergence to the sampling policy exceeds `kld_threshold`.
    """

    def __init__(self, environment, actor_network_generator, critic_network_generator, updates_per_episode=80,
                 epsilon=0.2, gae_lambda=0.95, learning_rate=0.0003, gamma=0.99, alpha=0.2, kld_threshold=0.05):
        """Store hyper-parameters and build both networks.

        Args:
            environment: object with `reset()` returning `(observation, info)` and
                `step(action)` returning a 5-tuple ending in
                `terminated, truncated, info`.
            actor_network_generator: callable `learning_rate -> policy model`.
            critic_network_generator: callable `learning_rate -> value model`.
            updates_per_episode: maximum number of actor steps and exact number of
                critic steps per collected episode.
            epsilon: clipping range of the probability ratio (`1 +/- epsilon`).
            gae_lambda: GAE decay passed to the episode buffer.
            learning_rate: learning rate handed to both network generators.
            gamma: discount factor passed to the episode buffer.
            alpha: weight of the entropy term in the policy loss.
            kld_threshold: absolute approximated KL value above which the actor
                step is skipped and early stopping is signalled.
        """
        self._updates_per_episode = updates_per_episode
        self._environment = environment
        self._epsilon = epsilon
        self._gae_lambda = gae_lambda
        self._gamma = gamma
        self._alpha = alpha
        self._learning_rate = learning_rate
        self._mse = tf.keras.losses.MeanSquaredError()
        self._policy_network = actor_network_generator(learning_rate)
        self._value_network = critic_network_generator(learning_rate)
        self._kld_threshold = kld_threshold

    def distribution_form_policy(self, state):
        """Return the Normal action distribution the policy network predicts for `state`."""
        mu, sigma = self._policy_network(state)
        return tfd.Normal(mu, sigma)

    def sample_actions_form_policy(self, state):
        """Sample actions for `state`; returns `(actions, summed log-probabilities)`."""
        distribution = self.distribution_form_policy(state)
        actions = distribution.sample()
        log_probs = self.log_probs_form_distribution(distribution, actions)
        return actions, log_probs

    def log_probs_form_policy(self, state, actions):
        """Evaluate `actions` under the current policy.

        Returns:
            `(log_probs, entropy)`: log-probabilities summed over the last axis and
            the per-dimension entropy of the distribution.
        """
        distribution = self.distribution_form_policy(state)
        return self.log_probs_form_distribution(distribution, actions), distribution.entropy()

    def log_probs_form_distribution(self, distribution, actions):
        """Sum the per-dimension log-probabilities over the last axis (kept as size 1)."""
        log_probs = distribution.log_prob(actions)
        return tfm.reduce_sum(log_probs, axis=-1, keepdims=True)

    def act_stochastic(self, state):
        """Sample an action for a single `state`, step the environment with it.

        Returns:
            `(actions, next_observation, reward, done, log_probs)`.
        """
        # wrap the single state in a list so the networks see a batch of one
        actions_prime, log_probs = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime) + (log_probs,)

    def _act(self, actions):
        """Step the environment with the first row of `actions`.

        Returns:
            `(actions, next_observation, reward, done)` where `done` is true when
            the environment reported either termination or truncation.
        """
        observation_prime, reward, terminated, truncated, _ = self._environment.step(actions[0])
        return actions, observation_prime, reward, terminated or truncated

    def learn2(self, episode):
        """Eager variant of `learn`.

        Runs at most `updates_per_episode` actor steps (breaking as soon as
        `train_step_actor` signals early stopping), then `updates_per_episode`
        critic steps.
        """
        s, a, r_sum, adv, porb_old_policy = episode
        for _ in range(self._updates_per_episode):
            if self.train_step_actor(s, a, adv, porb_old_policy):
                break
        for _ in range(self._updates_per_episode):
            self.train_step_critic(s, r_sum)

    @tf.function
    def learn(self, episode):
        """Graph-compiled update on one episode.

        Args:
            episode: tuple `(states, actions, return_targets, advantages,
                old_log_probs)` as returned by `EpisodeBuffer.get_episode`.

        Runs at most `updates_per_episode` actor steps, ending as soon as
        `train_step_actor` signals early stopping, followed by
        `updates_per_episode` critic steps (same rule as `learn2`).
        """
        s, a, r_sum, adv, porb_old_policy = episode

        st = False
        it = 0
        # Both conditions have to hold: no early stop yet AND update budget left.
        # Joining them with `or` would keep iterating after an early stop and
        # would not bound the loop when no early stop ever happens.
        c = lambda stop, i: tf.logical_and(tf.logical_not(stop), i < self._updates_per_episode)
        b = lambda stop, i: (self.train_step_actor(s, a, adv, porb_old_policy), i + 1)
        tf.while_loop(c, b, [st, it])

        for _ in range(self._updates_per_episode):
            self.train_step_critic(s, r_sum)

    def train_step_actor(self, s, a, adv, porb_old_policy):
        """One policy update with the clipped surrogate objective.

        Args:
            s: batch of states.
            a: batch of actions taken in `s`.
            adv: advantages of those actions.
            porb_old_policy: log-probabilities of `a` under the policy that
                sampled them.

        Returns:
            Truthy when the update was skipped because the absolute approximated
            KL divergence exceeded `kld_threshold`, falsy after a normal update.
        """
        early_stop = False
        with tf.GradientTape() as tape:
            porb_current_policy, entropy = self.log_probs_form_policy(s, a)
            # approximated Kullback-Leibler divergence: mean difference of the log-probabilities
            kld = tf.math.reduce_mean(porb_current_policy - porb_old_policy)
            if tfm.abs(kld) > self._kld_threshold:  # policy moved too far: skip the update
                early_stop = True
            else:
                # probability ratio new/old, obtained from log-probs: exp(log p - log p_old)
                p = tf.math.exp(porb_current_policy - porb_old_policy)
                clipped_p = tf.clip_by_value(p, 1 - self._epsilon, 1 + self._epsilon)
                policy_loss = -tfm.reduce_mean(tfm.minimum(p * adv, clipped_p * adv))
                # negative mean entropy: minimizing it favours higher-entropy policies
                entropy_loss = -tfm.reduce_mean(entropy)
                loss = policy_loss + self._alpha * entropy_loss

                gradients = tape.gradient(loss, self._policy_network.trainable_variables)
                self._policy_network.optimizer.apply_gradients(zip(gradients, self._policy_network.trainable_variables))
        return early_stop

    def train_step_critic(self, s, r_sum):
        """One value-network update: MSE between `r_sum` and the predicted values of `s`."""
        with tf.GradientTape() as tape:
            prev_v = self._value_network(s)
            loss = self._mse(r_sum, prev_v)
        gradients = tape.gradient(loss, self._value_network.trainable_variables)
        self._value_network.optimizer.apply_gradients(zip(gradients, self._value_network.trainable_variables))

    def sample_to_episode_buffer(self, max_steps_per_episode):
        """Roll out one episode with the stochastic policy.

        Args:
            max_steps_per_episode: step limit, or `None` to run until the
                environment reports done.

        Returns:
            `(buffer, ret)`: the filled `EpisodeBuffer` and the undiscounted sum
            of the collected rewards.
        """
        buffer = EpisodeBuffer(self._gamma, self._gae_lambda)
        s, _ = self._environment.reset()
        d = 0
        ret = 0
        i = 0
        while not d and (max_steps_per_episode is None or i < max_steps_per_episode):
            a, s_p, r, d, p = self.act_stochastic(s)
            ret += r
            # value estimate of the state the action was taken in
            v = self._value_network(tf.convert_to_tensor([s], dtype=tf.float32))
            buffer.add(s, tf.squeeze(a, 1), [r], tf.squeeze(v, 1), tf.squeeze(p, 1), d)
            s = s_p
            i += 1
        return buffer, ret

    def train(self, epochs, max_steps_per_episode=None):
        """Alternate between collecting one episode and learning from it.

        Args:
            epochs: number of episodes to collect and train on.
            max_steps_per_episode: optional step limit per episode.

        Prints the return of every episode and the average over the last
        (up to) 100 episodes.
        """
        print("start training!")
        rets = []
        for e in range(epochs):
            buffer, ret = self.sample_to_episode_buffer(max_steps_per_episode)
            rets.append(ret)
            print("epoch:", e, "return of episode:", ret, "avg 100:", np.average(rets[-100:]))
            episode = buffer.get_episode()
            self.learn(episode)
        print("training finished!")


import tensorflow as tf
# from Agent import Agent
# from GenericMLPs1D import create_policy_network, create_value_network
import gym
from functools import partial

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    env = gym.make('InvertedPendulum-v4')
    try:
        print("state_dim=", env.observation_space.shape, "action_dim=", env.action_space.shape[0], "action_scaling:",
              env.action_space.high)
        # Hand the observation shape to both generators as a tuple: keras.Input documents
        # `shape` as a shape tuple, and the two networks now receive the same kind of value.
        state_shape = env.observation_space.shape
        agent = Agent(environment=env,
                      actor_network_generator=partial(create_policy_network, state_dim=state_shape,
                                                      action_dim=env.action_space.shape[0]),
                      critic_network_generator=partial(create_value_network, state_dim=state_shape))
        agent.train(epochs=1000, max_steps_per_episode=200)
    finally:
        # release the simulator even when training is interrupted (e.g. KeyboardInterrupt)
        env.close()

state_dim= (4,) action_dim= 1 action_scaling: [3.]
start training!
epoch: 0 return of episode: 15.0 avg 100: 15.0
epoch: 1 return of episode: 13.0 avg 100: 14.0
epoch: 2 return of episode: 30.0 avg 100: 19.333333333333332
epoch: 3 return of episode: 27.0 avg 100: 21.25
epoch: 4 return of episode: 26.0 avg 100: 22.2
epoch: 5 return of episode: 10.0 avg 100: 20.166666666666668
epoch: 6 return of episode: 18.0 avg 100: 19.857142857142858
epoch: 7 return of episode: 22.0 avg 100: 20.125
epoch: 8 return of episode: 6.0 avg 100: 18.555555555555557
epoch: 9 return of episode: 36.0 avg 100: 20.3
epoch: 10 return of episode: 17.0 avg 100: 20.0
epoch: 11 return of episode: 26.0 avg 100: 20.5
epoch: 12 return of episode: 19.0 avg 100: 20.384615384615383
epoch: 13 return of episode: 43.0 avg 100: 22.0
epoch: 14 return of episode: 55.0 avg 100: 24.2
epoch: 15 return of episode: 44.0 avg 100: 25.4375
epoch: 16 return of episode: 49.0 avg 100: 26.823529411764707
epoch: 17 return of episode: 61.0 avg 



epoch: 31 return of episode: 145.0 avg 100: 68.15625




epoch: 32 return of episode: 183.0 avg 100: 71.63636363636364
epoch: 33 return of episode: 221.0 avg 100: 76.02941176470588
epoch: 34 return of episode: 210.0 avg 100: 79.85714285714286
epoch: 35 return of episode: 161.0 avg 100: 82.11111111111111
epoch: 36 return of episode: 222.0 avg 100: 85.89189189189189
epoch: 37 return of episode: 127.0 avg 100: 86.97368421052632
epoch: 38 return of episode: 181.0 avg 100: 89.38461538461539
epoch: 39 return of episode: 165.0 avg 100: 91.275
epoch: 40 return of episode: 139.0 avg 100: 92.4390243902439
epoch: 41 return of episode: 186.0 avg 100: 94.66666666666667
epoch: 42 return of episode: 113.0 avg 100: 95.09302325581395
epoch: 43 return of episode: 98.0 avg 100: 95.1590909090909
epoch: 44 return of episode: 89.0 avg 100: 95.02222222222223
epoch: 45 return of episode: 91.0 avg 100: 94.93478260869566
epoch: 46 return of episode: 98.0 avg 100: 95.0
epoch: 47 return of episode: 100.0 avg 100: 95.10416666666667
epoch: 48 return of episode: 92.0 avg 

KeyboardInterrupt: ignored