In [None]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.10/dist-packages/gym/*
Proceed (Y/n)? y
  Successfully uninstalled gym-0.25.2


In [None]:
!pip3 install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mujoco==2.2 (from gym[mujoco])
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco==2.2->gym[mujoco])
  Downloading glfw-2.5.9-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense


def create_policy_network(state_dim, action_dim):
    """Build the policy MLP with a mean head and a standard-deviation head.

    Three 256-unit ReLU layers feed two output layers of width ``action_dim``:
    ``mu`` (no activation) and ``sigma`` (softplus activation).

    Args:
        state_dim: Shape of a single observation, given either as a plain int
            (e.g. ``4``) or as a shape tuple (e.g. ``(4,)``).
        action_dim: Number of units of each output head.

    Returns:
        A ``keras.Model`` whose outputs are the tuple ``(mu, sigma)``.
    """
    # A bare int is wrapped into a 1-tuple so both calling styles build the same
    # input layer; any other value is forwarded to keras.Input unchanged.
    input_shape = (state_dim,) if isinstance(state_dim, int) else state_dim
    inputs = keras.Input(shape=input_shape)
    hidden = inputs
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    mu = Dense(action_dim, activation=None)(hidden)
    sigma = Dense(action_dim, activation=tf.nn.softplus)(hidden)
    return keras.Model(inputs=inputs, outputs=(mu, sigma))


def create_value_network(state_dim):
    """Build the value MLP: three 256-unit ReLU layers and one linear output unit.

    Args:
        state_dim: Shape of a single observation, given either as a plain int
            (e.g. ``4``) or as a shape tuple (e.g. ``(4,)``).

    Returns:
        A ``keras.Model`` with a single output of width 1 and no activation.
    """
    # A bare int is wrapped into a 1-tuple so both calling styles build the same
    # input layer; any other value is forwarded to keras.Input unchanged.
    input_shape = (state_dim,) if isinstance(state_dim, int) else state_dim
    inputs = keras.Input(shape=input_shape)
    hidden = inputs
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)
    out = Dense(1, activation=None)(hidden)
    return keras.Model(inputs=inputs, outputs=out)


import tensorflow as tf


class ReplayBuffer:
    """Holds the datasets produced by rollouts and serves them as batched pipelines.

    Entries are only used through their ``shuffle``, ``repeat`` and ``concatenate``
    methods, and the combined result through ``shuffle``, ``batch`` and ``repeat``.
    """

    def __init__(self, batch_size, shuffle_buffer_size=1024):
        """Remember the batch size and shuffle-buffer size; start with no entries."""
        self._shuffle_buffer_size = shuffle_buffer_size
        self._batch_size = batch_size
        self._episodes = []

    def add_episodes(self, episodes):
        """Append one dataset to the buffer."""
        self._episodes.append(episodes)

    def get_as_dataset_of_length(self, dataset_size=10000):
        """Return a batched dataset of ``dataset_size`` elements drawn across all entries.

        Every entry is shuffled and repeated; a random index sequence then chooses
        which entry supplies each element.
        """
        # NOTE(review): the entry indices 0..n-1 are passed as the *logits* argument of
        # tf.random.categorical, so higher indices are drawn more often — confirm intent.
        logits = [[float(index) for index in range(len(self._episodes))]]
        picks = tf.squeeze(tf.random.categorical(logits, dataset_size))
        choice_dataset = tf.data.Dataset.from_tensor_slices(picks)
        sources = []
        for episode in self._episodes:
            sources.append(episode.shuffle(self._shuffle_buffer_size).repeat())
        mixed = tf.data.Dataset.choose_from_datasets(sources, choice_dataset)
        return mixed.batch(self._batch_size)

    def get_as_dataset(self):
        """Concatenate all entries in insertion order, then shuffle and batch the result.

        Raises:
            IndexError: if the buffer holds no entries.
        """
        merged = self._episodes[0]
        for later in self._episodes[1:]:
            merged = merged.concatenate(later)
        shuffled = merged.shuffle(self._shuffle_buffer_size)
        return shuffled.batch(self._batch_size)

    def get_as_dataset_repeated(self, count=4):
        """Return ``get_as_dataset()`` repeated ``count`` times."""
        batched = self.get_as_dataset()
        return batched.repeat(count)

    def clear(self):
        """Empty the buffer in place."""
        del self._episodes[:]


# from ReplayBuffer import ReplayBuffer
# from Policy import Policy
# from RollOutWorker import RollOutWorker
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow.keras.optimizers import Adam
import numpy as np
from collections.abc import Iterable


class Agent:
    """PPO-style agent: owns the policy/value networks, their optimizers and the rollout workers.

    Each epoch in ``train`` collects trajectories from every worker, stores them in a
    ``ReplayBuffer`` and runs clipped-ratio policy updates plus MSE value updates on them.
    """

    def __init__(self, environments, actor_network_generator, critic_network_generator, updates_per_episode=80,
                 epsilon=0.2, gae_lambda=0.95, learning_rate=0.0003, gamma=0.99, alpha=0.2, kld_threshold=0.05,
                 normalize_adv=False):
        """Create networks, optimizers, the policy wrapper and one rollout worker per environment.

        Args:
            environments: A single environment or an iterable of environments.
            actor_network_generator: Zero-argument callable returning the policy network.
            critic_network_generator: Zero-argument callable returning the value network.
            updates_per_episode: Stored as ``_updates_per_epoch``; not read anywhere in this class.
            epsilon: Clipping range of the probability ratio (``1 - epsilon`` .. ``1 + epsilon``).
            gae_lambda: Lambda forwarded to the rollout workers for advantage estimation.
            learning_rate: Learning rate of both Adam optimizers.
            gamma: Discount factor forwarded to the rollout workers.
            alpha: Weight of the entropy term in the actor loss.
            kld_threshold: Actor updates stop once the batch KL estimate exceeds this value.
            normalize_adv: If true, advantages are standardized per batch before the actor update.
        """
        self._updates_per_epoch = updates_per_episode
        self._epsilon = epsilon
        self._gae_lambda = gae_lambda
        self._gamma = gamma
        self._alpha = alpha
        self._learning_rate = learning_rate
        self._mse = tf.keras.losses.MeanSquaredError()
        self._policy_network = actor_network_generator()
        self._value_network = critic_network_generator()
        self._optimizer_policy = Adam(learning_rate=learning_rate)  # TODO: one optimizer for both?
        self._optimizer_value = Adam(learning_rate=learning_rate)
        self._kld_threshold = kld_threshold
        self._normalize_adv = normalize_adv
        self._policy = Policy(self._policy_network)
        # option for multiple workers for future parallelization
        if not isinstance(environments, Iterable):
            environments = [environments]
        self._workers = [RollOutWorker(self._policy, self._value_network, env, self._gamma, self._gae_lambda)
                         for env in environments]

    @tf.function
    def learn(self, data_set):
        """Run actor and critic updates over ``data_set`` until it is exhausted or KL early stopping fires.

        The batch that triggers early stopping is not counted and gets no critic update.

        Returns:
            ``(mean_kld, mean_actor_loss, mean_critic_loss, updates, last_kld)`` where the means are
            taken over the ``updates`` completed batches and ``last_kld`` is the KL estimate of the
            last batch that was evaluated.
        """
        kld, actor_loss, critic_loss = 0.0, 0.0, 0.0
        kld_next, actor_loss_next = 0.0, 0.0
        i = 0.0
        for s, a, ret, adv, prob_old_policy in data_set:
            early_stopping, kld_next, actor_loss_next = self.train_step_actor(s, a, adv, prob_old_policy)
            if early_stopping:
                break
            kld += kld_next
            actor_loss += actor_loss_next
            critic_loss += self.train_step_critic(s, ret)
            i += 1
        # divide_no_nan yields 0 instead of NaN when early stopping fired on the very first batch (i == 0).
        return (tfm.divide_no_nan(kld, i), tfm.divide_no_nan(actor_loss, i), tfm.divide_no_nan(critic_loss, i),
                i, kld_next)

    # Alternative that does not terminate the training of the value network if KLD is too high
    @tf.function
    def learn2(self, data_set):
        """Like ``learn``, but the critic is trained on the whole ``data_set`` in a separate pass.

        Returns:
            ``(mean_kld, mean_actor_loss, mean_critic_loss, actor_updates, last_kld)``; the actor
            means are over the completed actor batches, the critic mean over all critic batches.
        """
        kld, actor_loss, critic_loss = 0.0, 0.0, 0.0
        kld_next, actor_loss_next = 0.0, 0.0
        i = 0.0
        j = 0.0
        for s, a, _, adv, prob_old_policy in data_set:
            early_stopping, kld_next, actor_loss_next = self.train_step_actor(s, a, adv, prob_old_policy)
            if early_stopping:
                break
            kld += kld_next
            actor_loss += actor_loss_next
            i += 1
        for s, _, ret, _, _ in data_set:
            critic_loss += self.train_step_critic(s, ret)
            j += 1
        # divide_no_nan yields 0 instead of NaN when a counter is still 0
        # (early stop on the first batch, or an empty data set).
        return (tfm.divide_no_nan(kld, i), tfm.divide_no_nan(actor_loss, i), tfm.divide_no_nan(critic_loss, j),
                i, kld_next)

    @tf.function
    def train_step_actor(self, s, a, adv, prob_old_policy):
        """Perform one clipped-ratio policy update on a batch unless the KL estimate is too high.

        Args:
            s: Batch of states.
            a: Batch of actions taken in those states.
            adv: Batch of advantage estimates.
            prob_old_policy: Log-probabilities of ``a`` recorded when the data was collected.

        Returns:
            ``(early_stopping, kld, loss)``; when ``early_stopping`` is true no gradient step was
            applied and ``loss`` is 0.0.
        """
        early_stopping = False
        loss = 0.0
        if self._normalize_adv:
            adv = (adv - tfm.reduce_mean(adv)) / (tfm.reduce_std(adv) + 1e-8)
        with tf.GradientTape() as tape:
            distribution = self._policy.distribution_from_policy(s)
            prob_current_policy = self._policy.log_probs_from_distribution(distribution, a)
            log_ratio = prob_current_policy - prob_old_policy
            # batch mean of (ratio - 1) - log(ratio), used as the KL estimate between old and current policy
            kld = tf.math.reduce_mean((tf.math.exp(log_ratio) - 1) - log_ratio)
            if kld > self._kld_threshold:  # early stopping if KLD is too high
                early_stopping = True
            else:
                # prob of current policy / prob of old policy (log probs: p/p2 = log(p)-log(p2)
                p = tf.math.exp(prob_current_policy - prob_old_policy)  # exp() to un do log(p)
                clipped_p = tf.clip_by_value(p, 1 - self._epsilon, 1 + self._epsilon)
                policy_loss = -tfm.reduce_mean(tfm.minimum(p * adv, clipped_p * adv))
                # entropy_loss = -tfm.reduce_mean(-prob_current_policy)  # approximate entropy
                # negative mean entropy: minimizing the loss therefore increases entropy
                entropy_loss = -tfm.reduce_mean(distribution.entropy())
                loss = policy_loss + self._alpha * entropy_loss

                gradients = tape.gradient(loss, self._policy_network.trainable_variables)
                self._optimizer_policy.apply_gradients(zip(gradients, self._policy_network.trainable_variables))
        return early_stopping, kld, loss

    @tf.function
    def train_step_critic(self, s, r_sum):
        """Apply one MSE gradient step moving the value network's output for ``s`` toward ``r_sum``.

        Returns:
            The MSE loss computed before the update.
        """
        with tf.GradientTape() as tape:
            prev_v = self._value_network(s)
            loss = self._mse(r_sum, prev_v)
        gradients = tape.gradient(loss, self._value_network.trainable_variables)
        self._optimizer_value.apply_gradients(zip(gradients, self._value_network.trainable_variables))
        return loss

    def train(self, epochs, batch_size=64, sub_epochs=4, steps_per_trajectory=1024):
        """Alternate between collecting rollouts and learning from them, printing progress.

        Args:
            epochs: Number of collect-then-learn iterations.
            batch_size: Batch size of the replay buffer's datasets.
            sub_epochs: How many times the collected data is repeated for ``learn``.
            steps_per_trajectory: Environment steps each worker performs per epoch.
        """
        print("start training!")
        rets = []
        replay_buffer = ReplayBuffer(batch_size)
        for e in range(epochs):
            trajectories = [worker.sample_trajectories(steps_per_trajectory) for worker in
                            self._workers]
            ac_ret = 0.0
            ac_dones = 0
            for episodes, ret, dones in trajectories:
                replay_buffer.add_episodes(episodes)
                ac_ret += ret
                ac_dones += dones
            # average the per-worker returns
            ac_ret = ac_ret / len(self._workers)
            rets.append(ac_ret)
            print("epoch:", e, "return of episode:", ac_ret, "avg 10:", np.average(rets[-10:]), "dones:", ac_dones)
            kld, actor_loss, critic_loss, i, last_kld = self.learn(replay_buffer.get_as_dataset_repeated(sub_epochs))
            print(
                f"kld: {kld}, actor_loss: {actor_loss}, critic_loss: {critic_loss}, updates: {i}, last_kld: {last_kld}")
            # the data is discarded after one learning phase; the next epoch collects fresh rollouts
            replay_buffer.clear()
        print("training finished!")


import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd


class Policy:
    """Turns the two outputs of a policy network into a Normal distribution to sample actions from."""

    def __init__(self, policy_network):
        """Keep a reference to the network; calling it on a state must yield a ``(mu, sigma)`` pair."""
        self._policy_network = policy_network

    def distribution_from_policy(self, state):
        """Evaluate the network on ``state`` and wrap its two outputs in ``tfd.Normal``."""
        mean, std = self._policy_network(state)
        return tfd.Normal(mean, std)

    def sample_actions_from_policy(self, state):
        """Sample actions for ``state``; returns ``(actions, log_probs)``."""
        dist = self.distribution_from_policy(state)
        sampled = dist.sample()
        return sampled, self.log_probs_from_distribution(dist, sampled)

    def log_probs_from_distribution(self, distribution, actions):
        """Log-probability of ``actions``, summed over the last axis (which is kept with size 1)."""
        per_component = distribution.log_prob(actions)
        return tfm.reduce_sum(per_component, axis=-1, keepdims=True)

    def act_stochastic(self, state, environment):
        """Sample an action for a single ``state`` and execute it in ``environment``.

        Returns:
            ``(actions, next_observation, reward, done, log_probs)`` where ``done`` is
            ``terminated or truncated`` and ``actions``/``log_probs`` keep the batch axis.
        """
        # wrap the single state in a list so the network receives a batch of one
        batched_state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions, log_probs = self.sample_actions_from_policy(batched_state)
        next_observation, reward, terminated, truncated, _ = environment.step(actions[0])
        done = terminated or truncated
        return actions, next_observation, reward, done, log_probs


import tensorflow as tf
import numpy as np


class RollOutWorker:
    """Steps one environment with the current policy and packages the transitions as a dataset.

    The worker keeps its current state between calls to ``sample_trajectories``, so an
    episode that is still running when one call ends is continued by the next call.
    """

    def __init__(self, policy, value_network, environment, gamma, gae_lambda):
        """Store collaborators and hyper-parameters and reset the environment once.

        Args:
            policy: Object offering ``act_stochastic(state, environment)``.
            value_network: Callable mapping a batch of states to value estimates.
            environment: Environment whose ``reset()`` returns ``(observation, info)``.
            gamma: Discount factor used in advantage estimation.
            gae_lambda: Lambda used in advantage estimation.
        """
        self._policy = policy
        self._value_network = value_network
        self._environment = environment
        self._gamma = gamma
        self._gae_lambda = gae_lambda
        # per-step storage: states, actions, rewards, values, log-probs, done flags
        self._s = []
        self._a = []
        self._r = []
        self._v = []
        self._p = []
        self._d = []

        self.__s, _ = self._environment.reset()  # state the next action is computed from
        self.__d = 0  # 1.0 if the step that led to the current state ended an episode
        self.__d_p = 0.0  # done flag of the most recent step
        self.__s_p = None  # observation returned by the most recent step
        self.__ret = 0  # reward accumulated in the episode currently running

    def add(self, s, a, r, v, p, d):
        """Append one transition, converting every field to a float32 tensor."""
        self._s.append(tf.convert_to_tensor(s, dtype=tf.float32))
        self._a.append(tf.convert_to_tensor(a, dtype=tf.float32))
        self._r.append(tf.convert_to_tensor(r, dtype=tf.float32))
        self._v.append(tf.convert_to_tensor(v, dtype=tf.float32))
        self._p.append(tf.convert_to_tensor(p, dtype=tf.float32))
        self._d.append(tf.convert_to_tensor(d, dtype=tf.float32))

    def clear(self):
        """Drop all stored transitions."""
        self._s.clear()
        self._a.clear()
        self._r.clear()
        self._v.clear()
        self._p.clear()
        self._d.clear()

    # generalized advantage estimate (taken from https://ppo-details.cleanrl.dev//2021/11/05/ppo-implementation-details/)
    def estimate_advantage(self, rewards, values, dones, next_done, next_value):  # TODO: rework
        """Compute generalized advantage estimates by a backward pass over the stored steps.

        Args:
            rewards: Per-step rewards.
            values: Per-step value estimates of the states the actions were taken in.
            dones: ``dones[t]`` is non-zero if step ``t`` started from a freshly reset state.
            next_done: Done flag of the step following the last stored one.
            next_value: Value estimate used to bootstrap after the last stored step.

        Returns:
            A numpy array shaped like ``rewards`` holding the advantage of every step.
        """
        adv = np.zeros_like(rewards)
        running_gae = 0
        last_index = len(rewards) - 1
        for t in reversed(range(len(rewards))):
            if t == last_index:
                next_non_terminal = 1.0 - next_done
                next_values = next_value
            else:
                next_non_terminal = 1.0 - dones[t + 1]
                next_values = values[t + 1]
            # the non-terminal mask removes both the bootstrap value and the carried
            # advantage whenever step t ended an episode
            delta = rewards[t] + self._gamma * next_values * next_non_terminal - values[t]
            adv[t] = running_gae = delta + self._gamma * self._gae_lambda * next_non_terminal * running_gae
        return adv

    def _advance(self, next_observation, done):
        """Make the worker ready for the next step.

        If ``done`` is falsy the worker continues from ``next_observation``. Otherwise the
        environment is reset, the worker continues from the observation returned by
        ``reset()`` and the running episode return is cleared.

        Returns:
            The return of the episode that just finished, or ``None`` if it is still running.
        """
        self.__d = done
        if not done:
            self.__s = next_observation
            return None
        finished_return = self.__ret
        self.__ret = 0
        # Keep the reset observation: continuing from the terminal observation would feed
        # the policy a state that no longer matches the environment.
        self.__s, _ = self._environment.reset()
        return finished_return

    # TODO: Make Vectorized!!!
    def sample_trajectories(self, steps_per_trajectory):
        """Collect ``steps_per_trajectory`` transitions and return them with return statistics.

        Returns:
            ``(dataset, ret, finished)`` where ``dataset`` yields
            ``(state, action, value_target, advantage, log_prob)`` tuples, ``finished`` is the
            number of episodes completed during this call and ``ret`` is their mean return
            (or the running return of the unfinished episode if none completed).
        """
        ack_ret = 0
        x = 0
        for _ in range(steps_per_trajectory):
            a, self.__s_p, r, self.__d_p, p = self._policy.act_stochastic(self.__s, self._environment)
            self.__d_p = float(self.__d_p)
            self.__ret += r
            v = self._value_network(tf.convert_to_tensor([self.__s], dtype=tf.float32))
            # self.__d still holds the previous step's done flag, i.e. whether self.__s is a fresh start
            self.add(self.__s, tf.squeeze(a, 1), [r], tf.squeeze(v, 1), tf.squeeze(p, 1), self.__d)
            finished_return = self._advance(self.__s_p, self.__d_p)
            if finished_return is not None:
                x += 1
                ack_ret += finished_return
        # bootstrap from the state the next action would be taken in; it is masked out by
        # next_done when the last step ended an episode
        v_p = self._value_network(tf.convert_to_tensor([self.__s], dtype=tf.float32))
        adv = self.estimate_advantage(self._r, self._v, self._d, next_done=self.__d, next_value=v_p)
        g = adv + np.asarray(self._v)  # TD(lambda)
        ds = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(self._s), tf.convert_to_tensor(self._a),
                                                 tf.convert_to_tensor(g), tf.convert_to_tensor(adv),
                                                 tf.convert_to_tensor(self._p)))
        self.clear()
        return ds, (self.__ret if x < 1 else ack_ret / x), x


# from Agent import Agent
# from GenericMLPs1D import create_policy_network, create_value_network
import gym
from functools import partial
import tensorflow as tf

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    env = gym.make('InvertedPendulum-v4')
    # Both network factories receive the same observation shape tuple instead of an int
    # for one and a tuple for the other.
    observation_shape = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    print("state_dim=", observation_shape, "action_dim=", action_dim, "action_scaling:",
          env.action_space.high)
    try:
        agent = Agent(environments=env,
                      actor_network_generator=partial(create_policy_network, state_dim=observation_shape,
                                                      action_dim=action_dim),
                      critic_network_generator=partial(create_value_network, state_dim=observation_shape))

        agent.train(epochs=1000, batch_size=64, sub_epochs=4, steps_per_trajectory=640)
    finally:
        # release the simulator even if training is interrupted
        env.close()

state_dim= (4,) action_dim= 1 action_scaling: [3.]
start training!
epoch: 0 return of episode: 9.696969696969697 avg 10: 9.696969696969697 dones: 66
kld: 0.0056070988066494465, actor_loss: -5.286477565765381, critic_loss: 26.44529151916504, updates: 40.0, last_kld: 0.002829697448760271
epoch: 1 return of episode: 10.258064516129032 avg 10: 9.977517106549364 dones: 62
kld: 0.008141379803419113, actor_loss: -3.0000336170196533, critic_loss: 22.622722625732422, updates: 40.0, last_kld: 0.003732385579496622
epoch: 2 return of episode: 13.040816326530612 avg 10: 10.998616846543113 dones: 49
kld: 0.006017050705850124, actor_loss: -3.509716033935547, critic_loss: 36.59486389160156, updates: 10.0, last_kld: 0.06666044890880585
epoch: 3 return of episode: 14.906976744186046 avg 10: 11.975706820953846 dones: 43
kld: 0.014186044223606586, actor_loss: -2.4896230697631836, critic_loss: 33.40525436401367, updates: 27.0, last_kld: 0.18440169095993042
epoch: 4 return of episode: 20.419354838709676 avg