In [1]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.2


In [2]:
!pip3 install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.26.1.tar.gz (719 kB)
[K     |████████████████████████████████| 719 kB 7.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting imageio>=2.14.1
  Downloading imageio-2.22.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 39.1 MB/s 
[?25hCollecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 49.3 MB/s 
Collecting glfw
  Downloading glfw-2.5.5-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[K     |████████████████████████████████| 207 kB 43.9 MB/s 
[?25hCollecting pillow>=8.3.2
  Downloading Pillow-9.2.0-cp37-cp37m-manylinux_2_

In [27]:
import tensorflow as tf
import numpy as np


class EpisodeBuffer:
    """Stores the transitions of one rollout and exports them as a ``tf.data.Dataset``.

    The buffer itself does not know how to compute returns or advantages;
    both computations are injected as callables so the owner decides on the
    discounting scheme.
    """

    def __init__(self, advantage_estimator, calc_rewards_to_go):
        """Create an empty buffer.

        :param advantage_estimator: callable invoked as ``f(rewards, values, dones)``
            with the stored per-step lists.
        :param calc_rewards_to_go: callable invoked as ``f(rewards)`` with the
            stored per-step reward list.
        """
        # One list per transition component; index i of every list belongs to step i.
        self._s, self._a, self._r, self._v, self._d = [], [], [], [], []
        self._calc_rewards_to_go = calc_rewards_to_go
        self._advantage_estimator = advantage_estimator

    def add(self, s, a, r, v, d):
        """Append one transition, converting every component to a tensor.

        State, action, reward and value are stored as ``float32``; the done
        flag is stored as ``int32``.
        """
        columns = (
            (self._s, s, tf.float32),
            (self._a, a, tf.float32),
            (self._r, r, tf.float32),
            (self._v, v, tf.float32),
            (self._d, d, tf.int32),
        )
        for store, value, dtype in columns:
            store.append(tf.convert_to_tensor(value, dtype=dtype))

    def get_as_data_set(self):
        """Return a dataset of ``(state, action, reward, reward_to_go, advantage)``.

        Rewards-to-go and advantages are produced by the injected callables;
        the dataset is batched with batch size 1.
        """
        rewards_to_go = self._calc_rewards_to_go(self._r)
        advantages = self._advantage_estimator(self._r, self._v, self._d)
        components = (self._s, self._a, self._r, rewards_to_go, advantages)
        dataset = tf.data.Dataset.from_tensor_slices(components)
        return dataset.batch(1)

    def size(self):
        """Return the number of stored transitions."""
        return len(self._s)


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense


def create_policy_network(learning_rate, state_dim, action_dim):
    """Build the policy MLP with a mean head and a standard-deviation head.

    :param learning_rate: learning rate of the Adam optimizer attached to the model.
    :param state_dim: shape passed to ``keras.Input`` for the observation.
    :param action_dim: number of units in each of the two output heads.
    :return: compiled ``keras.Model`` whose outputs are ``(mu, sigma)``.
    """
    observation = keras.Input(shape=state_dim)

    # Trunk: three identical ReLU layers of 256 units.
    hidden = observation
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)

    # Linear head for the mean; softplus keeps the second head non-negative.
    mean_head = Dense(action_dim, activation=None)(hidden)
    std_head = Dense(action_dim, activation=tf.nn.softplus)(hidden)

    policy = keras.Model(inputs=observation, outputs=(mean_head, std_head))
    policy.compile(optimizer=Adam(learning_rate=learning_rate))
    return policy


def create_value_network(learning_rate, state_dim):
    """Build the state-value MLP with a single linear output unit.

    :param learning_rate: learning rate of the Adam optimizer attached to the model.
    :param state_dim: shape passed to ``keras.Input`` for the observation.
    :return: compiled ``keras.Model`` producing one value per input state.
    """
    observation = keras.Input(shape=state_dim)

    # Trunk: three identical ReLU layers of 256 units.
    hidden = observation
    for _ in range(3):
        hidden = Dense(256, activation=tf.nn.relu)(hidden)

    value_head = Dense(1, activation=None)(hidden)

    critic = keras.Model(inputs=observation, outputs=value_head)
    critic.compile(optimizer=Adam(learning_rate=learning_rate))
    return critic


# from EpisodeBuffer import EpisodeBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


class Agent:
    """Policy-gradient agent with a learned value function.

    The agent samples one episode at a time from the environment, computes
    discounted rewards-to-go and generalized advantage estimates for it, and
    then performs one update of the policy network (log-probability weighted
    by advantage) and one update of the value network (MSE against the
    rewards-to-go).
    """

    def __init__(self, environment, actor_network_generator, critic_network_generator,
                 gae_lambda=0.95, learning_rate=0.0003, gamma=0.99):
        """
        :param environment: environment exposing ``reset()`` returning ``(obs, info)``
            and ``step(action)`` returning a 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        :param actor_network_generator: callable ``f(learning_rate)`` returning the
            compiled policy model with outputs ``(mu, sigma)``.
        :param critic_network_generator: callable ``f(learning_rate)`` returning the
            compiled value model.
        :param gae_lambda: lambda of the generalized advantage estimator.
        :param learning_rate: learning rate handed to both network generators.
        :param gamma: reward discount factor.
        """
        self._environment = environment
        self._gae_lambda = gae_lambda
        self._gamma = gamma
        self._learning_rate = learning_rate
        self._mse = tf.keras.losses.MeanSquaredError()
        self._policy_network = actor_network_generator(learning_rate)
        self._value_network = critic_network_generator(learning_rate)

    @staticmethod
    def _as_float_array(sequence):
        """Convert a sequence of scalars / tensors into one ``float32`` NumPy array."""
        return np.asarray([np.asarray(item) for item in sequence], dtype=np.float32)

    def estimate_advantage(self, rewards, values, dones):
        """Generalized advantage estimate for every step of a rollout.

        Uses the backward recursion
        ``A_t = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}`` with
        ``delta_t = r_t + gamma * V_{t+1} * (1 - d_t) - V_t``.
        The final transition is included (its successor value is taken as 0),
        so the last entry is no longer left at zero and every earlier entry
        also sees the final TD error.

        :param rewards: per-step rewards (sequence of scalars or scalar tensors).
        :param values: per-step value estimates; each entry is assumed to hold
            exactly one number (e.g. a ``(1, 1)`` tensor) -- TODO confirm for
            other critics.
        :param dones: per-step done flags (0/1).
        :return: ``float32`` array of length ``len(rewards)``.
        """
        # Work in NumPy so the arithmetic runs on plain float32 arrays.
        rewards = self._as_float_array(rewards).reshape(-1)
        values = self._as_float_array(values).reshape(-1)
        not_done = 1.0 - self._as_float_array(dones).reshape(-1)

        advantage = np.zeros(len(rewards), dtype=np.float32)
        next_value = 0.0  # no value estimate exists beyond the last stored step
        running = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self._gamma * next_value * not_done[t] - values[t]
            # (1 - d_t) stops the accumulation from leaking across episode ends.
            running = delta + self._gamma * self._gae_lambda * not_done[t] * running
            advantage[t] = running
            next_value = values[t]
        return advantage

    def calc_rewards_to_go(self, rewards):
        """Discounted reward-to-go ``G_t = sum_{k>=t} gamma^(k-t) * r_k`` per step.

        Computed with a single backward pass (``G_t = r_t + gamma * G_{t+1}``).

        :param rewards: per-step rewards (sequence of scalars or scalar tensors).
        :return: ``float32`` array with one entry per step.
        """
        rewards = self._as_float_array(rewards)
        g = np.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self._gamma * running
            g[t] = running
        return g

    @tf.function
    def distribution_form_policy(self, state):
        """Return the Normal distribution parameterized by the policy outputs for ``state``."""
        mu, sigma = self._policy_network(state)
        return tfd.Normal(mu, sigma)

    @tf.function
    def sample_actions_form_policy(self, state):
        """Draw one action sample from the policy distribution for ``state``."""
        distribution = self.distribution_form_policy(state)
        actions = distribution.sample()
        return actions

    @tf.function
    def log_probs_form_policy(self, state, actions):
        """Log-probability of ``actions`` under the policy, summed over the last axis."""
        distribution = self.distribution_form_policy(state)
        log_probs = distribution.log_prob(actions)
        log_probs = tfm.reduce_sum(log_probs, axis=-1, keepdims=True)
        return log_probs

    def act_deterministic(self, state):
        """Step the environment with the policy mean for ``state``.

        :return: ``(actions, next_observation, reward, done)``.
        """
        # The policy model lives in ``_policy_network``; there is no ``_actor`` attribute.
        actions_prime, _ = self._policy_network(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def act_stochastic(self, state):
        """Step the environment with an action sampled from the policy for ``state``.

        :return: ``(actions, next_observation, reward, done)``.
        """
        actions_prime = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def _act(self, actions):
        """Apply the first action of the batch to the environment.

        ``done`` is true when the episode terminated *or* was truncated (e.g.
        by a time limit); after either signal the environment has to be reset
        before it is stepped again.
        """
        observation_prime, reward, terminated, truncated, _ = self._environment.step(actions[0])
        done = terminated or truncated
        return actions, observation_prime, reward, done

    # @tf.function
    def learn(self, episode, episode_size):
        """Run one policy update followed by one value update on ``episode``."""
        self.train_step_actor(episode)
        self.train_step_critic(episode, episode_size)

    # @tf.function
    def train_step_actor(self, episode):
        """One gradient step on the policy: minimize ``-sum(log_prob(a|s) * advantage)``."""
        with tf.GradientTape() as tape:
            loss = 0
            for s, a, _, _, adv in episode:
                prob_of_a = self.log_probs_form_policy(s, a)
                loss -= prob_of_a * adv
        gradients = tape.gradient(loss, self._policy_network.trainable_variables)
        self._policy_network.optimizer.apply_gradients(zip(gradients, self._policy_network.trainable_variables))

    # @tf.function
    def train_step_critic(self, episode, episode_size):
        """One gradient step on the value network: mean squared error against rewards-to-go.

        :param episode: dataset yielding ``(state, action, reward, reward_to_go, advantage)``.
        :param episode_size: number of transitions, used to average the summed loss.
        """
        with tf.GradientTape() as tape:
            loss = 0
            for s, _, _, r_sum, _ in episode:
                prev_v = self._value_network(s)
                loss += self._mse(r_sum, prev_v)
            loss = loss / episode_size
        gradients = tape.gradient(loss, self._value_network.trainable_variables)
        self._value_network.optimizer.apply_gradients(zip(gradients, self._value_network.trainable_variables))

    def sample_to_episode_buffer(self):
        """Roll out one episode with the stochastic policy.

        :return: ``(buffer, ret)`` where ``buffer`` holds every transition of the
            episode and ``ret`` is the undiscounted sum of its rewards.
        """
        buffer = EpisodeBuffer(self.estimate_advantage, self.calc_rewards_to_go)
        s, _ = self._environment.reset()
        d = 0
        ret = 0
        while not d:
            a, s_p, r, d = self.act_stochastic(s)
            ret += r
            # Value of the state the action was taken in (``s`` is advanced below).
            v = self._value_network(tf.convert_to_tensor([s], dtype=tf.float32))
            buffer.add(s, a, r, v, d)
            s = s_p
        return buffer, ret

    def train(self, epochs):
        """Alternate between sampling one episode and learning from it.

        :param epochs: number of sample/learn iterations.
        """
        print("start training!")
        rets = []
        for e in range(epochs):
            print("epoch:", e)
            buffer, ret = self.sample_to_episode_buffer()
            rets.append(ret)
            print("return of episode:", ret, "avg 100:", np.average(rets[-100:]))
            episode = buffer.get_as_data_set()
            self.learn(episode, buffer.size())
        print("training finished!")


# from Agent import Agent
# from GenericMLPs1D import create_policy_network, create_value_network
import gym
from functools import partial

if __name__ == '__main__':
    # Entry point: train the agent on the MuJoCo inverted-pendulum task.
    tf.keras.backend.clear_session()
    env = gym.make('InvertedPendulum-v4')
    try:
        state_shape = env.observation_space.shape
        action_dim = env.action_space.shape[0]
        print("state_dim=", state_shape, "action_dim=", action_dim, "action_scaling:",
              env.action_space.high)
        # Both networks receive the observation shape in the same (tuple) form.
        agent = Agent(environment=env,
                      actor_network_generator=partial(create_policy_network, state_dim=state_shape,
                                                      action_dim=action_dim),
                      critic_network_generator=partial(create_value_network, state_dim=state_shape))
        agent.train(10000)
    finally:
        # Release the simulator even when training is interrupted (e.g. KeyboardInterrupt).
        env.close()

state_dim= (4,) action_dim= 1 action_scaling: [3.]
start training!
epoch: 0
return of episode: 6.0 avg 100: 6.0
epoch: 1
return of episode: 5.0 avg 100: 5.5
epoch: 2
return of episode: 6.0 avg 100: 5.666666666666667
epoch: 3
return of episode: 23.0 avg 100: 10.0
epoch: 4
return of episode: 8.0 avg 100: 9.6
epoch: 5
return of episode: 5.0 avg 100: 8.833333333333334
epoch: 6
return of episode: 10.0 avg 100: 9.0
epoch: 7
return of episode: 6.0 avg 100: 8.625
epoch: 8
return of episode: 9.0 avg 100: 8.666666666666666
epoch: 9
return of episode: 4.0 avg 100: 8.2
epoch: 10
return of episode: 30.0 avg 100: 10.181818181818182
epoch: 11
return of episode: 11.0 avg 100: 10.25
epoch: 12
return of episode: 7.0 avg 100: 10.0
epoch: 13
return of episode: 16.0 avg 100: 10.428571428571429
epoch: 14
return of episode: 6.0 avg 100: 10.133333333333333
epoch: 15
return of episode: 13.0 avg 100: 10.3125
epoch: 16
return of episode: 6.0 avg 100: 10.058823529411764
epoch: 17
return of episode: 18.0 avg 100: 

KeyboardInterrupt: ignored