In [1]:
# -y answers the confirmation prompt so the cell also works under "Restart & Run All".
!pip3 uninstall -y gym

Found existing installation: gym 0.25.1
Uninstalling gym-0.25.1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.1


In [2]:
# Pinned: the agent below unpacks 4 values from env.step() and uses the bare
# observation returned by env.reset(), i.e. the gym 0.25 API.
!pip3 install "gym[mujoco]==0.25.2"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.25.2.tar.gz (734 kB)
[K     |████████████████████████████████| 734 kB 7.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 57.5 MB/s 
[?25hCollecting imageio>=2.14.1
  Downloading imageio-2.21.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 40.1 MB/s 
Collecting glfw
  Downloading glfw-2.5.4-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[K     |████████████████████████████████| 207 kB 57.0 MB/s 
Collecting pillow>=8.3.2
  Downloading Pillow-9.2.0-cp37-cp37m-manylinux_2_17_x86

In [10]:
# Pinned to the version this notebook was last run with, for reproducibility.
!pip3 install pybullet==3.2.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pybullet
  Downloading pybullet-3.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (91.7 MB)
[K     |████████████████████████████████| 91.7 MB 28 kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.5


In [14]:
import tensorflow as tf
import numpy as np


class EpisodeBuffer:
    """Collects the transitions of one episode and serves them as a ``tf.data.Dataset``.

    States, actions and rewards are converted to ``tf.float32`` tensors when
    they are added. ``reward_memory`` is a public attribute; the state and
    action lists are kept private.
    """

    def __init__(self, gamma):
        """Create an empty buffer.

        Args:
            gamma: Discount factor applied once per time step when the
                returns are computed in ``get_as_data_set``.
        """
        self._gamma = gamma
        self._state_memory = []
        self._action_memory = []
        self.reward_memory = []

    def add(self, state, action, reward):
        """Append one transition; every part is stored as a float32 tensor."""
        self._state_memory.append(tf.convert_to_tensor(state, dtype=tf.float32))
        self._action_memory.append(tf.convert_to_tensor(action, dtype=tf.float32))
        self.reward_memory.append(tf.convert_to_tensor(reward, dtype=tf.float32))

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Compute the discounted reward-to-go for every time step.

        Uses the backward recurrence ``G[t] = r[t] + gamma * G[t + 1]`` so the
        whole episode is processed in a single O(n) pass instead of summing
        the tail of the episode again for every ``t``.

        Args:
            rewards: Sequence of per-step rewards (anything NumPy can convert
                to a float32 array).
            gamma: Discount factor.

        Returns:
            A float32 NumPy array with the same shape as ``rewards`` where
            entry ``t`` is ``sum_k gamma**(k - t) * rewards[k]`` for ``k >= t``.
            An empty input yields an empty array.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        returns = np.zeros_like(rewards)
        # Accumulate in float64 and only round when storing into the result.
        running = np.zeros(rewards.shape[1:], dtype=np.float64)
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def get_as_data_set(self, batch_size=1):
        """Return the episode as a batched dataset.

        Args:
            batch_size: Number of consecutive time steps per dataset element.

        Returns:
            A ``tf.data.Dataset`` whose elements are tuples
            ``(state, action, reward, g, index)``, where ``g`` is the
            discounted return from that step onwards and ``index`` is the
            step number as a float32 value.
        """
        g = self._discounted_returns(self.reward_memory, self._gamma)
        # g = (g - np.mean(g)) / (np.std(g) + 1e-10) # normalize g?
        index = tf.range(len(self._state_memory), dtype=tf.float32)
        return tf.data.Dataset.from_tensor_slices(
            (self._state_memory, self._action_memory, self.reward_memory, g, index)).batch(batch_size)


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense


def create_policy_network(learning_rate, state_dim=4, action_dim=1):
    """Build the Gaussian policy network and attach an Adam optimizer to it.

    The network has two hidden ReLU layers of 256 units and two output heads:
    ``mu`` (linear) and ``sigma`` (softplus), each with ``action_dim`` units.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        state_dim: Size of the observation vector, either as a plain int or
            as a shape tuple.
        action_dim: Number of action dimensions.

    Returns:
        A compiled ``keras.Model`` mapping a state batch to ``(mu, sigma)``.
        No loss is compiled in; ``compile`` is only used to attach the optimizer.
    """
    # keras.Input documents `shape` as a tuple; keep accepting a bare int
    # (the original calling convention) by wrapping it.
    try:
        input_shape = tuple(state_dim)
    except TypeError:
        input_shape = (state_dim,)
    inputs = keras.Input(shape=input_shape)
    x = Dense(256, activation=tf.nn.relu)(inputs)
    x = Dense(256, activation=tf.nn.relu)(x)
    mu = Dense(action_dim, activation=None)(x)  # TODO: better activation sigmoid?
    # TODO: better activation, sigmoid or tf.clip_by_value(sigma, 1e-6, 1)?
    # NOTE(review): softplus can underflow to 0 for very negative inputs, which
    # would give a zero-scale Normal downstream -- consider a small floor.
    sigma = Dense(action_dim, activation=tf.nn.softplus)(x)
    model = keras.Model(inputs=inputs, outputs=(mu, sigma))
    model.compile(optimizer=Adam(learning_rate=learning_rate))
    return model


# from EpisodeBuffer import EpisodeBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np
import pybullet_envs


class Agent:
    """Policy-gradient (REINFORCE-style) agent with a Gaussian policy.

    The agent owns a policy network that maps a state batch to ``(mu, sigma)``,
    samples whole episodes from the environment and updates the network once
    per episode.
    """

    def __init__(self, environment, policy_network_generator, learning_rate=0.0003, gamma=0.99):
        """Store the environment and build the policy network.

        Args:
            environment: Object with ``reset()`` returning an observation and
                ``step(action)`` returning ``(observation, reward, done, info)``.
            policy_network_generator: Callable that takes the learning rate and
                returns the policy model (with an ``optimizer`` attached).
            learning_rate: Passed to ``policy_network_generator``.
            gamma: Discount factor used for the returns and the per-step weight.
        """
        self._environment = environment
        self._gamma = gamma
        self._policy_network = policy_network_generator(learning_rate)

    def learn(self, episode):
        """Run one policy-gradient update over a full episode.

        Args:
            episode: Iterable of ``(state, action, reward, g, index)`` batches
                as produced by ``EpisodeBuffer.get_as_data_set``.
        """
        with tf.GradientTape() as tape:
            loss = 0
            for state, action, _, g, i in episode:
                # Actions stored by sample_to_episode_buffer keep the extra leading
                # axis of size 1 from acting on a single state; flatten to
                # (batch, action_dim) so they line up with the policy output.
                action = tf.reshape(action, (tf.shape(state)[0], -1))
                log_prob = self.log_probs_of_action_in_state_form_policy(state, action)
                # Independent Normal per action dimension -> joint log-prob is the sum.
                log_prob = tf.reduce_sum(log_prob, axis=-1)
                # Per-sample discount gamma**t (not only the first index of the
                # batch) and an explicit scalar loss, so any batch size works.
                # TODO: tfm.pow(self.gamma, i) necessary?
                loss += -tf.reduce_sum(tfm.pow(self._gamma, i) * g * log_prob)
        gradients = tape.gradient(loss, self._policy_network.trainable_variables)
        self._policy_network.optimizer.apply_gradients(zip(gradients, self._policy_network.trainable_variables))

    def distribution_of_policy_in_state(self, state):
        """Return the ``tfd.Normal(mu, sigma)`` the policy network predicts for ``state``."""
        mu, sigma = self._policy_network(state)
        # TODO: MultivariateNormalDiag(loc=mus, scale_diag=sigmas) better?
        distribution = tfd.Normal(mu, sigma)
        return distribution

    def log_probs_of_action_in_state_form_policy(self, state, action):
        """Return the element-wise log-probability of ``action`` under the policy in ``state``."""
        distribution = self.distribution_of_policy_in_state(state)
        log_probs = distribution.log_prob(action)
        return log_probs

    def sample_actions_form_policy(self, state):
        """Draw one action sample per state from the policy distribution."""
        distribution = self.distribution_of_policy_in_state(state)
        actions = distribution.sample()
        return actions

    def act_deterministic(self, state):
        """Step the environment with the policy mean for a single ``state``."""
        actions_prime, _ = self._policy_network(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def act_stochastic(self, state):
        """Step the environment with an action sampled from the policy for a single ``state``."""
        actions_prime = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def _act(self, actions):
        """Apply the first action of the batch to the environment.

        Returns:
            ``(actions, next_observation, reward, done)``; ``actions`` is the
            unchanged batched input.
        """
        # Hand the environment a plain (writable) NumPy array rather than a TF tensor.
        observation_prime, reward, done, _ = self._environment.step(np.array(actions[0]))
        return actions, observation_prime, reward, done

    def sample_to_episode_buffer(self):
        """Play one episode with the stochastic policy.

        Returns:
            ``(buffer, ret)``: the filled ``EpisodeBuffer`` and the
            undiscounted sum of rewards of the episode.
        """
        buffer = EpisodeBuffer(self._gamma)
        observation = self._environment.reset()
        done = False
        ret = 0
        while not done:
            action, observation_prime, reward, done = self.act_stochastic(observation)
            ret += reward
            buffer.add(observation, action, reward)
            observation = observation_prime
        return buffer, ret

    def train(self, epochs):
        """Sample and learn from ``epochs`` episodes, printing the return of each.

        The printed average covers the last 50 episodes.
        """
        print("start training!")
        rets = []
        for e in range(epochs):
            buffer, ret = self.sample_to_episode_buffer()
            rets.append(ret)
            print("epoch:", e, "return:", ret, "avg return:", np.average(rets[-50:]))
            episode = buffer.get_as_data_set()
            self.learn(episode)
        print("training finished!")


from functools import partial
import gym
import tensorflow as tf
# import pybullet_envs

# from Networks.InvertedPendulumNetwork import create_policy_network
# from ReinforceAgent import Agent

if __name__ == '__main__':
    # Reset Keras' global state before building a new model.
    tf.keras.backend.clear_session()

    env = gym.make("InvertedPendulum-v4")
    #env = gym.make('InvertedPendulumBulletEnv-v0')

    # Read the problem dimensions off the environment's spaces.
    observation_shape = env.observation_space.shape
    action_count = env.action_space.shape[0]
    print("state_dim=", observation_shape, "action_dim=", action_count, "action_scaling:",
          env.action_space.high)

    # The agent only supplies the learning rate; bind the dimensions up front.
    network_factory = partial(
        create_policy_network,
        state_dim=observation_shape[0],
        action_dim=action_count,
    )
    agent = Agent(environment=env, policy_network_generator=network_factory)
    agent.train(2000)

state_dim= (4,) action_dim= 1 action_scaling: [3.]
start training!
epoch: 0 return: 8.0 avg return: 8.0
epoch: 1 return: 9.0 avg return: 8.5
epoch: 2 return: 6.0 avg return: 7.666666666666667
epoch: 3 return: 5.0 avg return: 7.0
epoch: 4 return: 5.0 avg return: 6.6
epoch: 5 return: 8.0 avg return: 6.833333333333333
epoch: 6 return: 5.0 avg return: 6.571428571428571
epoch: 7 return: 8.0 avg return: 6.75
epoch: 8 return: 11.0 avg return: 7.222222222222222
epoch: 9 return: 10.0 avg return: 7.5
epoch: 10 return: 7.0 avg return: 7.454545454545454
epoch: 11 return: 7.0 avg return: 7.416666666666667
epoch: 12 return: 8.0 avg return: 7.461538461538462
epoch: 13 return: 11.0 avg return: 7.714285714285714
epoch: 14 return: 9.0 avg return: 7.8
epoch: 15 return: 17.0 avg return: 8.375
epoch: 16 return: 10.0 avg return: 8.470588235294118
epoch: 17 return: 7.0 avg return: 8.38888888888889
epoch: 18 return: 17.0 avg return: 8.842105263157896
epoch: 19 return: 11.0 avg return: 8.95
epoch: 20 return: 6

KeyboardInterrupt: ignored