In [1]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.2


In [2]:
!pip3 install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[box2d]
  Downloading gym-0.26.1.tar.gz (719 kB)
[K     |████████████████████████████████| 719 kB 29.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting box2d-py==2.3.5
  Downloading box2d_py-2.3.5-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 37.7 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 77 kB/s 
[?25hCollecting swig==4.*
  Downloading swig-4.0.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.7 MB)
[K     |████████████████████████████████| 3.7 MB 51.4 MB/s 
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?

In [6]:
import tensorflow as tf
import numpy as np


class ExperienceReplayBuffer:
    def __init__(self, state_dims, action_dims, max_size=1000000, batch_size=256):
        self._max_size = max_size
        self._batch_size = batch_size
        self._size = 0
        self._current_position = 0
        self._state_memory = np.zeros((self._max_size, *state_dims))
        self._state_prime_memory = np.zeros((self._max_size, *state_dims))
        self._action_memory = np.zeros((self._max_size, 1))  # deiscrete actions
        self._reward_memory = np.zeros((self._max_size, 1))
        self._done_memory = np.zeros((self._max_size, 1), dtype=np.bool)

    def size(self):
        return self._size

    def add_transition(self, state, action, reward, state_, done):
        self._state_memory[self._current_position] = state
        self._state_prime_memory[self._current_position] = state_
        self._action_memory[self._current_position] = action
        self._reward_memory[self._current_position] = reward
        self._done_memory[self._current_position] = done
        # self.un_norm_r[self.current_position] = r
        # self.r = (self.un_norm_r - np.mean(self.un_norm_r)) / (np.std(self.un_norm_r) + 1e-10)
        if self._size < self._max_size:
            self._size += 1
        self._current_position = (self._current_position + 1) % self._max_size

    def sample_batch(self):
        batch_indices = np.random.choice(self._size, self._batch_size, replace=False)
        states = tf.convert_to_tensor(self._state_memory[batch_indices], dtype=tf.float32)
        states_prime = tf.convert_to_tensor(self._state_prime_memory[batch_indices], dtype=tf.float32)
        actions = tf.convert_to_tensor(self._action_memory[batch_indices], dtype=tf.int32)  # deiscrete actions
        rewards = tf.convert_to_tensor(self._reward_memory[batch_indices], dtype=tf.float32)
        dones = tf.convert_to_tensor(self._done_memory[batch_indices], dtype=tf.float32)
        return states, actions, rewards, states_prime, dones


from tensorflow import losses
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


def create_q_network(learning_rate, state_dim=8, action_dim=4):
    inputs = keras.Input(shape=state_dim)
    x = Dense(256, activation="relu")(inputs)
    x = Dense(256, activation="relu")(x)
    x = Dense(256, activation="relu")(x)
    out = Dense(action_dim, activation=None)(x)
    model = keras.Model(inputs=inputs, outputs=out)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=losses.mse)
    return model


# from ExperienceReplayBuffer import ExperienceReplayBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


class Agent:
    def __init__(self, environment, state_dim, action_dim, q_network_generator,
                 learning_rate=0.0003, gamma=0.99, tau=0.005,
                 epsilon=1, epsilon_decay=0.99, min_epsilon=0.05,
                 batch_size=256, max_replay_buffer_size=1000000):
        self._environment = environment
        self._action_dim = action_dim
        self._gamma = gamma
        self._tau = tau
        self._epsilon = epsilon
        self._epsilon_decay = epsilon_decay
        self._min_epsilon = min_epsilon
        self._batch_size = batch_size
        self._mse = tf.keras.losses.MeanSquaredError()
        self._reply_buffer = ExperienceReplayBuffer(state_dim, action_dim, max_replay_buffer_size, batch_size)
        self._q_network = q_network_generator(learning_rate)
        self._q_network_t = q_network_generator(learning_rate)
        self._wight_init()

    def reply_buffer(self):
        return self._reply_buffer

    def environment(self):
        return self._environment

    def _wight_init(self):
        self._q_network_t.set_weights(self._q_network.weights)

    def update_target_weights(self):
        self._weight_update(self._q_network_t, self._q_network)

    def _weight_update(self, target_network, network):
        new_wights = []
        for w_t, w in zip(target_network.weights, network.weights):
            new_wights.append((1 - self._tau) * w_t + self._tau * w)
        target_network.set_weights(new_wights)

    def learn(self):
        states, actions, rewards, states_prime, dones = self._reply_buffer.sample_batch()
        self.train_step(states, actions, rewards, states_prime, dones)
        self.update_target_weights()

    @tf.function
    def train_step(self, states, actions, rewards, states_prime, dones):
        q_values_prime = self._q_network_t(states_prime)
        max_q = tf.reduce_max(q_values_prime, axis=-1, keepdims=True)
        targets = rewards + self._gamma * (1 - dones) * max_q  # (1-d) : no q if done
        with tf.GradientTape() as tape:
            q_values = self._q_network(states)
            q_values_of_actions = tf.gather(q_values, actions, axis=-1, batch_dims=1)
            loss = self._mse(targets, q_values_of_actions)
        gradients = tape.gradient(loss, self._q_network.trainable_variables)
        self._q_network.optimizer.apply_gradients(zip(gradients, self._q_network.trainable_variables))

    # alternative but ugly
    def train_step2(self, state, action, rewards, state_prime, dones):
        q_values = self._q_network_t(state_prime)
        max_q = tf.reduce_max(q_values, axis=-1).numpy()
        t = rewards + self._gamma * (1 - dones) * max_q  # (1-d) : no q if done

        t_batch = self._q_network(state).numpy()
        batch_index = np.arange(self._batch_size, dtype=np.int32)

        t_batch[batch_index, action] = t
        self._q_network.train_on_batch(state, t_batch)

    def sample_actions1(self, state):  # sample with e greedy policy, alternative would be Thompson sampling
        if np.random.random() <= self._epsilon:
            actions = tf.random.uniform((1,), minval=0, maxval=self._action_dim, dtype=tf.int32)
        else:
            actions = self._deterministic_action(state)
        self.epsilon = self._epsilon * self._epsilon_decay if self._epsilon > self._min_epsilon else self._min_epsilon
        return actions

    def sample_actions(self, state):
        q_values = self._q_network(state)
        distribution = tfd.Categorical(logits=q_values)
        return distribution.sample()

    def _deterministic_action(self, state):
        return tf.argmax(self._q_network(tf.convert_to_tensor(state, dtype=tf.float32)), axis=-1)

    def act_deterministic(self, state):
        actions_prime = self._deterministic_action(tf.convert_to_tensor(state, dtype=tf.float32))
        return self._act(actions_prime)

    def act_stochastic(self, state):
        actions_prime = self.sample_actions(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def _act(self, actions):
        observation_prime, reward, done, _, _ = self._environment.step(actions.numpy()[0])
        return actions, observation_prime, reward, done

    def train(self, epochs, environment_steps=1, training_steps=1, pre_sampling_steps=0):
        print(f"Random exploration for {pre_sampling_steps} steps!")
        observation, _ = self._environment.reset()
        ret = 0
        for _ in range(max(pre_sampling_steps, self._batch_size)):
            actions = tf.random.uniform((1,), minval=0, maxval=self._action_dim, dtype=tf.int32)
            observation_prime, reward, done, _, _ = self._environment.step(actions.numpy()[0])
            ret += reward
            self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
            if done:
                print("print", ret)
                ret = 0
                observation, _ = self._environment.reset()
            else:
                observation = observation_prime
        print("print", ret)

        print("start training!")
        returns = []
        observation, _ = self._environment.reset()
        done = 0
        ret = 0
        epoch = 0
        steps = 0
        while True:
            i = 0
            while i < environment_steps or self._reply_buffer.size() < self._batch_size:
                if done:
                    observation, _ = self._environment.reset()
                    returns.append(ret)
                    print("epoch:", epoch, "steps:", steps, "return:", ret, "avg return:", np.average(returns[-50:]))
                    ret = 0
                    epoch += 1
                    if epoch >= epochs:
                        print("training finished!")
                        return
                actions, observation_prime, reward, done = self.act_stochastic(observation)
                self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
                observation = observation_prime
                steps += 1
                ret += reward
                i += 1
            for _ in range(training_steps):
                self.learn()


from functools import partial
import gym
import tensorflow as tf

# from Networks.LunaLanderNetwork import create_q_network
# from DQNAgent import Agent

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    env = gym.make("LunarLander-v2")
    print("state_dim=", env.observation_space.shape, "action_dim=", env.action_space)

    agent = Agent(environment=env, state_dim=env.observation_space.shape, action_dim=4,
                  q_network_generator=partial(create_q_network, state_dim=env.observation_space.shape[0], action_dim=4), batch_size=256)
    agent.train(10000)

state_dim= (8,) action_dim= Discrete(4)
Random exploration for 0 steps!
print -297.42340168639737


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app


print -161.4860191312706
print -14.869750413537542
start training!
epoch: 0 steps: 88 return: 30.855235133004783 avg return: 30.855235133004783
epoch: 1 steps: 204 return: -54.196082381617515 avg return: -11.670423624306366
epoch: 2 steps: 376 return: -166.83234015902195 avg return: -63.39106246921156
epoch: 3 steps: 498 return: -215.023563779379 avg return: -101.29918779675342
epoch: 4 steps: 621 return: -199.42255571043523 avg return: -120.9238613794898
epoch: 5 steps: 803 return: 22.283696324620152 avg return: -97.05593509547147
epoch: 6 steps: 1194 return: -77.40962665828505 avg return: -94.24931960444484
epoch: 7 steps: 1408 return: -85.65711927042575 avg return: -93.17529456269244
epoch: 8 steps: 1591 return: -597.2996430331536 avg return: -149.18911105941035
epoch: 9 steps: 1860 return: 1.4037125302213553 avg return: -134.12982870044718
epoch: 10 steps: 2599 return: 146.15933344769462 avg return: -108.64899577788884
epoch: 11 steps: 2874 return: 6.142562558208127 avg return: -99

KeyboardInterrupt: ignored