In [1]:
!pip3 uninstall gym

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/gym-0.25.2.dist-info/*
    /usr/local/lib/python3.7/dist-packages/gym/*
Proceed (y/n)? y
  Successfully uninstalled gym-0.25.2


In [2]:
!pip3 install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[mujoco]
  Downloading gym-0.25.2.tar.gz (734 kB)
[K     |████████████████████████████████| 734 kB 5.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting imageio>=2.14.1
  Downloading imageio-2.21.2-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 41.6 MB/s 
[?25hCollecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 35.3 MB/s 
Collecting glfw
  Downloading glfw-2.5.4-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[K     |████████████████████████████████| 207 kB 19.6 MB/s 
Collecting pillow>=8.3.2
  Downloading Pillow-9.2.0-cp37-cp37m-manylinux_2_17_x86

In [3]:
import gym
import numpy as np


class SparseReacher2D:

    def __init__(self, delta=0.015):
        self._env = gym.make("Reacher-v4")
        self._delta = delta

    def observation_space_shape(self):
        return self._env.observation_space.shape

    def environment(self):
        return self._env

    def step(self, actions):
        return self._env.step(actions)

    def reset(self):
        return self._env.reset()

    # reacher gives the vector from fingertip to target instead of fingertip coordinates
    # we need to extract the achieved goal out of this vector
    def achieved_goal(self, state):
        x_t, y_t = self.desired_goal(state)
        x_v, y_v = state[8], state[9]  # fingertip to target vector
        x_g, y_g = x_v + x_t, y_v + y_t  # fingertip coordinates
        return x_g, y_g

    def desired_goal(self, state):
        return state[4], state[5]

    def set_goal(self, state, goal):
        x_g, y_g = goal
        x, y = self.achieved_goal(state)
        new_state = np.array(state)
        new_state[4], new_state[5] = x_g, y_g
        new_state[8], new_state[9] = x - x_g, y - y_g  # create fingertip to goal vector
        return new_state

    # check if distance between goal and fingertip is lower than epsilon
    def reward(self, state):
        return -1 if np.linalg.norm([state[8], state[9]]) > self._delta else 0

    def success(self, state):
        return self.reward(state) >= -0.0

In [4]:
!pip3 install panda_gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting panda_gym
  Downloading panda_gym-2.0.4-py3-none-any.whl (26 kB)
Collecting pybullet
  Downloading pybullet-3.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (91.7 MB)
[K     |████████████████████████████████| 91.7 MB 1.2 MB/s 
[?25hCollecting gym<=0.23,>=0.22
  Downloading gym-0.23.0.tar.gz (624 kB)
[K     |████████████████████████████████| 624 kB 65.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting gym-robotics
  Downloading gym-robotics-0.1.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 50.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym, 

In [5]:
import gym
import numpy as np
import panda_gym


class SparseReacher3D:

    def __init__(self):
        self._env = gym.make("PandaReach-v2")

    def observation_space_shape(self):
        return (9,)

    def environment(self):
        return self._env

    def _select_observations(self, observation):
        return np.concatenate((observation['observation'], observation['desired_goal']))

    def step(self, actions):
        observation, reward, done, info = self._env.step(actions.numpy())  # converting to numpy before would be better
        return self._select_observations(observation), reward, done, info

    def reset(self):
        return self._select_observations(self._env.reset())

    def achieved_goal(self, state):
        return np.array([state[0], state[1], state[2]])

    def desired_goal(self, state):
        return np.array([state[6], state[7], state[8]])

    def set_goal(self, state, goal):
        new_state = np.array(state)
        new_state[6], new_state[7], new_state[8] = goal
        return new_state

    def reward(self, state):
        return self._env.compute_reward(self.achieved_goal(state), self.desired_goal(state), {})

    def success(self, state):
        return self.reward(state) >= -0.0

In [7]:
import tensorflow as tf
import numpy as np


class ExperienceReplayBuffer:
    def __init__(self, state_dims, action_dims, max_size=1000000, batch_size=256):
        self._max_size = max_size
        self._batch_size = batch_size
        self._size = 0
        self._current_position = 0
        self._state_memory = np.zeros((self._max_size, *state_dims))
        self._state_prime_memory = np.zeros((self._max_size, *state_dims))
        self._action_memory = np.zeros((self._max_size, action_dims))
        self._reward_memory = np.zeros((self._max_size, 1))
        self._done_memory = np.zeros((self._max_size, 1), dtype=bool)

    def size(self):
        return self._size

    def ready(self):
        return self._size >= self._batch_size

    def add_transition(self, state, action, reward, state_, done):
        self._state_memory[self._current_position] = state
        self._state_prime_memory[self._current_position] = state_
        self._action_memory[self._current_position] = action
        self._reward_memory[self._current_position] = reward
        self._done_memory[self._current_position] = done
        # self.un_norm_r[self.current_position] = r
        # self.r = (self.un_norm_r - np.mean(self.un_norm_r)) / (np.std(self.un_norm_r) + 1e-10)
        if self._size < self._max_size:
            self._size += 1
        self._current_position = (self._current_position + 1) % self._max_size

    def sample_batch(self):
        batch_indices = np.random.choice(self._size, self._batch_size, replace=False)
        states = tf.convert_to_tensor(self._state_memory[batch_indices], dtype=tf.float32)
        states_prime = tf.convert_to_tensor(self._state_prime_memory[batch_indices], dtype=tf.float32)
        actions = tf.convert_to_tensor(self._action_memory[batch_indices], dtype=tf.float32)
        rewards = tf.convert_to_tensor(self._reward_memory[batch_indices], dtype=tf.float32)
        dones = tf.convert_to_tensor(self._done_memory[batch_indices], dtype=tf.float32)
        return states, actions, rewards, states_prime, dones


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Concatenate


def create_policy_network(learning_rate, state_dim, action_dim):
    inputs = keras.Input(shape=state_dim)
    x = Dense(256, activation=tf.nn.relu)(inputs)
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    mu = Dense(action_dim, activation=None)(x)
    sigma = Dense(action_dim, activation=tf.nn.softplus)(x)
    model = keras.Model(inputs=inputs, outputs=(mu, sigma))
    model.compile(optimizer=Adam(learning_rate=learning_rate))
    return model


def create_q_network(learning_rate, state_dim, action_dim):
    inputs_s = keras.Input(shape=state_dim)
    inputs_a = keras.Input(shape=action_dim)
    x = Concatenate()([inputs_s, inputs_a])
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    x = Dense(256, activation=tf.nn.relu)(x)
    out = Dense(1, activation=None)(x)
    model = keras.Model(inputs=(inputs_s, inputs_a), outputs=out)
    model.compile(optimizer=Adam(learning_rate=learning_rate))
    return model


# from ExperienceReplayBuffer import ExperienceReplayBuffer
import tensorflow as tf
from tensorflow import math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np


# input actions are always between (−1, 1)
def default_scaling(actions):
    return actions


# input actions are always between (−1, 1)
def multiplicative_scaling(actions, factors):
    return actions * factors


class Agent:
    def __init__(self, environment, state_dim, action_dim,
                 actor_network_generator, critic_network_generator, action_scaling=default_scaling,
                 learning_rate=0.0003, gamma=0.99, tau=0.005, reward_scale=1, alpha=0.2,
                 batch_size=256, max_replay_buffer_size=1000000):
        self._environment = environment
        self._action_dim = action_dim
        self._action_scaling = action_scaling
        self._gamma = gamma
        self._tau = tau
        self._reward_scale = reward_scale
        self._alpha = alpha
        self._batch_size = batch_size
        self._mse = tf.keras.losses.MeanSquaredError()
        self._reply_buffer = ExperienceReplayBuffer(state_dim, action_dim, max_replay_buffer_size, batch_size)
        self._actor = actor_network_generator(learning_rate)
        self._critic_1 = critic_network_generator(learning_rate)
        self._critic_2 = critic_network_generator(learning_rate)
        self._critic_1_t = critic_network_generator(learning_rate)
        self._critic_2_t = critic_network_generator(learning_rate)
        self._wight_init()

    def reply_buffer(self):
        return self._reply_buffer

    def environment(self):
        return self._environment

    def _wight_init(self):
        self._critic_1.set_weights(self._critic_1_t.weights)
        self._critic_2.set_weights(self._critic_2_t.weights)

    def update_target_weights(self):
        self._weight_update(self._critic_1_t, self._critic_1)
        self._weight_update(self._critic_2_t, self._critic_2)

    def _weight_update(self, target_network, network):
        new_wights = []
        for w_t, w in zip(target_network.weights, network.weights):
            new_wights.append((1 - self._tau) * w_t + self._tau * w)
        target_network.set_weights(new_wights)

    def learn(self):
        states, actions, rewards, states_prime, dones = self._reply_buffer.sample_batch()
        self.train_step_critic(states, actions, rewards, states_prime, dones)
        self.train_step_actor(states)
        self.update_target_weights()

    @tf.function
    def train_step_critic(self, states, actions, rewards, states_prime, dones):
        actions_prime, log_probs = self.sample_actions_form_policy(states_prime)
        q1 = self._critic_1_t((states_prime, actions_prime))
        q2 = self._critic_2_t((states_prime, actions_prime))
        q_r = tfm.minimum(q1, q2) - self._alpha * log_probs
        targets = self._reward_scale * rewards + self._gamma * (1 - dones) * q_r
        self._critic_update(self._critic_1, states, actions, targets)
        self._critic_update(self._critic_2, states, actions, targets)

    def _critic_update(self, critic, states, actions, targets):
        with tf.GradientTape() as tape:
            q = critic((states, actions))
            loss = 0.5 * self._mse(targets, q)
        gradients = tape.gradient(loss, critic.trainable_variables)
        critic.optimizer.apply_gradients(zip(gradients, critic.trainable_variables))

    @tf.function
    def train_step_actor(self, states):
        with tf.GradientTape() as tape:
            actions_new, log_probs = self.sample_actions_form_policy(states)
            q1 = self._critic_1((states, actions_new))
            q2 = self._critic_2((states, actions_new))
            loss = tfm.reduce_mean(self._alpha * log_probs - tfm.minimum(q1, q2))
            # equal to loss = -tfm.reduce_mean(tfm.minimum(q1, q2) - self._alpha * log_probs)
        gradients = tape.gradient(loss, self._actor.trainable_variables)
        self._actor.optimizer.apply_gradients(zip(gradients, self._actor.trainable_variables))

    @tf.function
    def sample_actions_form_policy(self, state):
        mu, sigma = self._actor(state)
        # MultivariateNormalDiag(loc=mus, scale_diag=sigmas) other option
        distribution = tfd.Normal(mu, sigma)
        actions = distribution.sample()
        log_probs = distribution.log_prob(actions)
        actions = tfm.tanh(actions)
        log_probs -= tfm.log(1 - tfm.pow(actions, 2) + 1e-6)  # + 1e-6 because log undefined for 0
        log_probs = tfm.reduce_sum(log_probs, axis=-1, keepdims=True)
        return actions, log_probs

    def act_deterministic(self, state):
        actions_prime, _ = self._actor(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def act_stochastic(self, state):
        actions_prime, _ = self.sample_actions_form_policy(tf.convert_to_tensor([state], dtype=tf.float32))
        return self._act(actions_prime)

    def _act(self, actions):
        scaled_actions = self._action_scaling(actions)  # scaled actions from (-1, 1) according (to environment)
        observation_prime, reward, done, _ = self._environment.step(scaled_actions[0])
        return actions, observation_prime, reward, done

    def train(self, epochs, environment_steps=1, training_steps=1, pre_sampling_steps=1024):
        print(f"Random exploration for {pre_sampling_steps} steps!")
        observation = self._environment.reset()
        ret = 0
        for _ in range(max(pre_sampling_steps, self._batch_size)):
            actions = tf.random.uniform((self._action_dim,), minval=-1, maxval=1)
            scaled_actions = self._action_scaling(actions)  # scaled actions from (-1, 1) according (to environment)
            observation_prime, reward, done, _ = self._environment.step(scaled_actions)
            ret += reward
            self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
            if done:
                print("print", ret)
                ret = 0
                observation = self._environment.reset()
            else:
                observation = observation_prime
        print("print", ret)

        print("start training!")
        returns = []
        observation = self._environment.reset()
        done = 0
        ret = 0
        epoch = 0
        steps = 0
        while True:
            i = 0
            while i < environment_steps or self._reply_buffer.size() < self._batch_size:
                if done:
                    observation = self._environment.reset()
                    returns.append(ret)
                    print("epoch:", epoch, "steps:", steps, "return:", ret, "avg return:", np.average(returns[-50:]))
                    ret = 0
                    epoch += 1
                    if epoch >= epochs:
                        print("training finished!")
                        return
                actions, observation_prime, reward, done = self.act_stochastic(observation)
                self._reply_buffer.add_transition(observation, actions, reward, observation_prime, done)
                observation = observation_prime
                steps += 1
                ret += reward
                i += 1
            for _ in range(training_steps):
                self.learn()


import numpy as np
import pandas as pd


def final_goal_sampling_strategy(trajectory, current_index, environment):
    _, _, s_p, _ = trajectory[-1]
    g = environment.achieved_goal(s_p)
    return [g]


def k_final_goal_sampling_strategy(trajectory, current_index, environment, k=4):
    return final_goal_sampling_strategy(trajectory, current_index, environment) * k


def future_goal_sampling_strategy(trajectory, current_index, environment, k=4):
    goals = []
    for _ in range(k):
        i = np.random.randint(current_index, len(trajectory))
        _, _, s_p, _ = trajectory[i]
        goals.append(environment.achieved_goal(s_p))
    return goals


def no_goal_sampling_strategy(trajectory, current_index, environment):
    return []


class HindsightExperienceReplayBuffer:

    def __init__(self, agent, goal_sampling_strategy=final_goal_sampling_strategy):
        self._agent = agent
        self._goal_sampling_strategy = goal_sampling_strategy
        self._replay_buffer = agent.reply_buffer()
        self._environment = self._agent.environment()

    def evaluate(self, steps, epoch, successes, avg_returns):
        success_cnt = 0
        rets = []
        for _ in range(steps):
            state = self._environment.reset()
            done = False
            ret = 0
            while not done:
                _, state, reward, done = self._agent.act_deterministic(state)
                ret += self._environment.reward(state)
                if self._environment.success(state):
                    success_cnt += 1
                    done = True
            rets.append(ret)
        avg_return = np.average(rets)
        success_rate = success_cnt / steps
        successes.append(success_rate)
        avg_returns.append(avg_return)
        print(f"epoch {epoch}: avg return={avg_return}, success rate={success_rate} (with {steps} evaluation steps)")

    def train(self, epochs=200, cycles=50, episodes=16, n=40, t=1000,
              eval_steps=100, save_eval=False, eval_name='evaluation'):
        successes = []
        avg_returns = []
        self.evaluate(eval_steps, 0, successes, avg_returns)
        for e in range(1, epochs + 1):
            for _ in range(cycles):
                for _ in range(episodes):
                    state = self._environment.reset()
                    trajectory = []
                    dones = 0
                    j = 0
                    while not dones and j < t:
                        actions, state_prime, r, dones = self._agent.act_stochastic(state)
                        trajectory.append((state, actions, state_prime, dones))
                        state = state_prime
                        j += 1
                        if self._environment.success(state):
                            dones = True
                    for i, (state, actions, state_prime, dones) in enumerate(trajectory):
                        reward = self._environment.reward(state_prime)
                        self._replay_buffer.add_transition(state, actions, reward, state_prime, dones)
                        goals = self._goal_sampling_strategy(trajectory, i, self._environment)
                        for g in goals:
                            state_new = self._environment.set_goal(state, g)
                            state_prime_new = self._environment.set_goal(state_prime, g)
                            reward_new = self._environment.reward(state_prime_new)
                            self._replay_buffer.add_transition(state_new, actions, reward_new, state_prime_new, dones)
                if self._replay_buffer.ready():
                    for i in range(n):
                        self._agent.learn()
            self.evaluate(eval_steps, e, successes, avg_returns)
        if save_eval:
            data = {'epoch': range(epochs + 1), 'success rate': successes, 'average return': avg_returns}
            df = pd.DataFrame.from_dict(data)
            df.to_csv(f'{eval_name}.csv')


from functools import partial
import tensorflow as tf

# from final_project.Networks.GenericMLPs1D import create_policy_network, create_q_network
# from SoftActorCriticAgent import Agent, multiplicative_scaling
# from Environments.SparseReacher3D import SparseReacher3D
# from HER import HindsightExperienceReplayBuffer, future_goal_sampling_strategy

if __name__ == '__main__':
    tf.keras.backend.clear_session()
    environment = SparseReacher3D()
    env = environment.environment()
    state_dim = environment.observation_space_shape()
    action_dim = env.action_space.shape[0]
    action_scaling = env.action_space.high
    print("state_dim=", state_dim, "action_dim=", action_dim, "action_scaling:", action_scaling)
    agent = Agent(environment=environment, state_dim=state_dim, action_dim=action_dim, alpha=0.05,
                  action_scaling=partial(multiplicative_scaling, factors=action_scaling),
                  actor_network_generator=partial(create_policy_network, state_dim=state_dim[0], action_dim=action_dim),
                  critic_network_generator=partial(create_q_network, state_dim=state_dim[0], action_dim=action_dim))
    her = HindsightExperienceReplayBuffer(agent, goal_sampling_strategy=future_goal_sampling_strategy)
    her.train(epochs=5)

state_dim= (9,) action_dim= 3 action_scaling: [1. 1. 1.]
epoch 0: avg return=-49.08, success rate=0.02 (with 100 evaluation steps)
epoch 1: avg return=-1.85, success rate=1.0 (with 100 evaluation steps)
epoch 2: avg return=-1.82, success rate=1.0 (with 100 evaluation steps)
epoch 3: avg return=-1.59, success rate=1.0 (with 100 evaluation steps)
epoch 4: avg return=-1.64, success rate=1.0 (with 100 evaluation steps)
epoch 5: avg return=-1.71, success rate=1.0 (with 100 evaluation steps)
