# Setup

In [1]:
import tensorflow as tf
import numpy as np
import tf_agents
import matplotlib.pyplot as plt

from tf_env.UR_ENV import UR_env

from tf_agents.train.utils import spec_utils
from tf_agents.agents.ddpg import critic_network
from tf_agents.networks import actor_distribution_network
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.train.utils import train_utils
from tf_agents.agents.sac import sac_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_py_policy
from tf_agents.train import actor
from tf_agents.replay_buffers import reverb_utils


# ff
## Hyperparameters

In [2]:
num_iterations = 1000 # @param {type:"integer"}

# initial_collect_steps = 10000 # @param {type:"integer"}
collect_steps_per_run = 25 # @param {type:"integer"}
replay_buffer_capacity = 10000 # @param {type:"integer"}

# batch_size = 256 # @param {type:"integer"}

critic_learning_rate = 3e-4 # @param {type:"number"}
actor_learning_rate = 3e-4 # @param {type:"number"}
alpha_learning_rate = 3e-4 # @param {type:"number"}
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}

actor_fc_layer_params = (256, 256)
critic_joint_fc_layer_params = (256, 256)

log_interval = 5000 # @param {type:"integer"}

num_eval_episodes = 20 # @param {type:"integer"}
eval_interval = 10000 # @param {type:"integer"}

# policy_save_interval = 5000 # @param {type:"integer"}

## Environment

In [3]:
from tf_agents.environments import tf_py_environment

train_env_py= UR_env()
eval_env_py=UR_env()

train_env_tf=tf_py_environment.TFPyEnvironment(train_env_py)
eval_env_tf=tf_py_environment.TFPyEnvironment(eval_env_py)

  j_orient =orientation.as_euler('ZXZ',degrees=True)


Let's look at the information the environment provides as an observation which the policy will use to generate actions.

In [4]:
print('Observation Spec:')
print(train_env_tf.time_step_spec().observation)
print('Action Spec:')
print(train_env_tf.action_spec())

Observation Spec:
BoundedTensorSpec(shape=(2, 3), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32))
Action Spec:
BoundedTensorSpec(shape=(6,), dtype=tf.float32, name='action', minimum=array(-1., dtype=float32), maximum=array(1., dtype=float32))


In [5]:
from tf_agents.environments import utils

utils.validate_py_environment(train_env_py,episodes=5,)

## Distribution Strategy

In [6]:
from tf_agents.train.utils import strategy_utils
use_gpu = False
strategy = strategy_utils.get_strategy(tpu=False, use_gpu=use_gpu)

## Agent
To create an SAC Agent, we first need to create the networks that it will train. SAC is an actor-critic agent, so we will need two networks.

The critic will give us value estimates for Q(s,a). That is, it will recieve as input an observation and an action, and it will give us an estimate of how good that action was for the given state.

In [7]:
observation_spec, action_spec, time_step_spec = (
      spec_utils.get_tensor_specs(train_env_py))

with strategy.scope():
  critic_net = critic_network.CriticNetwork(
        (observation_spec, action_spec),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=critic_joint_fc_layer_params,
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

In [8]:
observation_spec

BoundedTensorSpec(shape=(2, 3), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32))

In [9]:
action_spec

BoundedTensorSpec(shape=(6,), dtype=tf.float32, name='action', minimum=array(-1., dtype=float32), maximum=array(1., dtype=float32))

In [10]:
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(2, 3), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [11]:
with strategy.scope():
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      fc_layer_params=actor_fc_layer_params,
      continuous_projection_net=(
          tanh_normal_projection_network.TanhNormalProjectionNetwork))

In [12]:
with strategy.scope():
  train_step = train_utils.create_train_step()

  tf_agent = sac_agent.SacAgent(
        time_step_spec,
        action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.keras.optimizers.Adam(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.keras.optimizers.Adam(
            learning_rate=critic_learning_rate),
        alpha_optimizer=tf.keras.optimizers.Adam(
            learning_rate=alpha_learning_rate),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step)

  tf_agent.initialize()

## Replay buffer

In [13]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=train_env_tf.batch_size,
    max_length=replay_buffer_capacity)

# replay_observer = [replay_buffer.add_batch]

dataset = replay_buffer.as_dataset(
    sample_batch_size = collect_steps_per_run
)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


## Policy

In [14]:
tf_eval_policy = tf_agent.policy
eval_policy = py_tf_eager_policy.PyTFEagerPolicy(
  tf_eval_policy, use_tf_function=True)

In [15]:
tf_collect_policy = tf_agent.collect_policy
collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
  tf_collect_policy, use_tf_function=True)

In [16]:
random_policy = random_py_policy.RandomPyPolicy(
  train_env_py.time_step_spec(), train_env_py.action_spec())

In [17]:
initial_collect_actor = actor.Actor(
  train_env_py,
  random_policy,
  train_step,
  steps_per_run=collect_steps_per_run,
  observers=[replay_buffer.add_batch])
initial_collect_actor.run()


KeyboardInterrupt



In [None]:
from tf_agents.metrics import py_metrics
# from tf_agents.train import learner
# import os
# import tempfile

# tempdir = tempfile.gettempdir()

env_step_metric = py_metrics.EnvironmentSteps()
collect_actor = actor.Actor(
  train_env_py,
  collect_policy,
  train_step,
  steps_per_run=1,
  metrics=actor.collect_metrics(10),
  observers=[replay_buffer, env_step_metric])

In [None]:
num_eval_episodes = 20 # @param {type:"integer"}

eval_actor = actor.Actor(
  train_env_py,
  eval_policy,
  train_step,
  episodes_per_run=num_eval_episodes,
  metrics=actor.eval_metrics(num_eval_episodes)
)

In [None]:
from tf_agents.train import triggers

policy_save_interval = 5000 # @param {type:"integer"}

saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)

# Triggers to save the agent's policy checkpoints.
learning_triggers = [
    triggers.PolicySavedModelTrigger(
        saved_model_dir,
        tf_agent,
        train_step,
        interval=policy_save_interval),
    triggers.StepPerSecondLogTrigger(train_step, interval=1000),
]

agent_learner = learner.Learner(
  tempdir,
  train_step,
  tf_agent,
  experience_dataset_fn,
  triggers=learning_triggers,
  strategy=strategy)

2022-10-28 13:16:04.633751: W tensorflow/core/framework/dataset.cc:769] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


In [None]:
def get_eval_metrics():
  eval_actor.run()
  results = {}
  for metric in eval_actor.metrics:
    results[metric.name] = metric.result()
  return results

metrics = get_eval_metrics()

  orientation=orientation.as_euler('ZXZ',degrees=True)


In [None]:
def log_eval_metrics(step, metrics):
  eval_results = (', ').join(
      '{} = {:.6f}'.format(name, result) for name, result in metrics.items())
  print('step = {0}: {1}'.format(step, eval_results))

log_eval_metrics(0, metrics)

step = 0: AverageReturn = 493.899994, AverageEpisodeLength = 1001.000000


In [None]:

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = get_eval_metrics()["AverageReturn"]
returns = [avg_return]

In [None]:
num_iterations = 10000 # @param {type:"integer"}
log_interval = 5000 # @param {type:"integer"}
eval_interval = 1000

for _ in range(num_iterations):
  # Training.
  collect_actor.run()
  loss_info = agent_learner.run(iterations=1)

  # Evaluating.
  step = agent_learner.train_step_numpy

  if eval_interval and step % eval_interval == 0:
    metrics = get_eval_metrics()
    log_eval_metrics(step, metrics)
    returns.append(metrics["AverageReturn"])

  if log_interval and step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, loss_info.loss.numpy()))

rb_observer.close()
reverb_server.stop()

[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (15529) so Table uniform_table is accessed directly without gRPC.


InvalidArgumentError: Graph execution error:

Detected at node 'CheckNumerics' defined at (most recent call last):
    File "/usr/lib/python3.10/threading.py", line 973, in _bootstrap
      self._bootstrap_inner()
    File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
      self.run()
    File "/home/anton/.local/lib/python3.10/site-packages/tf_agents/agents/tf_agent.py", line 330, in train
      loss_info = self._train_fn(
    File "/home/anton/.local/lib/python3.10/site-packages/tf_agents/utils/common.py", line 188, in with_check_resource_vars
      return fn(*fn_args, **fn_kwargs)
    File "/home/anton/.local/lib/python3.10/site-packages/tf_agents/agents/sac/sac_agent.py", line 323, in _train
      tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
Node: 'CheckNumerics'
Critic loss is inf or nan. : Tensor had Inf values
	 [[{{node CheckNumerics}}]] [Op:__inference__train_210142]

In [None]:
steps = range(0, num_iterations + 1, eval_interval)
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel('Step')
plt.ylim()