In [1]:
import ray

In [2]:
from rldm.utils import system_tools as st
n_cpus, n_gpus = st.get_cpu_gpu_count()
debug = False
ray.init(num_cpus=n_cpus, num_gpus=n_gpus, local_mode=debug)



{'node_ip_address': '172.17.0.2',
 'raylet_ip_address': '172.17.0.2',
 'redis_address': '172.17.0.2:18815',
 'object_store_address': '/tmp/ray/session_2022-07-05_06-57-07_192528_17353/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-07-05_06-57-07_192528_17353/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-07-05_06-57-07_192528_17353',
 'metrics_export_port': 63541,
 'node_id': 'b895acec01f5fc71ce3727ff2acfae37c57d4b3efb931badbb827316'}

In [21]:
from ray import tune

## register the env

In [14]:
from gfootball import env as fe
from rldm.utils import football_tools as ft
from ray.tune.registry import register_env
import numpy as np
num_players = 3
shared_policy = False
n_policies = 1 if shared_policy else num_players - 1 # hard-coding
env_name = ft.n_players_to_env_name(num_players, True)
register_env(env_name, lambda _: ft.RllibGFootball(env_name=env_name))

## configuration for the experiment 

In [19]:
obs_space, act_space = ft.get_obs_act_space(env_name)

def gen_policy(idx):
    return (None, obs_space[f'player_{idx}'], act_space[f'player_{idx}'], {})

policies = {
        'agent_{}'.format(idx): gen_policy(idx) for idx in range(n_policies)
    }

policy_ids = list(policies.keys())

policy_mapping_fn = lambda agent_id, episode, **kwargs: \
        policy_ids[0 if len(policy_ids) == 1 else int(agent_id.split('_')[1])]
#in case of using a indiviual policy
default_multiagent = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
    }
#in case of using a shared policy 
shared_policy = {'agent_0': gen_policy(0)}
shared_policy_mapping_fn = lambda agent_id, episode, **kwargs: 'agent_0'
shared_multiagent = {
    'policies': shared_policy,
    'policy_mapping_fn': shared_policy_mapping_fn,
}

## SAC

In [35]:
from rldm.utils.collection_tools import deep_merge
import random
use_tune_config = True
config = {"env":env_name,
           "evaluation_interval":10,
           "evaluation_num_episodes":20,
            "rollout_fragment_length": 100,
          "train_batch_size": 2_800,
           "num_gpus":n_gpus,
           "num_workers":n_cpus-1,
           'multiagent': default_multiagent,}

if use_tune_config:
    tune_config = {
        "Q_model": {
            "fcnet_hiddens": tune.sample_from(
                    lambda _: random.sample([
                        [256, 256],
                        [128, 256],
                        [256, 128],
                        [128, 128],
                    ], 1)[0])
        },
        
        "policy_model": {
            "fcnet_hiddens": tune.sample_from(
                    lambda _: random.sample([
                        [256, 256],
                        [128, 256],
                        [256, 128],
                        [128, 128],
                    ], 1)[0])
        },
        "tau": tune.uniform(0.00005, 0.0005),
        # === Optimization ===
        "optimization": {
            "actor_learning_rate": tune.uniform(0.001, 1e-5),
            "critic_learning_rate": tune.uniform(0.001, 1e-5),
            "entropy_learning_rate": tune.uniform(0.001, 1e-5),
        },
        "_deterministic_loss": tune.choice(["tanh", "relu"]),
        
        'multiagent': tune.choice([default_multiagent, shared_multiagent]),
    }
    config = deep_merge(config, tune_config)


{'env': '3_vs_3_auto_GK', 'evaluation_interval': 10, 'evaluation_num_episodes': 20, 'rollout_fragment_length': 100, 'train_batch_size': 2800, 'num_gpus': 1, 'num_workers': 19, 'multiagent': <ray.tune.sample.Categorical object at 0x7f41fdd86750>, 'Q_model': {'fcnet_hiddens': <ray.tune.sample.Function object at 0x7f41fddbb9d0>}, 'policy_model': {'fcnet_hiddens': <ray.tune.sample.Function object at 0x7f41fddbbf10>}, 'tau': <ray.tune.sample.Float object at 0x7f41fddbbe90>, 'optimization': {'actor_learning_rate': <ray.tune.sample.Float object at 0x7f41fddbbf50>, 'critic_learning_rate': <ray.tune.sample.Float object at 0x7f41fdd86a50>, 'entropy_learning_rate': <ray.tune.sample.Float object at 0x7f41fdd86910>}, '_deterministic_loss': <ray.tune.sample.Categorical object at 0x7f41fdd86e10>}


## add a scheduler to terminate any bad trial

In [46]:
from ray.tune.schedulers import ASHAScheduler
use_callbacks = False
if use_callbacks:
    config['callbacks'] = ft.FootballCallbacks

use_scheduler = True
n_timesteps =20_000_000
scheduler = None
stop = {
    "timesteps_total": n_timesteps,
}
if use_scheduler: 
    scheduler = ASHAScheduler(
        time_attr='timesteps_total',
        metric='episode_reward_mean',
        mode='max',
        max_t=n_timesteps,
        grace_period=int(n_timesteps*0.10),
        reduction_factor=3,
        brackets=1)
    stop = None

NameError: name '__file__' is not defined

In [47]:
import os 
filename_stem = os.path.basename(__file__).split(".")[0]
policy_type = 'search' if use_tune_config else \
    'shared' if n_policies == 1 else 'independent'
scheduler_type = 'asha' if use_scheduler else 'fifo'
config_type = 'tune' if use_tune_config else 'fixed'
experiment_name =f"{filename_stem}_{env_name}_{policy_type}_{n_timesteps}_{scheduler_type}_{config_type}"
script_dir = os.path.dirname(os.path.realpath(__file__))
local_dir = os.path.join(script_dir, '..', '..', 'logs')
print(experiment_name)

3_vs_3_auto_GK_search_20000000_asha_tune


In [None]:
n_samples = 8

In [36]:
a = tune.run("SAC",
        name=experiment_name,
        reuse_actors=False,
        scheduler=scheduler,
        raise_on_failed_trial=True,
        fail_fast=True,
        max_failures=0,
        num_samples=n_samples,
        stop=stop,
        checkpoint_freq=100,
        checkpoint_at_end=True,
        local_dir=local_dir,
        config=config,
        verbose=1 if not debug else 3
        )

Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau
SAC_3_vs_3_auto_GK_052d0_00000,PENDING,,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781


[2m[36m(pid=22153)[0m 2022-07-05 07:56:51,742	INFO trainer.py:714 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=22153)[0m 2022-07-05 07:56:51,743	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=22153)[0m 2022-07-05 07:57:24,274	INFO trainable.py:109 -- Trainable.setup took 32.571 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for SAC_3_vs_3_auto_GK_052d0_00000:
  agent_timesteps_total: 3800
  custom_metrics: {}
  date: 2022-07-05_07-57-27
  done: false
  episode_len_mean: .nan
  episode_media: {}
  episode_reward_max: .nan
  episode_reward_mean: .nan
  episode_reward_min: .nan
  episodes_this_iter: 0
  episodes_total: 0
  experiment_id: 364e664aea254cfa92b79839278a231c
  hostname: b87bc6aeba4a
  info:
    last_target_update_ts: 1900
    learner:
      agent_0:
        learner_stats:
          actor_loss: -2.9440126419067383
          alpha_loss: 0.0
          alpha_value: 1.0
          critic_loss: 2.4138424396514893
          max_q: 0.012492471374571323
          mean_q: -0.0004241162387188524
          mean_td_error: 2.9138424396514893
          min_q: -0.015866102650761604
          model: {}
          target_entropy: 2.8855501556396486
        td_error: '[2.9125385 2.9140475 2.9143581 ... 2.9121308 2.9132519 2.9149926]'
        train: null
      agent_1:
        learner_stats:
          actor_los

Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_3_vs_3_auto_GK_052d0_00000,RUNNING,172.17.0.2:22153,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781,1,3.47283,1900,,,,


Result for SAC_3_vs_3_auto_GK_052d0_00000:
  agent_timesteps_total: 15200
  custom_metrics: {}
  date: 2022-07-05_07-57-34
  done: false
  episode_len_mean: 158.69565217391303
  episode_media: {}
  episode_reward_max: 0.20000000298023224
  episode_reward_mean: -0.5043478258278059
  episode_reward_min: -2.0
  episodes_this_iter: 7
  episodes_total: 23
  experiment_id: 364e664aea254cfa92b79839278a231c
  hostname: b87bc6aeba4a
  info:
    last_target_update_ts: 7600
    learner:
      agent_0:
        learner_stats:
          actor_loss: -2.952843427658081
          alpha_loss: -1.7664715414866805e-05
          alpha_value: 0.9997000694274902
          critic_loss: 2.395547866821289
          max_q: 0.07300771772861481
          mean_q: 0.009286877699196339
          mean_td_error: 2.8950254917144775
          min_q: 0.0038774393033236265
          model: {}
          target_entropy: 2.8855501556396486
        td_error: '[2.903942  2.9058862 2.9039352 ... 2.9019675 2.904931  2.9013946]'
 

Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_3_vs_3_auto_GK_052d0_00000,RUNNING,172.17.0.2:22153,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781,4,9.60813,7600,-0.504348,0.2,-2,158.696


Result for SAC_3_vs_3_auto_GK_052d0_00000:
  agent_timesteps_total: 26600
  custom_metrics: {}
  date: 2022-07-05_07-57-40
  done: false
  episode_len_mean: 208.9111111111111
  episode_media: {}
  episode_reward_max: 0.20000000298023224
  episode_reward_mean: -0.3466666665342119
  episode_reward_min: -2.0
  episodes_this_iter: 5
  episodes_total: 45
  experiment_id: 364e664aea254cfa92b79839278a231c
  hostname: b87bc6aeba4a
  info:
    last_target_update_ts: 13300
    learner:
      agent_0:
        learner_stats:
          actor_loss: -2.9620001316070557
          alpha_loss: -3.533079507178627e-05
          alpha_value: 0.9994001388549805
          critic_loss: 2.3870372772216797
          max_q: 0.12748725712299347
          mean_q: 0.01932591013610363
          mean_td_error: 2.8860080242156982
          min_q: 0.012125525623559952
          model: {}
          target_entropy: 2.8855501556396486
        td_error: '[2.8946853 2.8960662 2.8965225 ... 2.8918705 2.896205  2.8940802]'
  

Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_3_vs_3_auto_GK_052d0_00000,RUNNING,172.17.0.2:22153,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781,7,16.0345,13300,-0.346667,0.2,-2,208.911




Result for SAC_3_vs_3_auto_GK_052d0_00000:
  agent_timesteps_total: 38000
  custom_metrics: {}
  date: 2022-07-05_07-58-51
  done: false
  episode_len_mean: 225.95774647887325
  episode_media: {}
  episode_reward_max: 0.30000000447034836
  episode_reward_mean: -0.3478873236708238
  episode_reward_min: -2.0
  episodes_this_iter: 12
  episodes_total: 71
  evaluation:
    custom_metrics: {}
    episode_len_mean: 225.52
    episode_media: {}
    episode_reward_max: 0.0
    episode_reward_mean: -0.7919999998807907
    episode_reward_min: -2.0
    episodes_this_iter: 25
    hist_stats:
      episode_lengths:
      - 242
      - 104
      - 384
      - 120
      - 257
      - 72
      - 192
      - 188
      - 174
      - 200
      - 435
      - 77
      - 501
      - 78
      - 230
      - 148
      - 501
      - 78
      - 409
      - 147
      - 203
      - 79
      - 124
      - 194
      - 501
      episode_reward:
      - 0.0
      - 0.0
      - 0.0
      - -2.0
      - -2.0
      - 0.0

Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_3_vs_3_auto_GK_052d0_00000,RUNNING,172.17.0.2:22153,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781,10,86.7453,19000,-0.347887,0.3,-2,225.958


Trial name,status,loc,Q_model/fcnet_hiddens,_deterministic_loss,multiagent,optimization/actor_learning_rate,optimization/critic_learning_rate,optimization/entropy_learning_rate,policy_model/fcnet_hiddens,tau,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_3_vs_3_auto_GK_052d0_00000,RUNNING,172.17.0.2:22153,"[128, 256]",tanh,"{'policies': {'agent_0': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {}), 'agent_1': (None, Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf  -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf  inf inf inf inf inf inf inf], (43,), float32), Discrete(19), {})}, 'policy_mapping_fn': <function <lambda> at 0x7f42000e1200>}",0.000844197,0.000504442,0.000405481,"[256, 256]",0.000197781,10,86.7453,19000,-0.347887,0.3,-2,225.958


[2m[36m(pid=22153)[0m 2022-07-05 07:58:51,645	ERROR worker.py:428 -- SystemExit was raised from the worker
[2m[36m(pid=22153)[0m Traceback (most recent call last):
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.task_execution_handler
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 488, in ray._raylet.execute_task
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 525, in ray._raylet.execute_task
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 532, in ray._raylet.execute_task
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 536, in ray._raylet.execute_task
[2m[36m(pid=22153)[0m   File "python/ray/_raylet.pyx", line 486, in ray._raylet.execute_task.function_executor
[2m[36m(pid=22153)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/_private/function_manager.py", line 563, in actor_method_executor
[2m[36m(pid=22153)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(p

2022-07-05 07:58:51,843	INFO tune.py:561 -- Total run time: 124.16 seconds (123.89 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f42014a9d10>

In [None]:
checkpoint_path = a.get_best_checkpoint(a.get_best_trial("episode_reward_mean", "max"), "episode_reward_mean", "max")
print('Best checkpoint found:', checkpoint_path)
ray.shutdown()