In [3]:
"""
Trains PPO baseline agent.
"""
import os
from typing import Any

import ray
import yaml
from gymnasium.spaces import Discrete
from ray import air, tune
from ray.rllib.algorithms import ppo  # import the type of agents
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.policy.policy import PolicySpec

from mahrl.experiments.callback import CustomMetricsCallback
from mahrl.experiments.rewards import LossReward
from mahrl.grid2op_env.custom_environment import CustomizedGrid2OpEnvironment
from mahrl.multi_agent.policy import (
    DoNothingPolicy,
    SelectAgentPolicy,
    policy_mapping_fn,
)

ENV_NAME = "rte_case5_example"  # forwarded to the environment as env_config["env_name"]
ENV_IS_TEST = True  # forwarded as grid2op_kwargs["test"]
# Directory forwarded to the environment as env_config["lib_dir"].
# Set the MAHRL_LIB_DIR environment variable to point at another checkout
# instead of editing the notebook; the fallback is the original local path.
LIB_DIR = os.environ.get(
    "MAHRL_LIB_DIR", "/Users/barberademol/Documents/GitHub/mahrl_grid2op/"
)
# LIB_DIR = "/home/daddabarba/VirtualEnvs/mahrl/lib/python3.10/site-packages/grid2op/data"
RHO_THRESHOLD = 0.9  # not referenced by the cells visible in this notebook
NB_TSTEPS = 50000  # forwarded as env_config["max_tsteps"]
CHECKPOINT_FREQ = 1000  # not referenced by the cells visible in this notebook
VERBOSE = 1  # not referenced by the cells visible in this notebook

def _base_policy_config() -> AlgorithmConfig:
    """Return a fresh per-policy config carrying the settings shared by all policies.

    Every policy below disables the learner and RL-module APIs and uses
    epsilon-greedy exploration. A new ``AlgorithmConfig`` is built on each
    call so that no two policies share a config object.
    """
    return (
        AlgorithmConfig()
        .training(_enable_learner_api=False)
        .rl_module(_enable_rl_module_api=False)
        .exploration(exploration_config={"type": "EpsilonGreedy"})
    )


# Policy table handed to ``multi_agent(policies=...)``; only
# "reinforcement_learning_policy" is listed in ``policies_to_train``.
policies = {
    "high_level_policy": PolicySpec(  # chooses RL or do-nothing agent
        policy_class=SelectAgentPolicy,
        observation_space=None,  # infer automatically from env
        action_space=Discrete(2),  # choose one of agents
        # Same base settings as the other policies, plus no preprocessor preference.
        config=_base_policy_config().rollouts(preprocessor_pref=None),
    ),
    "reinforcement_learning_policy": PolicySpec(  # performs RL topology
        policy_class=None,  # use default policy of PPO
        observation_space=None,  # infer automatically from env
        action_space=None,  # infer automatically from env
        config=_base_policy_config(),
    ),
    "do_nothing_policy": PolicySpec(  # performs do-nothing action
        policy_class=DoNothingPolicy,
        observation_space=None,  # infer automatically from env --TODO not actually needed
        action_space=Discrete(1),  # only perform do-nothing
        config=_base_policy_config(),
    ),
}

# Assemble the PPO configuration as a single fluent chain. Each builder
# method updates the config and hands it back, so chaining is equivalent to
# calling the methods one statement at a time.
ppo_config = (
    ppo.PPOConfig()
    .training(
        _enable_learner_api=False,
        gamma=0.99,
        lr=0.00005,
        # Hyper-parameter sweep alternatives:
        # gamma=tune.grid_search([0.9, 0.99, 0.999]),
        # lr=tune.grid_search([0.0003, 0.003, 0.03]),
        vf_loss_coeff=0.5,
        entropy_coeff=0.01,
        clip_param=0.2,
        lambda_=0.95,
        sgd_minibatch_size=32,
        train_batch_size=128,
        # lambda_=tune.grid_search([0.9, 0.95, 0.999]),
        # sgd_minibatch_size=tune.grid_search([32, 64, 128]),
        # train_batch_size=tune.grid_search([32, 64, 128]),
        # seed=14,
        model={"fcnet_hiddens": [256, 256]},
    )
    .environment(
        env=CustomizedGrid2OpEnvironment,
        env_config={
            "env_name": ENV_NAME,
            "num_agents": len(policies),
            "action_space": "tennet",
            "lib_dir": LIB_DIR,
            "max_tsteps": NB_TSTEPS,
            "grid2op_kwargs": {
                "test": ENV_IS_TEST,
                # "reward_class": Reward.L2RPNReward,
                "reward_class": LossReward,
            },
        },
    )
    .multi_agent(
        policies=policies,
        policy_mapping_fn=policy_mapping_fn,
        # Only the RL policy is trained.
        policies_to_train=["reinforcement_learning_policy"],
    )
    .framework(framework="torch")
    .rl_module(_enable_rl_module_api=False)
    .exploration(exploration_config={"type": "EpsilonGreedy"})
    .callbacks(CustomMetricsCallback)
)



# Serialise the assembled PPO config object into a YAML document.
yaml_string = yaml.dump(ppo_config)

# Persist the document next to the notebook (relative to the working directory).
with open(file="ppo_config.yaml", mode="w") as yaml_file:
    yaml_file.write(yaml_string)




In [4]:
from numpy import save
import yaml
from ray import tune
    
from gymnasium.spaces.discrete import Discrete
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.policy.policy import PolicySpec

# Custom constructor for Discrete
def discrete_constructor(loader, node):
    """Build a ``Discrete`` space from a scalar YAML node.

    The node's scalar value is converted with ``int`` and passed as the
    single argument to ``Discrete``.
    """
    size = int(loader.construct_scalar(node))
    return Discrete(size)

# Custom constructor for AlgorithmConfig
def algorithm_config_constructor(loader, node):
    """Return a default ``AlgorithmConfig`` for a YAML mapping node.

    The node is still parsed as a mapping, so nested tagged values are
    constructed and a non-mapping node is rejected by the loader, but the
    parsed content is not applied to the returned config.
    """
    # NOTE(review): the mapping's values are discarded here; confirm whether
    # they were meant to be applied to the returned AlgorithmConfig.
    loader.construct_mapping(node)
    return AlgorithmConfig()

# Custom constructor for PolicySpec
def policy_spec_constructor(loader, node):
    """Build a ``PolicySpec`` from a YAML mapping node.

    The mapping's entries are forwarded to ``PolicySpec`` as keyword
    arguments. Previously the mapping was parsed and then thrown away, so
    every loaded spec came back empty.

    Raises:
        TypeError: if the mapping contains a key ``PolicySpec`` does not accept.
    """
    # deep=True makes PyYAML finish building nested dicts/lists before they
    # are handed to PolicySpec; without it they are filled in only later.
    fields = loader.construct_mapping(node, deep=True)
    # A bare `None` in YAML is read as the string "None" (YAML spells null as
    # `null` or `~`), so map that spelling back to a real None.
    spec_kwargs = {
        key: (None if isinstance(value, str) and value == "None" else value)
        for key, value in fields.items()
    }
    return PolicySpec(**spec_kwargs)

# Custom constructor for CustomizedGrid2OpEnvironment
def customized_environment_constructor(loader, node):
    """Instantiate ``CustomizedGrid2OpEnvironment`` from a YAML mapping node.

    The mapping is built recursively and passed to the environment as keyword
    arguments. An ``env_config`` keyword is always supplied: an empty dict is
    used when the mapping has no such key.
    """
    kwargs = loader.construct_mapping(node, deep=True)
    # Leaves an existing 'env_config' entry untouched, inserts {} otherwise.
    kwargs.setdefault('env_config', {})
    return CustomizedGrid2OpEnvironment(**kwargs)

# Custom constructor for LossReward
def loss_reward_constructor(loader, node):
    """Return a new ``LossReward`` instance; the node's content is ignored."""
    # NOTE(review): the Python-built config passes the LossReward class itself
    # as "reward_class", whereas this returns an instance. Confirm which one
    # the consumer of the YAML expects.
    reward = LossReward()
    return reward

# Custom constructor for policy_mapping_fn
def policy_mapping_fn_constructor(loader, node):
    """Resolve the node to the imported ``policy_mapping_fn`` callable.

    The function object itself is returned (it is not called), and the
    node's content is ignored.
    """
    mapping_fn = policy_mapping_fn
    return mapping_fn

# Custom constructor for CustomMetricsCallback
def custom_metrics_callback_constructor(loader, node):
    """Resolve the node to the ``CustomMetricsCallback`` class.

    The class itself is returned rather than an instance, and the node's
    content is ignored.
    """
    callback_cls = CustomMetricsCallback
    return callback_cls

# Custom constructor for SelectAgentPolicy
def select_agent_policy_constructor(loader, node):
    """Instantiate ``SelectAgentPolicy`` from a YAML mapping node.

    The mapping's entries are passed to ``SelectAgentPolicy`` as keyword
    arguments.
    """
    # deep=True is required because the policy is built immediately: with the
    # default shallow construction PyYAML fills nested dicts/lists only after
    # this function returns, so the policy would receive empty containers.
    fields = loader.construct_mapping(node, deep=True)
    return SelectAgentPolicy(**fields)

# Custom constructor for DoNothingPolicy
def do_nothing_policy_constructor(loader, node):
    """Instantiate ``DoNothingPolicy`` from a YAML mapping node.

    The mapping's entries are passed to ``DoNothingPolicy`` as keyword
    arguments.
    """
    # deep=True is required because the policy is built immediately: with the
    # default shallow construction PyYAML fills nested dicts/lists only after
    # this function returns, so the policy would receive empty containers.
    fields = loader.construct_mapping(node, deep=True)
    return DoNothingPolicy(**fields)

# Register one constructor per custom YAML tag used by the configuration file.
yaml.add_constructor('!CustomizedGrid2OpEnvironment', customized_environment_constructor)
yaml.add_constructor('!LossReward', loss_reward_constructor)
yaml.add_constructor('!policy_mapping_fn', policy_mapping_fn_constructor)
yaml.add_constructor('!CustomMetricsCallback', custom_metrics_callback_constructor)
yaml.add_constructor('!SelectAgentPolicy', select_agent_policy_constructor)
yaml.add_constructor('!DoNothingPolicy', do_nothing_policy_constructor)
yaml.add_constructor('!Discrete', discrete_constructor)
yaml.add_constructor('!AlgorithmConfig', algorithm_config_constructor)
yaml.add_constructor('!PolicySpec', policy_spec_constructor)

# Load the baseline configuration. The context manager closes the file
# handle, which the previous bare `open(...)` call left open.
# NOTE(review): this loader runs the constructors registered above, so only
# load configuration files from a trusted source.
with open(
    "/Users/barberademol/Documents/GitHub/mahrl_grid2op/experiments/configurations/ppo_baseline.yaml"
) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)



{'action_space': Discrete(2), 'observation_space': 'None', 'config': <ray.rllib.algorithms.algorithm_config.AlgorithmConfig object at 0x2898016f0>}


In [5]:

# mapper = {
#     "callbacks":{
#         "LogDistributionsCallback": LogDistributionsCallback
#     },
#     "env":{
#         "Grid_Gym": Grid_Gym
#     }
# }
# def preprocess_config(config):
#     """
#     Transform the string representations of classes in YAML
#     files to the corresponding python objects.

#     Args:
#         config (dict): parsed YAML config file
#     """
#     if "callbacks" in config["tune_config"]:
#         config["tune_config"]["callbacks"] = mapper["callbacks"][config["tune_config"]["callbacks"]]
#     try:
#         config["tune_config"]["env"] = mapper["env"][config["tune_config"]["env"]]
#     except: # if the env is not in the mapper, it is an already registered env
#         pass


#     return config

# config = preprocess_config(yaml.load(open(args.algorithm_config_path), Loader=get_loader()))["tune_config"]