## Import libraries

In [1]:
import os
import pprint

import ray
from ray.rllib.algorithms.ppo import PPOConfig

from env.eehemt_env import EEHEMTEnv_Norm, tunable_params_config
import torch as th

## Define arguments

In [None]:
# Run settings consumed by the cells below.
# Input files: the CSV data file and the Verilog-A (.va) model file.
csv_file_path = "/home/u5977862/DRL-on-parameter-extraction/data/S25E02A025WS_25C_GMVG.csv"
va_file_path = "/home/u5977862/DRL-on-parameter-extraction/eehemt/eehemt114_2.va"

# Flag forwarded to the environment via its env_config.
test_modified = True

# Number of `algo.train()` calls performed by the training loop.
n_iterations = 1

In [3]:
# Size the learner group from the number of visible CUDA devices.
#
# The previous version only handled exactly 4 or exactly 2 GPUs, and the
# 2-GPU branch compared the *function object* `th.cuda.device_count` to 2
# (missing call parentheses), so it could never match. Any machine without
# exactly 4 GPUs therefore hit a NameError on the print below.
#
# Now: one learner per visible GPU, each owning one whole GPU (identical to the
# old behavior for 4 and for 2 GPUs). With no GPU we fall back to
# `num_learners = 0` and no GPU share, which RLlib documents as "run the
# learner locally in the main process" -- confirm for the installed Ray version.
gpu_count = th.cuda.device_count()
if gpu_count > 0:
    num_learners = gpu_count
    num_gpus_per_learner = 1.0
else:
    num_learners = 0
    num_gpus_per_learner = 0.0
print(f"num_learners: {num_learners}\nnum_gpus_per_learner: {num_gpus_per_learner}")

num_learners: 4
num_gpus_per_learner: 1.0


In [4]:
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.env.env_runner_group import EnvRunnerGroup
from ray.rllib.utils.typing import ResultDict


def run_and_plot_evaluation(
    algorithm: Algorithm, eval_workers: EnvRunnerGroup
) -> ResultDict:
    """Roll out one non-exploring episode, report the fit, and save the I-V plot.

    Args:
        algorithm: Algorithm whose policy selects the actions.
        eval_workers: Evaluation runner group; the environment held by its
            local worker is the one that gets reset and stepped.

    Returns:
        A dict with two entries: "final_episode_reward" (sum of all step
        rewards of the episode) and "final_rmspe" (the "current_rmspe" value
        from the last step's info dict).

    Side effects:
        Prints progress and the final tunable parameters, creates a
        "results" directory under the current working directory, and asks the
        environment to save the plot to "results/final_iv_curve.png".
    """
    # NOTE(review): `local_worker()` / `get_policy()` look like old-API-stack
    # accessors, and newer RLlib versions may expect a custom evaluation
    # function to return (metrics, env_steps, agent_steps) -- confirm against
    # the installed Ray version.
    print("\n--- Running final evaluation and plotting I-V curve... ---")

    # Grab the environment of the local evaluation worker and the policy.
    env = eval_workers.local_worker().env
    policy = algorithm.get_policy()

    # Play a single episode with exploration switched off.
    obs, info = env.reset()
    episode_return = 0.0
    done = False
    while not done:
        action, _, _ = policy.compute_single_action(observation=obs, explore=False)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_return += reward
        done = bool(terminated) or bool(truncated)

    print("Final evaluation episode finished.")
    print(f"Final RMSPE: {info['current_rmspe']:.6f}")
    print("Final Tunable Parameters:")
    # Keep only the parameters that are listed as tunable.
    current_params = info["current_params"]
    final_tunable_params = {}
    for param_name in tunable_params_config.keys():
        final_tunable_params[param_name] = current_params[param_name]
    pprint.pprint(final_tunable_params)

    # Make sure the output folder exists, then let the environment draw the
    # curve from its final state.
    output_dir = os.path.join(os.getcwd(), "results")
    os.makedirs(output_dir, exist_ok=True)
    env.plot_iv_curve(
        plot_initial=True,
        plot_modified=True,
        plot_current=True,
        save_path=os.path.join(output_dir, "final_iv_curve.png"),
    )

    # Summary metrics handed back to the caller.
    final_metrics = {
        "final_episode_reward": episode_return,
        "final_rmspe": info["current_rmspe"],
    }
    return final_metrics


In [7]:
# PPO configuration: environment, observation filtering, optimisation
# hyper-parameters, learner placement and the manual evaluation hook.
config = (
    PPOConfig()
    .environment(
        env=EEHEMTEnv_Norm,
        env_config=dict(
            csv_file_path=csv_file_path,
            tunable_params_config=tunable_params_config,
            va_file_path=va_file_path,
            test_modified=test_modified,
        ),
    )
    # Z-score normalisation of observations (preferred here over L2 norm).
    .env_runners(observation_filter="MeanStdFilter")
    .training(train_batch_size_per_learner=2000, lr=0.0004)
    .learners(
        num_learners=num_learners,
        num_gpus_per_learner=num_gpus_per_learner,
    )
    .framework("torch")
    .evaluation(
        # A single evaluation runner is enough for the plotting episode.
        evaluation_num_env_runners=1,
        # No periodic evaluation: `evaluate()` is invoked by hand later on.
        evaluation_interval=None,
        # Hook in the custom evaluate-and-plot routine.
        custom_evaluation_function=run_and_plot_evaluation,
        # Turn exploration off so evaluation is deterministic.
        evaluation_config={"explore": False},
    )
)

## Training

In [None]:
# Build the algorithm from the configuration defined above.
algo = config.build_algo()

# Run the training loop
for i in range(n_iterations):
    results = algo.train()

    # The mean episode return lives in different places depending on the RLlib
    # API stack: the new stack nests it as
    # results["env_runners"]["episode_return_mean"], the old stack exposes a
    # top-level results["episode_reward_mean"]. Indexing only the old key
    # raises KeyError on the new stack, so look in both places and degrade
    # gracefully when neither is present (e.g. no episode finished yet).
    env_runner_results = results.get("env_runners") or {}
    mean_return = env_runner_results.get(
        "episode_return_mean", results.get("episode_reward_mean")
    )

    print(f"--- Iteration: {i + 1}/{n_iterations} ---")
    if mean_return is None:
        print("Episode Reward Mean: n/a")
    else:
        print(f"Episode Reward Mean: {mean_return:.4f}")

print("\n--- Training completed. ---")

[2025-08-06 12:02:02,733 E 54475 54475] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: '70f14b6bee89ce8f000ae38001000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. See https://github.com/ray-project/ray/issues/53727 for more details.
[2025-08-06 12:02:02,766 E 54475 54475] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: '950e7c1b8192240292f4443e01000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. See https://github.com/ray-project/ray/issues/53727 for more details.
[2025-08-06 12:02:06,580 E 54475 54475] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: 'cff6b5c2ac22db0f04ad6eec01000000' has constructor arguments in the object store and max_restarts > 0. If the arguments

## Evaluation

In [None]:
# Run the algorithm's evaluation once and pretty-print whatever it returns.
final_results = algo.evaluate()
print(
    "\n--- Custom evaluation results ---",
    pprint.pformat(final_results),
    sep="\n",
)

In [None]:
# Save the algorithm state; `save_to_path` hands back the location it wrote
# to, which is what gets reported.
checkpoint_dir = algo.save_to_path(
    "/home/u5977862/DRL-on-parameter-extraction/result/1"
)
print(f"\nFinal algorithm checkpoint saved to: {checkpoint_dir}")

# Stop the algorithm first, then shut Ray down.
algo.stop()
ray.shutdown()
print("\n--- Script finished. ---")

saved algo to /home/u5977862/DRL-on-parameter-extraction/result
