## Base-stock policy

In [1]:
import numpy as np
import ray
from ray.tune.registry import register_env
from ray.rllib.policy import Policy
from config import env_configs
from env import env_creator


class BaseStockPolicy(Policy):
    """
    Non-learning heuristic policy that orders up to a target inventory level.

    The target level for a stage is its production capacity (``prod_capacity``).
    The order placed is the gap between that target and the current inventory
    position, clipped to the range ``[0, env.max_production]``:

        position = inventory + upstream_backlog + sum(deliveries)
        order    = min(max(prod_capacity - position, 0), max_production)
    """

    def __init__(self, observation_space, action_space, config):
        super().__init__(observation_space, action_space, config)
        # A private environment is built from the config; its per-agent spaces
        # replace the spaces that were passed to the constructor.
        self.env = env_creator(config['env_config'])
        self.observation_space = self.env.agent_observation_space
        self.action_space = self.env.agent_action_space

    def compute_actions(
        self, obs_batch, state_batches, prev_action_batch=None, prev_reward_batch=None, info_batch=None,
        episodes=None, **kwargs):
        """
        Return one base-stock order per observation in ``obs_batch``.

        Observations that are already dictionaries are used directly; anything
        else is decoded with ``decode_obs`` and then handed to
        ``self.env._parse_state``.

        Returns:
            A tuple ``(actions, [], {})``: the list of orders, an empty state
            list and an empty info dict.
        """
        # Pass 1: undo the one-hot encoding where necessary.
        decoded_batch = [self.decode_obs(raw) for raw in obs_batch]
        # Pass 2: turn every non-dict observation into a parsed state.
        state_batch = [
            item if isinstance(item, dict) else self.env._parse_state(item)
            for item in decoded_batch
        ]
        # Pass 3: apply the heuristic to each parsed state.
        orders = [self.get_base_stock_action(state) for state in state_batch]
        return orders, [], {}

    def decode_obs(self, obs):
        """
        Convert a concatenated one-hot observation into MultiDiscrete values.

        The observation is cut into consecutive segments whose lengths are given
        by ``self.observation_space.nvec``; the argmax of each segment is the
        decoded value. Dictionary observations are returned unchanged.
        """
        if isinstance(obs, dict):
            return obs

        values = []
        start = 0
        for width in self.observation_space.nvec:
            stop = start + width
            values.append(np.argmax(obs[start:stop]))
            start = stop
        return np.array(values)

    def get_base_stock_action(self, obs: dict, safety_ratio: float = 1.5) -> int:
        """
        Compute the base-stock order for a single parsed observation.

        Args:
            obs: Mapping providing ``prod_capacity``, ``inventory``,
                ``upstream_backlog`` and ``deliveries``.
            safety_ratio: Accepted for interface compatibility; it is not used
                in the computation.

        Returns:
            The order quantity, bounded below by 0 and above by
            ``self.env.max_production``.
        """
        # The order-up-to level is exactly one times the production capacity.
        capacity_multiplier = 1
        target_level = int(capacity_multiplier * obs['prod_capacity'])
        inventory_position = obs['inventory'] + obs['upstream_backlog'] + np.sum(obs['deliveries'])
        shortfall = target_level - inventory_position
        non_negative_order = max(0, shortfall)
        return min(non_negative_order, self.env.max_production)

    def learn_on_batch(self, samples):
        """Do nothing (the heuristic has no trainable parameters); return an empty dict."""
        return {}

    def get_weights(self):
        """Return an empty dict, since the heuristic has no weights."""
        return {}

    def set_weights(self, weights):
        """Ignore ``weights``; the heuristic has nothing to load."""
        pass


def evaluate_policy(env_config_name: str, env_config: dict, num_episodes: int = 10):
    """
    Evaluate the BaseStockPolicy by rolling it out for ``num_episodes`` episodes.

    For the ``"stochastic_demand"`` configuration a new demand series is
    generated at the start of every episode (after the environment reset).
    Per-episode totals and the final mean / standard deviation are printed.

    Args:
        env_config_name: Name of the configuration; used for the stochastic
            demand check and in the printed summary.
        env_config: Environment configuration passed to ``env_creator``. For
            the stochastic case it must provide a ``"demand_fn"`` entry with a
            ``generate_new_series()`` method.
        num_episodes: Number of episodes to simulate.
    """
    ray.init(ignore_reinit_error=True)
    try:
        register_env("InventoryManagementEnv", env_creator)
        np.random.seed(0)

        env_instance = env_creator(env_config)
        agent_observation_space = env_instance.agent_observation_space
        agent_action_space = env_instance.agent_action_space

        # Instantiate the policy, then make it share the simulation environment
        # instead of the private one created in its constructor.
        policy_config = {"env_config": env_config}
        base_stock_policy = BaseStockPolicy(agent_observation_space, agent_action_space, policy_config)
        base_stock_policy.env = env_instance

        episode_rewards = []
        for ep in range(num_episodes):
            env_instance.reset()
            if env_config_name == "stochastic_demand":
                print(f"Episode {ep+1}: Generating a new stochastic demand series...")
                env_config["demand_fn"].generate_new_series()
            episode_reward = 0
            for _ in range(env_instance.num_periods):
                # One observation per stage, in stage order.
                state_dict = env_instance.parse_state(env_instance.state_dict)
                obs_batch = [state_dict[f'stage_{i}'] for i in range(env_instance.num_stages)]
                actions, _, _ = base_stock_policy.compute_actions(obs_batch, state_batches=None)
                action_dict = {f'stage_{i}': actions[i] for i in range(env_instance.num_stages)}
                # Only the rewards are needed; the other step outputs are ignored.
                _, rewards, _, _, _ = env_instance.step(action_dict)
                episode_reward += sum(rewards.values())
            episode_rewards.append(episode_reward)
            print(f"Episode {ep+1}: Total Reward = {episode_reward}")

        episode_reward_mean = np.mean(episode_rewards)
        episode_reward_std = np.std(episode_rewards)
        print(f"env_config_name = {env_config_name}, num_episodes = {num_episodes}, "
              f"episode_reward_mean = {episode_reward_mean:.2f}, episode_reward_std = {episode_reward_std:.2f}")
    finally:
        # Always release the local Ray instance, even if the rollout fails or
        # is interrupted part-way through.
        ray.shutdown()


# Script entry point: evaluate the base-stock policy on every configuration
# in env_configs, using 100 episodes per configuration.
if __name__ == '__main__':
    for env_config_name, env_config in env_configs.items():
        evaluate_policy(env_config_name, env_config, num_episodes=100)


Variable demand for t=0: 4


2025-03-29 12:04:04,074	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -296
Episode 2: Total Reward = -296
Episode 3: Total Reward = -296
Episode 4: Total Reward = -296
Episode 5: Total Reward = -296
Episode 6: Total Reward = -296
Episode 7: Total Reward = -296
Episode 8: Total Reward = -296
Episode 9: Total Reward = -296
Episode 10: Total Reward = -296
Episode 11: Total Reward = -296
Episode 12: Total Reward = -296
Episode 13: Total Reward = -296
Episode 14: Total Reward = -296
Episode 15: Total Reward = -296
Episode 16: Total Reward = -296
Episode 17: Total Reward = -296
Episode 18: Total Reward = -296
Episode 19: Total Reward = -296
Episode 20: Total Reward = -296
Episode 21: Total Reward = -296
Episode 22: Total Reward = -296
Episode 23: Total Reward = -296
Episode 24: Total Reward = -296
Episode 25: Total Reward = -296
Episode 26: Total Reward = -296
Episode 27: Total Reward = -296
Episode 28: Total Reward = -296
Episode 29: Total Reward = -296
Episode 30: Total Reward = -296
Episode 31: Total Reward = -296
Episode 32: Total

2025-03-29 12:04:11,050	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -486
Episode 2: Total Reward = -600
Episode 3: Total Reward = -554
Episode 4: Total Reward = -484
Episode 5: Total Reward = -582
Episode 6: Total Reward = -461
Episode 7: Total Reward = -454
Episode 8: Total Reward = -580
Episode 9: Total Reward = -525
Episode 10: Total Reward = -496
Episode 11: Total Reward = -583
Episode 12: Total Reward = -612
Episode 13: Total Reward = -458
Episode 14: Total Reward = -634
Episode 15: Total Reward = -592
Episode 16: Total Reward = -515
Episode 17: Total Reward = -461
Episode 18: Total Reward = -510
Episode 19: Total Reward = -564
Episode 20: Total Reward = -595
Episode 21: Total Reward = -485
Episode 22: Total Reward = -484
Episode 23: Total Reward = -553
Episode 24: Total Reward = -515
Episode 25: Total Reward = -521
Episode 26: Total Reward = -502
Episode 27: Total Reward = -484
Episode 28: Total Reward = -500
Episode 29: Total Reward = -619
Episode 30: Total Reward = -556
Episode 31: Total Reward = -490
Episode 32: Total

2025-03-29 12:04:17,775	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -404
Episode 2: Total Reward = -271
Episode 3: Total Reward = -532
Episode 4: Total Reward = -588
Episode 5: Total Reward = -366
Episode 6: Total Reward = -446
Episode 7: Total Reward = -435
Episode 8: Total Reward = -645
Episode 9: Total Reward = -604
Episode 10: Total Reward = -522
Episode 11: Total Reward = -284
Episode 12: Total Reward = -400
Episode 13: Total Reward = -397
Episode 14: Total Reward = -257
Episode 15: Total Reward = -281
Episode 16: Total Reward = -356
Episode 17: Total Reward = -391
Episode 18: Total Reward = -552
Episode 19: Total Reward = -605
Episode 20: Total Reward = -326
Episode 21: Total Reward = -628
Episode 22: Total Reward = -289
Episode 23: Total Reward = -430
Episode 24: Total Reward = -316
Episode 25: Total Reward = -427
Episode 26: Total Reward = -381
Episode 27: Total Reward = -346
Episode 28: Total Reward = -341
Episode 29: Total Reward = -349
Episode 30: Total Reward = -600
Episode 31: Total Reward = -529
Episode 32: Total

2025-03-29 12:04:24,938	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -203
Episode 2: Total Reward = -278
Episode 3: Total Reward = -279
Episode 4: Total Reward = -278
Episode 5: Total Reward = -311
Episode 6: Total Reward = -327
Episode 7: Total Reward = -294
Episode 8: Total Reward = -262
Episode 9: Total Reward = -230
Episode 10: Total Reward = -309
Episode 11: Total Reward = -235
Episode 12: Total Reward = -261
Episode 13: Total Reward = -213
Episode 14: Total Reward = -233
Episode 15: Total Reward = -327
Episode 16: Total Reward = -281
Episode 17: Total Reward = -229
Episode 18: Total Reward = -367
Episode 19: Total Reward = -300
Episode 20: Total Reward = -329
Episode 21: Total Reward = -282
Episode 22: Total Reward = -286
Episode 23: Total Reward = -251
Episode 24: Total Reward = -250
Episode 25: Total Reward = -307
Episode 26: Total Reward = -276
Episode 27: Total Reward = -360
Episode 28: Total Reward = -270
Episode 29: Total Reward = -196
Episode 30: Total Reward = -235
Episode 31: Total Reward = -258
Episode 32: Total

2025-03-29 12:04:31,920	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -92
Episode 2: Total Reward = -284
Episode 3: Total Reward = -251
Episode 4: Total Reward = -382
Episode 5: Total Reward = -479
Episode 6: Total Reward = -481
Episode 7: Total Reward = -433
Episode 8: Total Reward = -114
Episode 9: Total Reward = -249
Episode 10: Total Reward = -66
Episode 11: Total Reward = -258
Episode 12: Total Reward = -353
Episode 13: Total Reward = -274
Episode 14: Total Reward = -371
Episode 15: Total Reward = -298
Episode 16: Total Reward = -522
Episode 17: Total Reward = -163
Episode 18: Total Reward = -356
Episode 19: Total Reward = -322
Episode 20: Total Reward = -424
Episode 21: Total Reward = -416
Episode 22: Total Reward = -449
Episode 23: Total Reward = -327
Episode 24: Total Reward = -214
Episode 25: Total Reward = -192
Episode 26: Total Reward = -349
Episode 27: Total Reward = -471
Episode 28: Total Reward = -344
Episode 29: Total Reward = -371
Episode 30: Total Reward = -430
Episode 31: Total Reward = -315
Episode 32: Total R

2025-03-29 12:04:38,946	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = 43
Episode 2: Total Reward = 43
Episode 3: Total Reward = 43
Episode 4: Total Reward = 43
Episode 5: Total Reward = 43
Episode 6: Total Reward = 43
Episode 7: Total Reward = 43
Episode 8: Total Reward = 43
Episode 9: Total Reward = 43
Episode 10: Total Reward = 43
Episode 11: Total Reward = 43
Episode 12: Total Reward = 43
Episode 13: Total Reward = 43
Episode 14: Total Reward = 43
Episode 15: Total Reward = 43
Episode 16: Total Reward = 43
Episode 17: Total Reward = 43
Episode 18: Total Reward = 43
Episode 19: Total Reward = 43
Episode 20: Total Reward = 43
Episode 21: Total Reward = 43
Episode 22: Total Reward = 43
Episode 23: Total Reward = 43
Episode 24: Total Reward = 43
Episode 25: Total Reward = 43
Episode 26: Total Reward = 43
Episode 27: Total Reward = 43
Episode 28: Total Reward = 43
Episode 29: Total Reward = 43
Episode 30: Total Reward = 43
Episode 31: Total Reward = 43
Episode 32: Total Reward = 43
Episode 33: Total Reward = 43
Episode 34: Total R

2025-03-29 12:04:46,038	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -679
Episode 2: Total Reward = -679
Episode 3: Total Reward = -679
Episode 4: Total Reward = -679
Episode 5: Total Reward = -679
Episode 6: Total Reward = -679
Episode 7: Total Reward = -679
Episode 8: Total Reward = -679
Episode 9: Total Reward = -679
Episode 10: Total Reward = -679
Episode 11: Total Reward = -679
Episode 12: Total Reward = -679
Episode 13: Total Reward = -679
Episode 14: Total Reward = -679
Episode 15: Total Reward = -679
Episode 16: Total Reward = -679
Episode 17: Total Reward = -679
Episode 18: Total Reward = -679
Episode 19: Total Reward = -679
Episode 20: Total Reward = -679
Episode 21: Total Reward = -679
Episode 22: Total Reward = -679
Episode 23: Total Reward = -679
Episode 24: Total Reward = -679
Episode 25: Total Reward = -679
Episode 26: Total Reward = -679
Episode 27: Total Reward = -679
Episode 28: Total Reward = -679
Episode 29: Total Reward = -679
Episode 30: Total Reward = -679
Episode 31: Total Reward = -679
Episode 32: Total

2025-03-29 12:04:52,787	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -364
Episode 2: Total Reward = -364
Episode 3: Total Reward = -364
Episode 4: Total Reward = -364
Episode 5: Total Reward = -364
Episode 6: Total Reward = -364
Episode 7: Total Reward = -364
Episode 8: Total Reward = -364
Episode 9: Total Reward = -364
Episode 10: Total Reward = -364
Episode 11: Total Reward = -364
Episode 12: Total Reward = -364
Episode 13: Total Reward = -364
Episode 14: Total Reward = -364
Episode 15: Total Reward = -364
Episode 16: Total Reward = -364
Episode 17: Total Reward = -364
Episode 18: Total Reward = -364
Episode 19: Total Reward = -364
Episode 20: Total Reward = -364
Episode 21: Total Reward = -364
Episode 22: Total Reward = -364
Episode 23: Total Reward = -364
Episode 24: Total Reward = -364
Episode 25: Total Reward = -364
Episode 26: Total Reward = -364
Episode 27: Total Reward = -364
Episode 28: Total Reward = -364
Episode 29: Total Reward = -364
Episode 30: Total Reward = -364
Episode 31: Total Reward = -364
Episode 32: Total

2025-03-29 12:04:59,493	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Generating a new stochastic demand series...
Episode 1: Total Reward = -342
Episode 2: Generating a new stochastic demand series...
Episode 2: Total Reward = -656
Episode 3: Generating a new stochastic demand series...
Episode 3: Total Reward = -326
Episode 4: Generating a new stochastic demand series...
Episode 4: Total Reward = -512
Episode 5: Generating a new stochastic demand series...
Episode 5: Total Reward = -494
Episode 6: Generating a new stochastic demand series...
Episode 6: Total Reward = -382
Episode 7: Generating a new stochastic demand series...
Episode 7: Total Reward = -593
Episode 8: Generating a new stochastic demand series...
Episode 8: Total Reward = -227
Episode 9: Generating a new stochastic demand series...
Episode 9: Total Reward = -554
Episode 10: Generating a new stochastic demand series...
Episode 10: Total Reward = -321
Episode 11: Generating a new stochastic demand series...
Episode 11: Total Reward = -526
Episode 12: Generating a new stochastic

## rQ heuristics

In [2]:
class RQ_Policy:
    """
    Fixed-quantity reorder-point ((R, Q)) heuristic for inventory control.

    On a review period (every ``review_interval`` time steps) the policy
    compares the pipeline inventory with a reorder point R derived from the
    observed sales and the lead time. If the pipeline inventory is below R it
    orders the fixed quantity Q (``order_quantity``); otherwise it orders nothing.
    """

    def __init__(self, observation_space, action_space, config):
        # Simulation-only policy: nothing is learned. The spaces passed in are
        # superseded by those of the environment built from the config.
        self.env = env_creator(config['env_config'])
        self.observation_space = self.env.agent_observation_space
        self.action_space = self.env.agent_action_space
        self.review_interval = config.get("review_interval", 1)
        self.order_quantity = config.get("order_quantity", 5)
        self.safety_ratio = config.get("safety_ratio", 1.5)

    def compute_actions(self, obs_batch, state_batches=None, **kwargs):
        """
        Return one (R, Q) order per observation in ``obs_batch``.

        Returns:
            A tuple ``(actions, [], {})``: the list of orders, an empty state
            list and an empty info dict.
        """
        # Pass 1: decode every observation (dicts pass through untouched).
        decoded_batch = [self.decode_obs(raw) for raw in obs_batch]
        # Pass 2: only non-dict observations need self.env._parse_state.
        state_batch = []
        for item in decoded_batch:
            if isinstance(item, dict):
                state_batch.append(item)
            else:
                state_batch.append(self.env._parse_state(item))
        # Pass 3: apply the heuristic to each parsed state.
        orders = [self.get_rq_action(state) for state in state_batch]
        return orders, [], {}

    def decode_obs(self, obs):
        """
        Convert a concatenated one-hot observation into MultiDiscrete values.

        Segment lengths come from ``self.observation_space.nvec``; the argmax of
        each segment is the decoded value. Dictionary observations are treated
        as already decoded and returned as-is.
        """
        if isinstance(obs, dict):
            return obs
        values = []
        start = 0
        for width in self.observation_space.nvec:
            stop = start + width
            values.append(np.argmax(obs[start:stop]))
            start = stop
        return np.array(values)

    def get_rq_action(self, obs: dict) -> int:
        """
        Compute the (R, Q) order for a single parsed observation.

        Outside a review period the order is 0. On a review period the reorder
        point is ``mean(sales) * lead_time`` plus a safety stock of
        ``safety_ratio * std(sales) * sqrt(lead_time)``; the fixed quantity is
        ordered when the pipeline inventory falls below it. The result is
        bounded to ``[0, self.env.max_production]``.
        """
        period = obs.get("period", 0)
        # With a review interval of 1 (or less) every period is a review period.
        skip_review = self.review_interval > 1 and period % self.review_interval != 0
        if skip_review:
            return 0

        # Demand statistics taken from the observed sales.
        sales = obs['sales']
        avg_sales = np.mean(sales)
        sales_std = np.std(sales)
        lead = obs['lead_time']

        # Reorder point = expected lead-time demand + safety stock.
        buffer_stock = self.safety_ratio * sales_std * np.sqrt(lead)
        threshold = avg_sales * lead + buffer_stock

        # Pipeline inventory = on hand + upstream backlog + deliveries.
        on_hand_and_pipeline = obs['inventory'] + obs['upstream_backlog'] + np.sum(obs['deliveries'])
        if on_hand_and_pipeline < threshold:
            requested = self.order_quantity
        else:
            requested = 0

        # Keep the order within the valid production range.
        non_negative = max(0, int(requested))
        return min(non_negative, self.env.max_production)

    def learn_on_batch(self, samples):
        """Do nothing (the heuristic has no trainable parameters); return an empty dict."""
        return {}

    def get_weights(self):
        """Return an empty dict, since the heuristic has no weights."""
        return {}

    def set_weights(self, weights):
        """Ignore ``weights``; the heuristic has nothing to load."""
        pass


def evaluate_rq_policy(env_config_name: str, env_config: dict, num_episodes: int = 10):
    """
    Evaluate the RQ_Policy in the inventory management environment.

    For each episode the environment is reset and, for the
    ``"stochastic_demand"`` configuration, a new demand series is generated
    once per episode. Per-episode totals and a final summary line (mean and
    standard deviation of the episode rewards) are printed.

    Args:
        env_config_name: Name of the configuration; used for the stochastic
            demand check and in the printed summary.
        env_config: Environment configuration passed to ``env_creator``. For
            the stochastic case it must provide a ``"demand_fn"`` entry with a
            ``generate_new_series()`` method.
        num_episodes: Number of episodes to simulate.
    """
    # NOTE: Ray is initialised here but not shut down by this function.
    ray.init(ignore_reinit_error=True)
    register_env("InventoryManagementEnv", env_creator)
    np.random.seed(0)

    # Create one environment instance to extract observation/action spaces.
    im_env = env_creator(env_config)
    agent_observation_space = im_env.agent_observation_space
    agent_action_space = im_env.agent_action_space

    # Instantiate the RQ_Policy. Override its internal env with our simulation instance.
    policy_config = {"env_config": env_config}
    rq_policy = RQ_Policy(agent_observation_space, agent_action_space, policy_config)
    rq_policy.env = im_env

    episode_rewards = []
    for ep in range(num_episodes):
        # Reset environment for a new episode.
        im_env.reset()
        if env_config_name == "stochastic_demand":
            print(f"Episode {ep + 1}: Generating a new stochastic demand series...")
            env_config["demand_fn"].generate_new_series()

        episode_reward = 0
        for _ in range(im_env.num_periods):
            # Get the current state of the environment, one observation per stage.
            state_dict = im_env.parse_state(im_env.state_dict)
            obs_batch = [state_dict[f'stage_{i}'] for i in range(im_env.num_stages)]
            # Compute actions using the RQ_Policy.
            actions, _, _ = rq_policy.compute_actions(obs_batch)
            action_dict = {f'stage_{i}': actions[i] for i in range(im_env.num_stages)}
            # Take a step in the environment; only the rewards are needed.
            _, rewards, _, _, _ = im_env.step(action_dict)
            episode_reward += sum(rewards.values())
        episode_rewards.append(episode_reward)
        print(f"Episode {ep + 1}: Total Reward = {episode_reward}")

    episode_reward_std = np.std(episode_rewards)
    avg_reward = np.mean(episode_rewards)
    # Each field is separated explicitly; previously the adjacent f-strings
    # ran the average and the standard deviation together with no separator.
    print(f"env_config_name = {env_config_name}, "
          f"Average Reward over {num_episodes} episodes: {avg_reward:.2f}, "
          f"episode_reward_std = {episode_reward_std:.2f}")

# Script entry point: evaluate the (R, Q) policy on every configuration
# in env_configs, using 100 episodes per configuration.
if __name__ == '__main__':
    for env_config_name, env_config in env_configs.items():
        evaluate_rq_policy(env_config_name, env_config, num_episodes=100)


2025-03-29 12:05:22,499	INFO worker.py:1816 -- Started a local Ray instance.


Episode 1: Total Reward = -207
Episode 2: Total Reward = -207
Episode 3: Total Reward = -207
Episode 4: Total Reward = -207
Episode 5: Total Reward = -207
Episode 6: Total Reward = -207
Episode 7: Total Reward = -207
Episode 8: Total Reward = -207
Episode 9: Total Reward = -207
Episode 10: Total Reward = -207
Episode 11: Total Reward = -207
Episode 12: Total Reward = -207
Episode 13: Total Reward = -207
Episode 14: Total Reward = -207
Episode 15: Total Reward = -207
Episode 16: Total Reward = -207
Episode 17: Total Reward = -207
Episode 18: Total Reward = -207
Episode 19: Total Reward = -207
Episode 20: Total Reward = -207
Episode 21: Total Reward = -207
Episode 22: Total Reward = -207
Episode 23: Total Reward = -207
Episode 24: Total Reward = -207
Episode 25: Total Reward = -207
Episode 26: Total Reward = -207
Episode 27: Total Reward = -207
Episode 28: Total Reward = -207
Episode 29: Total Reward = -207
Episode 30: Total Reward = -207
Episode 31: Total Reward = -207
Episode 32: Total

2025-03-29 12:05:26,343	INFO worker.py:1649 -- Calling ray.init() again after it has already been called.


Episode 1: Total Reward = -319
Episode 2: Total Reward = -424
Episode 3: Total Reward = -374
Episode 4: Total Reward = -294
Episode 5: Total Reward = -445
Episode 6: Total Reward = -310
Episode 7: Total Reward = -296
Episode 8: Total Reward = -427
Episode 9: Total Reward = -377
Episode 10: Total Reward = -321
Episode 11: Total Reward = -400
Episode 12: Total Reward = -446
Episode 13: Total Reward = -284
Episode 14: Total Reward = -485
Episode 15: Total Reward = -422
Episode 16: Total Reward = -371
Episode 17: Total Reward = -298
Episode 18: Total Reward = -320
Episode 19: Total Reward = -382
Episode 20: Total Reward = -414
Episode 21: Total Reward = -347
Episode 22: Total Reward = -298
Episode 23: Total Reward = -406
Episode 24: Total Reward = -354
Episode 25: Total Reward = -359
Episode 26: Total Reward = -330
Episode 27: Total Reward = -333
Episode 28: Total Reward = -338
Episode 29: Total Reward = -427
Episode 30: Total Reward = -376
Episode 31: Total Reward = -340
Episode 32: Total

2025-03-29 12:05:27,388	INFO worker.py:1649 -- Calling ray.init() again after it has already been called.


Episode 1: Total Reward = -472
Episode 2: Total Reward = -202
Episode 3: Total Reward = -521
Episode 4: Total Reward = -482
Episode 5: Total Reward = -227
Episode 6: Total Reward = -343
Episode 7: Total Reward = -274
Episode 8: Total Reward = -603
Episode 9: Total Reward = -537
Episode 10: Total Reward = -405
Episode 11: Total Reward = -258
Episode 12: Total Reward = -363
Episode 13: Total Reward = -439
Episode 14: Total Reward = -140
Episode 15: Total Reward = -217
Episode 16: Total Reward = -256
Episode 17: Total Reward = -361
Episode 18: Total Reward = -515
Episode 19: Total Reward = -524
Episode 20: Total Reward = -306
Episode 21: Total Reward = -600
Episode 22: Total Reward = -436
Episode 23: Total Reward = -298
Episode 24: Total Reward = -211
Episode 25: Total Reward = -390
Episode 26: Total Reward = -491
Episode 27: Total Reward = -190
Episode 28: Total Reward = -262
Episode 29: Total Reward = -305
Episode 30: Total Reward = -553
Episode 31: Total Reward = -412
Episode 32: Total

2025-03-29 12:05:28,458	INFO worker.py:1649 -- Calling ray.init() again after it has already been called.


Episode 1: Total Reward = -311
Episode 2: Total Reward = -302
Episode 3: Total Reward = -312
Episode 4: Total Reward = -369
Episode 5: Total Reward = -359
Episode 6: Total Reward = -276
Episode 7: Total Reward = -343
Episode 8: Total Reward = -294
Episode 9: Total Reward = -186
Episode 10: Total Reward = -337
Episode 11: Total Reward = -282
Episode 12: Total Reward = -334
Episode 13: Total Reward = -288
Episode 14: Total Reward = -285
Episode 15: Total Reward = -287
Episode 16: Total Reward = -281
Episode 17: Total Reward = -226
Episode 18: Total Reward = -311
Episode 19: Total Reward = -342
Episode 20: Total Reward = -352
Episode 21: Total Reward = -290
Episode 22: Total Reward = -281
Episode 23: Total Reward = -332
Episode 24: Total Reward = -312
Episode 25: Total Reward = -273
Episode 26: Total Reward = -274
Episode 27: Total Reward = -317
Episode 28: Total Reward = -270
Episode 29: Total Reward = -267
Episode 30: Total Reward = -222
Episode 31: Total Reward = -330
Episode 32: Total

2025-03-29 12:05:29,454	INFO worker.py:1649 -- Calling ray.init() again after it has already been called.


Episode 1: Total Reward = -27
Episode 2: Total Reward = -63
Episode 3: Total Reward = -74
Episode 4: Total Reward = -81
Episode 5: Total Reward = -234
Episode 6: Total Reward = -262
Episode 7: Total Reward = -176
Episode 8: Total Reward = -122
Episode 9: Total Reward = -72
Episode 10: Total Reward = -37
Episode 11: Total Reward = -176
Episode 12: Total Reward = -145
Episode 13: Total Reward = 6
Episode 14: Total Reward = -145
Episode 15: Total Reward = -54
Episode 16: Total Reward = -310
Episode 17: Total Reward = -122
Episode 18: Total Reward = -143
Episode 19: Total Reward = -84
Episode 20: Total Reward = -193
Episode 21: Total Reward = -202
Episode 22: Total Reward = -161
Episode 23: Total Reward = -129
Episode 24: Total Reward = -159
Episode 25: Total Reward = -70
Episode 26: Total Reward = -133
Episode 27: Total Reward = -187
Episode 28: Total Reward = -148
Episode 29: Total Reward = -151
Episode 30: Total Reward = -173
Episode 31: Total Reward = -51
Episode 32: Total Reward = -13

KeyboardInterrupt: 