In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [2]:
class BusbarIESEnv(gym.Env):
    """
    一个简化的母线制电热综合能源系统环境 (IES)
    - State: [时刻(0-23), 电价, 电负荷, 热负荷]
    - Action: [CHP出力(0-1), EB出力(0-1)] (连续动作)
    """

    def __init__(self):
        super(BusbarIESEnv, self).__init__()

        # --- 定义动作空间和状态空间 ---
        # 动作空间: 2个连续动作，均归一化到[-1, 1]
        self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

        # 状态空间: 4个连续值
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0]),
            high=np.array([23, 1.5, 500, 500]),  # 假设电价、负荷上限
            shape=(4,),
            dtype=np.float32,
        )

        # --- 系统参数 (简化) ---
        self.gas_price = 0.35  # 元/kWh
        self.chp_max_power = 300  # kW
        self.chp_elec_efficiency = 0.4  # 发电效率
        self.chp_heat_efficiency = 0.5  # 余热回收效率
        self.eb_max_power = 200  # kW
        self.eb_efficiency = 0.98

        # --- 模拟数据 (简化) ---
        # 预设一天24小时的负荷和电价曲线
        self.elec_prices = np.array(
            [
                0.4,
                0.4,
                0.4,
                0.4,
                0.4,
                0.7,
                0.7,
                1.2,
                1.2,
                1.2,
                0.7,
                0.7,
                1.2,
                1.2,
                0.7,
                0.7,
                0.7,
                1.2,
                1.2,
                1.2,
                0.7,
                0.7,
                0.4,
                0.4,
            ]
        )
        self.elec_loads = np.array(
            [
                100,
                100,
                100,
                110,
                120,
                150,
                200,
                300,
                320,
                350,
                300,
                250,
                220,
                220,
                250,
                280,
                330,
                400,
                420,
                380,
                300,
                250,
                180,
                120,
            ]
        )
        self.heat_loads = np.array(
            [
                150,
                150,
                140,
                140,
                160,
                180,
                220,
                280,
                300,
                300,
                280,
                250,
                230,
                220,
                230,
                260,
                300,
                350,
                360,
                320,
                280,
                240,
                200,
                160,
            ]
        )

        self.current_step = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        obs = self._get_obs()
        info = {}
        return obs, info

    def step(self, action):
        # --- 1. 将归一化的动作转换为实际物理值 ---
        # tanh激活函数输出在[-1, 1], 我们把它映射到[0, 1]
        chp_output_ratio = (action[0] + 1) / 2
        eb_output_ratio = (action[1] + 1) / 2

        # --- 2. 计算能源产出 ---
        chp_gas_consumption = (
            self.chp_max_power * chp_output_ratio
        ) / self.chp_elec_efficiency
        chp_elec_production = self.chp_max_power * chp_output_ratio
        chp_heat_production = chp_gas_consumption * self.chp_heat_efficiency

        eb_elec_consumption = self.eb_max_power * eb_output_ratio
        eb_heat_production = eb_elec_consumption * self.eb_efficiency

        # --- 3. 进行能量平衡计算 ---
        # 热平衡 (热负荷必须满足，不足则惩罚)
        total_heat_production = chp_heat_production + eb_heat_production
        heat_load = self.heat_loads[self.current_step]
        heat_mismatch = heat_load - total_heat_production

        # 电平衡
        elec_load = self.elec_loads[self.current_step]
        total_elec_consumption = elec_load + eb_elec_consumption
        elec_from_grid = total_elec_consumption - chp_elec_production

        # --- 4. 计算成本和奖励 ---
        grid_price = self.elec_prices[self.current_step]

        # a. 能源成本
        gas_cost = chp_gas_consumption * self.gas_price
        elec_cost = max(0, elec_from_grid) * grid_price  # 只有买电才花钱

        # b. 售电收入
        elec_revenue = -min(0, elec_from_grid) * grid_price * 0.8  # 卖电收入打8折

        # c. 热负荷惩罚 (关键!)
        heat_penalty = max(0, heat_mismatch) * 999  # 严重惩罚供热不足

        # d. 总奖励 (目标是最大化奖励，所以成本和惩罚是负的)
        reward = elec_revenue - gas_cost - elec_cost - heat_penalty

        # --- 5. 更新状态 ---
        self.current_step += 1
        done = self.current_step >= 23  # 一天24小时结束

        obs = self._get_obs()
        info = {}  # 可以用来返回调试信息

        # Gymnasium API返回5个值
        return obs, reward, done, False, info

    def _get_obs(self):
        # 获取当前状态
        hour = self.current_step
        price = self.elec_prices[hour]
        elec_load = self.elec_loads[hour]
        heat_load = self.heat_loads[hour]
        return np.array([hour, price, elec_load, heat_load], dtype=np.float32)


In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# 1. 创建并向量化环境
# 向量化是PPO等算法高效采样的关键
vec_env = make_vec_env(BusbarIESEnv, n_envs=4)  # 使用4个并行环境来加速采样

# 2. 创建PPO模型
# "MlpPolicy" 表示使用多层感知机作为Actor和Critic的网络结构
model = PPO("MlpPolicy", vec_env, verbose=1, tensorboard_log="./ppo_ies_tensorboard/")

# 3. 开始训练！
# SB3会自动处理PPO的“收集数据->更新->丢弃数据”的循环
print("--- 开始训练PPO模型 ---")
model.learn(total_timesteps=500_000)
print("--- 训练完成 ---")

# 4. 评估训练好的模型
print("\n--- 评估训练好的策略 ---")
eval_env = BusbarIESEnv()
obs, _ = eval_env.reset()
total_reward = 0
for day in range(3):  # 模拟评估3天
    obs, _ = eval_env.reset()
    done = False
    while not done:
        # deterministic=True表示在评估时不进行随机探索
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, _, info = eval_env.step(action)
        total_reward += reward
print(f"3天模拟的总奖励(越高越好，代表成本越低): {total_reward:.2f}")


Using cpu device
--- 开始训练PPO模型 ---
Logging to ./ppo_ies_tensorboard/PPO_2


  gym.logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23       |
|    ep_rew_mean     | -1.1e+06 |
| time/              |          |
|    fps             | 10660    |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 8192     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 23            |
|    ep_rew_mean          | -1.07e+06     |
| time/                   |               |
|    fps                  | 3515          |
|    iterations           | 2             |
|    time_elapsed         | 4             |
|    total_timesteps      | 16384         |
| train/                  |               |
|    approx_kl            | 0.00054356747 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.84         |
|    explained_variance   | 7.75e-07      |
