In [1]:
import os
import random
import re
import shutil
import sys
import time
from collections import deque
from copy import deepcopy

import grid2op
import numpy as np
import tensorflow as tf
from grid2op.Agent import AgentWithConverter, BaseAgent

In [4]:
class DQN_Model(tf.keras.Model):
    """Fully connected Q-network: three ReLU hidden layers followed by a linear Q-value head.

    The network maps an observation vector to one Q-value per action.
    """

    def __init__(self, act_dim):
        """Create the layers.

        Args:
            act_dim: length of the action space (number of output units).
        """
        super().__init__()
        self.dense_cells = 1500
        self.act_dim = act_dim
        self.dense1 = tf.keras.layers.Dense(units=self.dense_cells, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units=self.dense_cells, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(units=self.dense_cells, activation=tf.nn.relu)
        # Linear head (no activation). The outputs are regressed onto the TD target
        # r + gamma * max Q, which is unbounded; a softmax here would squash every
        # output into [0, 1] and force them to sum to 1, so the target could never be fit.
        self.dense4 = tf.keras.layers.Dense(units=self.act_dim, activation=None)

    def call(self, inputs):
        """Forward pass: return the Q-values for `inputs` (last axis = observation features)."""
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dense4(x)
        return x

    def predict(self, inputs):
        """Return the index of the greedy action for a single observation.

        Args:
            inputs: one observation, batched so that the argmax yields exactly one element.

        Returns:
            The action index as a Python int.

        NOTE(review): this shadows `tf.keras.Model.predict`; the name is kept because
        callers in this notebook rely on it.
        """
        q_values = self(inputs)
        return int(tf.argmax(q_values, axis=-1))

In [5]:
def array2action(env, array):
    """Turn a flat action vector into an action object using `env.action_space.from_vect`."""
    return env.action_space.from_vect(array)

In [10]:
# --- DQN hyper-parameters -----------------------------------------------------
learning_rate = 10e-5             # Adam step size. NOTE(review): 10e-5 == 1e-4; confirm 1e-5 was not intended
initial_epsilon = 0.2             # exploration rate at the start of training
final_epsilon = 0.01              # floor of the exploration rate
num_episodes = 1000               # total number of training episodes
batch_size = 64                   # transitions sampled per gradient step
gamma = 0.90                      # discount factor
# num_exploration_episodes = np.floor(num_episodes/5)
num_exploration_episodes = 200    # episodes over which epsilon is decayed linearly

# --- Dataset location ---------------------------------------------------------
# The original notebook hard-coded one user's site-packages/grid2op/data directory.
# Resolve the same directory from the installed grid2op package instead, so the
# notebook runs on any machine; both paths can be overridden via environment variables.
_GRID2OP_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(grid2op.__file__)), 'data')
DATA_PATH = os.environ.get(
    'GRID2OP_DATA_PATH', os.path.join(_GRID2OP_DATA_DIR, 'l2rpn_case14_sandbox'))
SCENARIO_PATH = os.environ.get(
    'GRID2OP_SCENARIO_PATH', os.path.join(DATA_PATH, 'chronics'))


In [11]:
if __name__ == '__main__':
    # DQN training script: epsilon-greedy interaction with the grid2op environment,
    # a bounded replay buffer, and one gradient step per environment step once the
    # buffer holds at least `batch_size` transitions.

    # Build the environment ONCE. The original re-created it at the start of every
    # episode without closing the previous instance, which leaked environments and
    # repeated the expensive construction for no benefit.
    env = grid2op.make(dataset=DATA_PATH, chronics_path=SCENARIO_PATH)
    # Keep only the chronics whose path matches the pattern and discard the others
    # (see the grid2op documentation on chronics filtering).
    env.chronics_handler.set_filter(lambda path: re.match(".*00[0-9].*", path) is not None)
    kept = env.chronics_handler.reset()

    all_actions = env.action_space.get_all_unitary_topologies_change(env.action_space)
    num_actions = len(all_actions)
    model = DQN_Model(act_dim=num_actions)
    summary_writer = tf.summary.create_file_writer('./tensorboard/')
    checkpoint = tf.train.Checkpoint(myModel=model)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    replay_buffer = deque(maxlen=10000)
    epsilon = initial_epsilon
    stp = 0      # global step counter across all episodes (used as the TensorBoard step)
    loss = 0

    for episode in range(num_episodes):
        print('====================================')
        cur_time = time.time()
        # Linearly decay the exploration rate, never going below final_epsilon.
        epsilon = max(
            initial_epsilon * (num_exploration_episodes - episode) / num_exploration_episodes,
            final_epsilon)
        env.chronics_handler.shuffle()
        # Play every kept chronic once per episode.
        for chronic in range(len(kept)):
            SCN_STEP = 0   # steps survived in the current chronic

            env.reset()
            dst_step = 0
            print('Scenario 为 [%s]' % (env.chronics_handler.get_name()))
            env.fast_forward_chronics(dst_step)
            obs, done = env.get_obs(), False
            # Vectorise each observation exactly once; shape (1, n_features).
            obs_vec = obs.to_vect().reshape(1, -1)
            while not done:
                stp += 1
                SCN_STEP += 1
                if random.random() < epsilon:
                    # Draw the index directly: random.choice + list.index needed a
                    # linear scan with action-equality comparisons to recover it.
                    action_idx = random.randrange(num_actions)
                else:
                    action_idx = model.predict(obs_vec)
                action = all_actions[int(action_idx)]
                obs_, reward, done, _ = env.step(action)
                next_vec = obs_.to_vect().reshape(1, -1)
                # reward = -10 if done else reward
                with summary_writer.as_default():
                    tf.summary.scalar("reward", reward, step=stp)
                # Store the transition in the replay buffer.
                replay_buffer.append((obs_vec, action_idx, reward, next_vec, 1 if done else 0))
                obs = obs_
                obs_vec = next_vec

                if done:
                    print("episode: %4d, epsilon %.4f"%(episode, epsilon))
                    # print(loss)
                    print(SCN_STEP)
                    with summary_writer.as_default():
                        tf.summary.scalar("RUN_STEPS",SCN_STEP,step=stp)
                    break

                if len(replay_buffer) >= batch_size:
                    # Sample a random mini-batch of transitions from the replay buffer.
                    obs_batch, action_idx_batch, reward_batch, next_obs_batch, done_batch = \
                        map(np.array, zip(*random.sample(replay_buffer, batch_size)))
                    # Explicit float32 so the target arithmetic does not depend on
                    # implicit numpy/TensorFlow dtype conversion.
                    reward_batch = reward_batch.astype(np.float32)
                    done_batch = done_batch.astype(np.float32)

                    # Stored observations are (1, n), so the model output is
                    # (batch, 1, n_actions); drop only that singleton axis.
                    q_next = tf.squeeze(model(next_obs_batch), axis=1)
                    # TD target; the bootstrap term is zeroed for terminal transitions.
                    y = reward_batch + (gamma * tf.reduce_max(q_next, axis=1)) * (1 - done_batch)

                    with tf.GradientTape() as tape:
                        q_all = tf.squeeze(model(obs_batch), axis=1)
                        # Q-value of the action that was actually taken.
                        q_taken = tf.reduce_sum(
                            q_all * tf.one_hot(action_idx_batch, depth=num_actions), axis=1)
                        loss = tf.keras.losses.mean_squared_error(y_true=y, y_pred=q_taken)
                    grads = tape.gradient(loss, model.trainable_variables)
                    with summary_writer.as_default():
                        tf.summary.scalar("loss", loss, step=stp)
                    optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
        # Wall-clock seconds spent on this episode.
        print(time.time()-cur_time)
        # Model saving is disabled; call model.save(filepath=...) here to re-enable it.


Scenario 为 [0001]
episode:    0, epsilon 0.2000
20
Scenario 为 [0000]
episode:    0, epsilon 0.2000
14
Scenario 为 [0002]
episode:    0, epsilon 0.2000
2
1.0150890350341797
Scenario 为 [0002]
episode:    1, epsilon 0.1990
35
Scenario 为 [0000]
episode:    1, epsilon 0.1990
12
Scenario 为 [0001]
episode:    1, epsilon 0.1990
2
2.2475950717926025
Scenario 为 [0001]
episode:    2, epsilon 0.1980
16
Scenario 为 [0000]
episode:    2, epsilon 0.1980
17
Scenario 为 [0002]
episode:    2, epsilon 0.1980
1
2.5257818698883057
Scenario 为 [0001]
episode:    3, epsilon 0.1970
15
Scenario 为 [0002]
episode:    3, epsilon 0.1970
21
Scenario 为 [0000]
episode:    3, epsilon 0.1970
10
3.478698968887329
Scenario 为 [0001]
episode:    4, epsilon 0.1960
5
Scenario 为 [0002]
episode:    4, epsilon 0.1960
5
Scenario 为 [0000]
episode:    4, epsilon 0.1960
1
0.8369920253753662
Scenario 为 [0002]
episode:    5, epsilon 0.1950
11
Scenario 为 [0000]
episode:    5, epsilon 0.1950
19
Scenario 为 [0001]
episode:    5, epsilon 0.19

KeyboardInterrupt: 

In [12]:
class DQNAgent(BaseAgent):
    """Agent that does nothing (plus line reconnection) while the grid is safe and
    queries the DQN model once the maximum line loading reaches the threshold."""

    # Max `rho` below which the grid is treated as safe and the DQN is not consulted.
    RHO_THRESHOLD = 0.999

    def __init__(self, env, action_space, model):
        """
        Args:
            env: environment whose `action_space()` is used to build the do-nothing action.
            action_space: indexable collection of candidate actions; the DQN output
                index selects from it.
            model: object exposing `predict(obs_vector) -> action index`.
        """
        super(DQNAgent, self).__init__(action_space=action_space)
        self.env = env
        self.actions = action_space
        self.dqn_model = model

    def find_best_line_to_reconnect(self, obs, original_action):
        """Add to `original_action` the line reconnection that most reduces the simulated max rho.

        The returned action is therefore the original action merged with (at most)
        one `set_line_status` reconnection. `original_action` is updated in place
        when a beneficial reconnection is found.

        Args:
            obs: current observation (reads `line_status`, `time_before_cooldown_line`,
                `rho`, and calls `simulate`).
            original_action: action to extend.

        Returns:
            `original_action`, possibly extended with one line reconnection.
        """
        disconnected_lines = np.where(np.logical_not(obs.line_status))[0]
        if disconnected_lines.size == 0:
            return original_action
        if (obs.time_before_cooldown_line[disconnected_lines] > 0).all():
            # Every disconnected line is still in cooldown: nothing can be reconnected.
            return original_action

        # Baseline: max loading if the original action is applied unchanged.
        baseline_obs, _, _, _ = obs.simulate(original_action)
        min_rho = baseline_obs.rho.max()
        line_to_reconnect = -1
        for line in disconnected_lines:
            if obs.time_before_cooldown_line[line]:
                continue
            reconnect_array = np.zeros_like(obs.rho)
            reconnect_array[line] = 1
            reconnect_action = deepcopy(original_action)
            reconnect_action.update({'set_line_status': reconnect_array})
            sim_obs, _, _, sim_info = obs.simulate(reconnect_action)
            # The original called `self.is_legal(...)`, which this class never defines.
            # NOTE(review): BaseAgent does not appear to provide it either, so rely on
            # the simulator's own legality flag instead; confirm against the installed
            # grid2op version.
            if sim_info.get('is_illegal', False):
                continue
            candidate_rho = sim_obs.rho.max()
            if candidate_rho < min_rho:
                line_to_reconnect = line
                min_rho = candidate_rho

        if line_to_reconnect != -1:
            reconnect_array = np.zeros_like(obs.rho)
            reconnect_array[line_to_reconnect] = 1
            original_action.update({'set_line_status': reconnect_array})
        return original_action

    def act(self, observation, done=False):
        """Choose the action for `observation`.

        Below the rho threshold: do-nothing action, merged with the best line
        reconnection if one helps. Otherwise: the action picked by the DQN model.

        NOTE(review): the signature differs from the usual
        `BaseAgent.act(observation, reward, done)`; it is kept as-is for the callers
        in this notebook.
        """
        if observation.rho.max() < self.RHO_THRESHOLD:
            do_nothing = self.env.action_space()
            return self.find_best_line_to_reconnect(observation, do_nothing)
        # Use the model handed to the constructor rather than a global `model`.
        obs_vec = observation.to_vect().reshape(1, -1)
        action_idx = self.dqn_model.predict(obs_vec)
        return self.actions[int(action_idx)]

In [13]:
# Build a fresh (unfiltered) environment for evaluation and wrap the trained model
# in the agent. DATA_PATH / SCENARIO_PATH come from the hyper-parameter cell, so the
# absolute paths are no longer duplicated here.
env = grid2op.make(dataset=DATA_PATH, chronics_path=SCENARIO_PATH)
all_actions = env.action_space.get_all_unitary_topologies_change(env.action_space)
DQNA = DQNAgent(env, all_actions, model)

In [14]:
# Evaluate the agent over 30 episodes, recording per episode the number of steps,
# the cumulative reward and the average reward per step, plus the total wall time.
steps, tt_reward, av_reward = [], [], []
old_t = time.time()
for i in range(30):
    print(i)
    done = False            # episode-finished flag
    time_step = int(0)      # steps taken in this episode
    cum_reward = 0.
    obs = env.reset()       # start a new episode
    reward = env.reward_range[0]
    max_iter = 8064         # hard cap on the number of steps per episode
    while True:
        act = DQNA.act(observation=obs)           # ask the agent for an action
        obs, reward, done, info = env.step(act)   # apply it to the power grid
        cum_reward = cum_reward + reward
        time_step = time_step + 1
        if done or time_step >= max_iter:
            break
    steps.append(time_step)
    tt_reward.append(cum_reward)
    av_reward.append(cum_reward/time_step)
ttt = time.time()-old_t


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [15]:
# Number of environment steps taken in each evaluation episode.
# NOTE(review): every episode reports the same value; presumably the chronic length, confirm.
steps

[575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575,
 575]

In [16]:
# Print the layer-by-layer parameter counts of the Q-network.
model.summary()

Model: "dqn__model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             multiple                  670500    
_________________________________________________________________
dense_13 (Dense)             multiple                  2251500   
_________________________________________________________________
dense_14 (Dense)             multiple                  2251500   
_________________________________________________________________
dense_15 (Dense)             multiple                  282188    
Total params: 5,455,688
Trainable params: 5,455,688
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Mean, over the evaluation episodes, of the average reward per step.
tf.reduce_mean(av_reward)


<tf.Tensor: shape=(), dtype=float64, numpy=792.3049238918138>

In [18]:
# Wall-clock seconds spent in the evaluation loop.
ttt

157.4586501121521