In [20]:
import tensorflow
tf = tensorflow.compat.v1
tf.disable_eager_execution()

In [21]:
import gym
import math
import time
import random
import numpy as np
from collections import deque

In [22]:
# Hyper Parameters
ACTOR_LAYER1_SIZE = 256
ACTOR_LAYER2_SIZE = 256
ACTOR_LEARNING_RATE = 0.0001
ACTOR_TAU = 0.001

CRITIC_LAYER1_SIZE = 256
CRITIC_LAYER2_SIZE = 256
CRITIC_LEARNING_RATE = 0.0001
CRITIC_TAU = 0.001
CRITIC_L2 = 0.0001

In [23]:
def weight_variable(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial, name)


def bias_variable(shape, name):
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial, name)

In [24]:
class ActorNetwork(object):
    """ Map: state + limit_load -> action """

    def __init__(self, sess, input_config, load_model, summ_writer):
        self.sess = sess
        self.state_dimension = input_config.state_dimension
        self.action_dimension = input_config.action_dimension
        self.save_iter = input_config.save_iter  # interval of saving log
        self.save_path = input_config.model_save_path + "/actor"  # interval of saving model
        self.log_iter = input_config.log_iter  # logging interval in training phase
        self.log_path = input_config.log_path  # log path
        self.clip_norm = input_config.clip_norm
        self.step = 0

        self.train_writer = summ_writer

        # create actor network
        self.state_input, self.action_output, self.net = self.create_network(self.state_dimension, self.action_dimension)
        # create target actor network
        self.target_state_input, self.target_action_output, self.target_update, self.target_net = self.create_target_network(
            self.state_dimension, self.action_dimension, self.net)
        self.create_training_method()

        self.saver = tf.train.Saver()
        # self.saver = tf.train.Saver(tf.global_variables(scope=scope))
        if load_model:
            # restore actor network
            print('actor network restore weights')
            self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(input_config.load_path))
        else:
            self.sess.run(tf.global_variables_initializer())

        self.update_target()


    def create_training_method(self):
        self.q_gradient_input = tf.placeholder("float", [None, self.action_dimension])
        self.unnormalized_actor_gradients = tf.gradients(self.action_output, self.net, -self.q_gradient_input)
        # self.actor_gradients = list(map(lambda x: tf.div(x, BATCH_SIZE), self.unnormalized_actor_gradients))
        # gradients clip
        # self.actor_gradients, _ = tf.clip_by_global_norm(self.actor_gradients, clip_norm=self.clip_norm)

        # extra_ops = tf.get_collection('actor_parameters_extra_option')
        # apply_op = tf.train.AdamOptimizer(ACTOR_LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net))
        apply_op = tf.train.RMSPropOptimizer(ACTOR_LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net))

        # train_ops = [apply_op] + extra_ops
        # self.optimizer = tf.group(*apply_op)
        self.optimizer = apply_op

        diff = self.action_output - self.target_action_output
        self.mse = tf.reduce_mean(tf.square(diff))
        pretrain_grad = tf.gradients(self.mse, self.net)
        self.pretrain_update = tf.train.AdamOptimizer(ACTOR_LEARNING_RATE).apply_gradients(
            zip(pretrain_grad, self.net))



    def create_network(self, state_dimension, action_dimension):
        ACTOR_LAYER1_SIZE = ACTOR_LAYER1_SIZE
        ACTOR_LAYER2_SIZE = ACTOR_LAYER2_SIZE

        state_input = tf.placeholder("float", [None, state_dimension])

        w1 = self.variable([state_dimension, ACTOR_LAYER1_SIZE], state_dimension)
        b1 = self.variable([ACTOR_LAYER1_SIZE], state_dimension)
        w2 = self.variable([ACTOR_LAYER1_SIZE, ACTOR_LAYER2_SIZE], ACTOR_LAYER1_SIZE)
        b2 = self.variable([ACTOR_LAYER2_SIZE], ACTOR_LAYER1_SIZE)
        w3 = tf.Variable(tf.random_uniform([ACTOR_LAYER2_SIZE, action_dimension], -3e-3, 3e-3))
        b3 = tf.Variable(tf.random_uniform([action_dimension], -3e-3, 3e-3))

        layer1 = tf.nn.relu(tf.matmul(state_input, w1) + b1)
        layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2)
        action_output = tf.sigmoid(tf.matmul(layer2, w3) + b3)
        out_summ = tf.summary.histogram('action_output', action_output)

        w1_summ = tf.summary.histogram('W1', values=w1)
        b1_summ = tf.summary.histogram('b1', values=b1)

        w2_summ = tf.summary.histogram('W2', values=w2)
        b2_summ = tf.summary.histogram('b2', values=b2)

        w3_summ = tf.summary.histogram('W3', values=w3)
        b3_summ = tf.summary.histogram('b3', values=b3)

        self.merged_summ = tf.summary.merge([out_summ, w1_summ, b1_summ, w2_summ, b2_summ, w3_summ, b3_summ])
        # self.merged_summ = tf.summary.merge([out_summ])

        return state_input, action_output, [w1, b1, w2, b2, w3, b3]

    def create_target_network(self, state_dimension, action_dimension, net):
        state_input = tf.placeholder("float", [None, state_dimension])
        ema = tf.train.ExponentialMovingAverage(decay=1 - ACTOR_TAU)
        target_update = ema.apply(net)
        target_net = [ema.average(x) for x in net]

        layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1])
        layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + target_net[3])

        action_output = tf.tanh(tf.matmul(layer2, target_net[4]) + target_net[5])

        return state_input, action_output, target_update, target_net

    def update_target(self):
        self.sess.run(self.target_update)

    def train(self, q_gradient_batch, state_batch):
        train_feed_dict = {
            self.q_gradient_input: q_gradient_batch,
            self.state_input: state_batch
        }
        summ, _ = self.sess.run([self.merged_summ, self.optimizer], feed_dict=train_feed_dict)
        # _ = self.sess.run([self.optimizer], feed_dict=train_feed_dict)

        # save actor network
        if self.step % self.save_iter == 0:
            self.saver.save(self.sess, save_path=self.save_path, global_step=self.step)

        if self.step % self.log_iter == 0:
            self.train_writer.add_summary(summ, global_step=self.step)

        self.step += 1

    def pretrain(self, state, label):
        # cost
        train_feed_dict = {self.state_input: state, self.target_action_output: label}
        _, net, mse = self.sess.run([self.pretrain_update, self.net, self.mse], feed_dict=train_feed_dict)
        # save actor network
        if self.step % self.save_iter == 0:
            self.saver.save(self.sess, save_path=self.save_path, global_step=self.step)

        self.step += 1
        return net, mse

    def actions(self, state_batch):
        return self.sess.run(self.action_output, feed_dict={
            self.state_input: state_batch
        })

    def action(self, state):
        return self.sess.run(self.action_output, feed_dict={
            self.state_input: [state]
        })[0]

    def target_actions(self, state_batch):
        return self.sess.run(self.target_action_output, feed_dict={
            self.target_state_input: state_batch
        })

        # f fan-in size
    def variable(self, shape, f):
        return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f)))

    def save_network(self, episode):
        print('save actor-network...', episode)
        self.saver.save(self.sess, 'saved_actor_networks/' + 'actor-network', global_step=episode)

'''
    def load_network(self):
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_actor_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print "Successfully loaded:", checkpoint.model_checkpoint_path
        else:
            print "Could not find old network weights"

'''


'\n    def load_network(self):\n        self.saver = tf.train.Saver()\n        checkpoint = tf.train.get_checkpoint_state("saved_actor_networks")\n        if checkpoint and checkpoint.model_checkpoint_path:\n            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)\n            print "Successfully loaded:", checkpoint.model_checkpoint_path\n        else:\n            print "Could not find old network weights"\n\n'

In [25]:
class CostCriticNetwork(object):
    def __init__(self, sess, input_config, summ_writer):
        self.time_step = 0
        self.sess = sess
        self.state_dimension = input_config.state_dimension
        self.action_dimension = input_config.action_dimension
        self.clip_norm = input_config.clip_norm
        self.step = 0
        self.log_iter = input_config.log_iter  # logging interval in training phase
        self.log_path = input_config.log_path  # logging interval in training phase

        self.train_writer_cost = summ_writer


        # create cost network
        self.state_input, \
        self.action_input, \
        self.cost_value_output, \
        self.cost_net = self.create_cost_network(self.state_dimension, self.action_dimension)

        # create target cost network (the same structure with cost network)
        self.target_state_input, \
        self.target_action_input, \
        self.target_cost_value_output, \
        self.cost_target_update = self.create_target_cost_network(self.state_dimension, self.action_dimension, self.cost_net)

        self.create_training_method()

        self.sess.run(tf.global_variables_initializer())

        self.update_target()


    def create_training_method(self):
        # Define training optimizer
        self.z_input = tf.placeholder("float", [None, 1])
        weight_decay = tf.add_n([CRITIC_L2 * tf.nn.CRITIC_L2_loss(var) for var in self.cost_net])
        self.cost_cost = tf.reduce_mean(tf.square(self.z_input - self.cost_value_output)) + weight_decay
        self.optimizer = tf.train.AdamOptimizer(CRITIC_LEARNING_RATE).minimize(self.cost_cost)
        self.action_gradients_cost = tf.gradients(self.cost_value_output, self.action_input)

    def create_cost_network(self, state_dimension, action_dimension):
        # the layer size could be changed
        CRITIC_LAYER1_SIZE = CRITIC_LAYER1_SIZE
        CRITIC_LAYER2_SIZE = CRITIC_LAYER2_SIZE

        state_input = tf.placeholder("float", [None, state_dimension])
        action_input = tf.placeholder("float", [None, action_dimension])

        # Input -> Hidden Layer
        w1 = weight_variable([state_dimension, CRITIC_LAYER1_SIZE])
        b1 = bias_variable([CRITIC_LAYER1_SIZE])
        # Hidden Layer -> Hidden Layer + Action
        w2 = weight_variable([CRITIC_LAYER1_SIZE, CRITIC_LAYER2_SIZE])
        w2a = weight_variable([action_dimension, CRITIC_LAYER2_SIZE])
        b2 = bias_variable([CRITIC_LAYER2_SIZE])
        # Hidden Layer -> Output (Q)
        w3 = weight_variable([CRITIC_LAYER2_SIZE, 1])
        b3 = bias_variable([1])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        # Action inserted here
        h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2)

        cost_value_output = tf.matmul(h2, w3) + b3

        return state_input, action_input, cost_value_output, [w1, b1, w2, w2a, b2, w3, b3]

    def create_target_cost_network(self, state_dimension, action_dimension, net):
        state_input = tf.placeholder("float", [None, state_dimension])
        action_input = tf.placeholder("float", [None, action_dimension])

        ema = tf.train.ExponentialMovingAverage(decay=1 - CRITIC_TAU)
        target_update = ema.apply(net)
        target_net = [ema.average(x) for x in net]

        layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1])
        layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4])
        cost_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6])

        return state_input, action_input, cost_value_output, target_update

    def update_target(self):
        self.sess.run(self.cost_target_update)

    def train(self, z_batch, state_batch, action_batch):
        # c_loss_summ = tf.summary.scalar('cost_critic_loss', self.cost_cost)
        # self.merged_cost = tf.summary.merge([c_loss_summ])

        train_feed_dict = {
            self.z_input: z_batch,
            self.state_input: state_batch,
            self.action_input: action_batch
        }
        _, cost_critic_loss, cost_action_grad_norm = \
            self.sess.run([self.optimizer, self.cost_cost, self.action_gradients_cost], train_feed_dict)

        # if self.step % self.log_iter == 0:
        #     self.train_writer_cost.add_summary(merged_summ_cost, global_step=self.step)

        self.step += 1

        return cost_critic_loss, cost_action_grad_norm

    def pretrain(self, z_batch, state_batch, action_batch):
        train_feed_dict = {
            self.z_input: z_batch,
            self.state_input: state_batch,
            self.action_input: action_batch
        }
        _, cost_critic_loss = self.sess.run([self.optimizer, self.cost_cost], train_feed_dict)
        return cost_critic_loss

    def gradients(self, state_batch, action_batch):
        return self.sess.run(self.action_gradients_cost, feed_dict={
            self.state_input: state_batch,
            self.action_input: action_batch
        })[0]

    def target_cost(self, state_batch, action_batch):
        return self.sess.run(self.target_cost_value_output, feed_dict={
            self.target_state_input: state_batch,
            self.target_action_input: action_batch
        })

    def cost_value(self, state_batch, action_batch):
        return self.sess.run(self.cost_value_output, feed_dict={
            self.state_input: state_batch,
            self.action_input: action_batch})

        # f fan-in size
    def variable(self, shape, f):
        return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f)))

'''
    def load_network(self):
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_cost_critic_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print "Successfully loaded:", checkpoint.model_checkpoint_path
        else:
            print "Could not find old network weights"
    def save_network(self,time_step):
        print 'save cost-critic-network...',time_step
        self.saver.save(self.sess, 'saved_cost_critic_networks/' + 'cost-critic-network', global_step = time_step)
'''


'\n    def load_network(self):\n        self.saver = tf.train.Saver()\n        checkpoint = tf.train.get_checkpoint_state("saved_cost_critic_networks")\n        if checkpoint and checkpoint.model_checkpoint_path:\n            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)\n            print "Successfully loaded:", checkpoint.model_checkpoint_path\n        else:\n            print "Could not find old network weights"\n    def save_network(self,time_step):\n        print \'save cost-critic-network...\',time_step\n        self.saver.save(self.sess, \'saved_cost_critic_networks/\' + \'cost-critic-network\', global_step = time_step)\n'

In [26]:
class RewardCriticNetwork(object):
    def __init__(self, sess, input_config, summ_writer):
        self.time_step = 0
        self.sess = sess
        self.state_dimension = input_config.state_dimension
        self.action_dimension = input_config.action_dimension
        self.clip_norm = input_config.clip_norm
        self.step = 0
        self.log_iter = input_config.log_iter  # logging interval in training phase
        self.log_path = input_config.log_path  # logging interval in training phase

        self.train_writer = summ_writer

        # create reward network
        self.state_input, \
        self.action_input, \
        self.reward_value_output, \
        self.net = self.create_reward_network(self.state_dimension, self.action_dimension)

        # create target reward network (the same structure with reward network)
        self.target_state_input, \
        self.target_action_input, \
        self.target_reward_value_output, \
        self.target_update = self.create_target_reward_network(self.state_dimension, self.action_dimension, self.net)

        self.create_training_method()

        self.sess.run(tf.global_variables_initializer())

        self.update_target()

    def create_training_method(self):
        # Define training optimizer
        self.y_input = tf.placeholder("float", [None, 1])
        weight_decay = tf.add_n([CRITIC_L2 * tf.nn.l2_loss(var) for var in self.net])
        self.cost = tf.reduce_mean(tf.square(self.y_input - self.reward_value_output)) + weight_decay
        self.optimizer = tf.train.AdamOptimizer(CRITIC_LEARNING_RATE).minimize(self.cost)
        self.action_gradients = tf.gradients(self.reward_value_output, self.action_input)


    def create_reward_network(self, state_dimension, action_dimension):
        # the layer size could be changed
        layer1_size = CRITIC_LAYER1_SIZE
        layer2_size = CRITIC_LAYER2_SIZE

        state_input = tf.placeholder("float", [None, state_dimension])
        action_input = tf.placeholder("float", [None, action_dimension])

        # Input -> Hidden Layer
        w1 = weight_variable([state_dimension, layer1_size])
        b1 = bias_variable([layer1_size])
        # Hidden Layer -> Hidden Layer + Action
        w2 = weight_variable([layer1_size, layer2_size])
        w2a = weight_variable([action_dimension, layer2_size])
        b2 = bias_variable([layer2_size])
        # Hidden Layer -> Output (Q)
        w3 = weight_variable([layer2_size, 1])
        b3 = bias_variable([1])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        # Action inserted here
        h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2)

        reward_value_output = tf.matmul(h2, w3) + b3

        return state_input, action_input, reward_value_output, [w1, b1, w2, w2a, b2, w3, b3]

    def create_target_reward_network(self, state_dimension, action_dimension, net):
        state_input = tf.placeholder("float", [None, state_dimension])
        action_input = tf.placeholder("float", [None, action_dimension])

        ema = tf.train.ExponentialMovingAverage(decay=1 - CRITIC_TAU)
        target_update = ema.apply(net)
        target_net = [ema.average(x) for x in net]

        layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1])
        layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4])
        reward_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6])

        return state_input, action_input, reward_value_output, target_update

    def update_target(self):
        self.sess.run(self.target_update)

    def train(self, y_batch, state_batch, action_batch):
        # r_loss_summ = tf.summary.scalar('reward_critic_loss', self.cost)
        # self.merged = tf.summary.merge([r_loss_summ])

        train_feed_dict = {
            self.y_input: y_batch,
            self.state_input: state_batch,
            self.action_input: action_batch
        }
        _, reward_critic_loss, reward_action_grad_norm = \
            self.sess.run([self.optimizer, self.cost, self.action_gradients], train_feed_dict)

        # if self.step % self.log_iter == 0:
        #     self.train_writer.add_summary(merged_summ, global_step=self.step)

        self.step += 1

        return reward_critic_loss, reward_action_grad_norm

    def pretrain(self, y_batch, state_batch, action_batch):
        train_feed_dict = {
            self.y_input: y_batch,
            self.state_input: state_batch,
            self.action_input: action_batch
        }
        _, reward_critic_loss = self.sess.run([self.optimizer, self.cost], train_feed_dict)
        return reward_critic_loss

    def gradients(self, state_batch, action_batch):
        return self.sess.run(self.action_gradients, feed_dict={
            self.state_input: state_batch,
            self.action_input: action_batch
        })[0]

    def target_reward(self, state_batch, action_batch):
        return self.sess.run(self.target_reward_value_output, feed_dict={
            self.target_state_input: state_batch,
            self.target_action_input: action_batch
        })

    def reward_value(self, state_batch, action_batch):
        return self.sess.run(self.reward_value_output, feed_dict={
            self.state_input: state_batch,
            self.action_input: action_batch})

        # f fan-in size
    def variable(self, shape, f):
        return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f)))

'''
    def load_network(self):
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_reward_critic_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print "Successfully loaded:", checkpoint.model_checkpoint_path
        else:
            print "Could not find old network weights"
    def save_network(self,time_step):
        print 'save reward-critic-network...',time_step
        self.saver.save(self.sess, 'saved_reward_critic_networks/' + 'reward-critic-network', global_step = time_step)
'''

'\n    def load_network(self):\n        self.saver = tf.train.Saver()\n        checkpoint = tf.train.get_checkpoint_state("saved_reward_critic_networks")\n        if checkpoint and checkpoint.model_checkpoint_path:\n            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)\n            print "Successfully loaded:", checkpoint.model_checkpoint_path\n        else:\n            print "Could not find old network weights"\n    def save_network(self,time_step):\n        print \'save reward-critic-network...\',time_step\n        self.saver.save(self.sess, \'saved_reward_critic_networks/\' + \'reward-critic-network\', global_step = time_step)\n'

====

In [27]:
OUTER_START_POS = 0
OUTER_SIZE = 11
STATE_SIZE = 47
ACTION_SIZE = 51
STATE_START_POS = OUTER_START_POS + OUTER_SIZE
ACTION_START_POS = STATE_START_POS + STATE_SIZE
NEW_STATE_START_POS = ACTION_START_POS + ACTION_SIZE

NOX_POS = 40
STEAM_TEMP_POS = 48
STEAM_PRES_POS = 49
NEG_PRES_POS = 44
LIM_LOAD_POS = 11
LOAD_POS = 46
EFFI_WEIGHT = 0.8

In [28]:
def get_efficiency(state):
    return 0.8  # @todo

def get_emission(state):
    return 0.2  # @todo

In [29]:
def compute_reward(state):
    # coals = get_coals(action)
    efficiency = get_efficiency(state)
    emission = get_emission(state)
    # print('effi', EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission)
    reward = EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission
    if np.mean(reward) > 1:
        print(reward, efficiency, emission)
    return 10*(EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission)


def compute_cost(state):
    # cost 1, 负荷:lim_load ~ limload+25
    cost_load = np.zeros([len(state)])

    # cost 2, 主蒸汽温度:569-10 ~ 569+5
    cost_steam_temp = np.zeros([len(state)])

    # cost 3, 主蒸汽压力:given_pres-0.5 ~ given_pres+0.5
    cost_steam_pres = np.zeros([len(state)])

    return 1/3*cost_load + 1/3*cost_steam_temp + 1/3*cost_steam_pres


def compute_done(state):
    return False

In [30]:
class ReplayBuffer(object):
    """Using explorated data based on simulator"""
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()
        self.real_data = np.load('/Users/xhr/PycharmProjects/Boiler/Simulator/data/replay_buffer.npy')
        nums = len(self.real_data)
        self.num_indices = list(range(nums))
        random.shuffle(self.num_indices)
        self.real_start_indice = 0

    def get_batch(self, batch_size):
        # Randomly sample batch_size examples
        return random.sample(self.buffer, batch_size)

    def get_real_batch(self, batch_size):
        return self.real_data[np.random.choice(self.real_data.shape[0], batch_size, replace=False), :]

    def size(self):
        return self.buffer_size

    def add(self, state, action, reward, cost, new_state, done, mix_ratio):
        experience = (state, action, reward, cost, new_state, done)
        if self.num_experiences < self.buffer_size:
            self.buffer.append(experience)
            for _ in range(mix_ratio):
                s, a, s_, done = self.generate_real()
                r = compute_reward(s)
                c = compute_cost(s)
                d = compute_done(s)
                e = (s, a, r, c, s_, d)
                # print('s-{}-a{}-ns{}'.format(s.shape, a.shape, s_.shape))

                self.buffer.append(e)
            self.num_experiences += 1
        else:
            for _ in range(mix_ratio+1):
                self.buffer.popleft()
            self.buffer.append(experience)
            for _ in range(mix_ratio):
                s, a, s_, done = self.generate_real()
                r = compute_reward(s)
                c = compute_cost(s)
                d = compute_done(s)
                e = (s, a, r, c, s_, d)
                self.buffer.append(e)

    def generate_real(self):
            s = self.real_data[self.real_start_indice, :58]
            a = self.real_data[self.real_start_indice, 58:109]
            s_ = self.real_data[self.real_start_indice, 109:156]
            s_ = np.concatenate([s[:11], s_])
            done = self.real_data[self.real_start_indice, -1]
            self.real_start_indice += 1
            if self.real_start_indice == len(self.real_data):
                self.real_start_indice = 0
            return s, a, s_, done

    def count(self):
        # if buffer is full, return buffer size
        # otherwise, return experience counter
        return self.num_experiences

    def erase(self):
        self.buffer = deque()
        self.num_experiences = 0

noise

In [31]:
class OUNoise:
    """docstring for OUNoise"""
    def __init__(self, action_dimension, mu=0.5, theta=0.4, sigma=0.2, weight_decay=0.9999):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.weight = 1
        self.weight_decay = weight_decay
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def update_weight(self):
        self.weight *= self.weight_decay

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx * self.weight
        return self.state

DDPG

In [32]:
# EPSILON定义一个极小值
EPSILON = 1e-5
# Hyper Parameters:
REPLAY_MEMORY_SIZE = 10000
REPLAY_START_SIZE = 1000
GAMMA = 0.9
COST_EPSILON = 1
DUAL_STEP_SIZE = 0.01
is_grad_inverter = False

In [33]:
class PrimalDualDDPG(object):
    """ Primal Dual Deep Deterministic Policy Gradient Algorithm"""

    def __init__(self, sess, input_config, is_batch_norm, summ_writer=None, load_model=False):
        self.state_dimension = input_config.state_dimension
        self.action_dimension = input_config.action_dimension
        self.dual_lambda = input_config.init_dual_lambda
        self.save_path = input_config.model_save_path
        self.train_display_iter = input_config.train_display_iter
        self.batch_size = input_config.batch_size
        self.gamma = GAMMA
        self.summay_writer = summ_writer

        self.sess = sess
        self.step = 0


        # if is_batch_norm:
        #     self.rewward_critic_network = RewardCriticNetwork(self.sess, self.state_dimension, self.action_dimension)
        #     self.cost_critic_network = CostCriticNetwork(self.sess, self.state_dimension, self.action_dimension)
        #     self.actor_network = ActorNetwork(self.sess, self.state_dimension, self.action_dimension)

        # else:
        self.reward_critic_network = RewardCriticNetwork(self.sess, input_config, self.summay_writer)
        self.cost_critic_network = CostCriticNetwork(self.sess, input_config, self.summay_writer)
        self.actor_network = ActorNetwork(self.sess, input_config, load_model=False, summ_writer=self.summay_writer)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_MEMORY_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dimension)

        # for name in input_config.__dict__:
        #     if isinstance(input_config.__dict__[name], int) or isinstance(input_config.__dict__[name], float):
        #         self.log(f'parameter|input_config_{name}:{input_config.__dict__[name]}')

        # model saver
        self.saver = tf.train.Saver()
        if load_model:
            self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(self.save_path))


    # def __del__(self):
    #     self.logfile.close()
    #
    # def log(self, *args):
    #     self.logfile.write(*args)
    #     self.logfile.write('\n')

    def train(self):
        # print "train step", self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        cost_batch = np.asarray([data[3] for data in minibatch])
        next_state_batch = np.asarray([data[4] for data in minibatch])
        done_batch = np.asarray([data[5] for data in minibatch])

        # Calculate y_batch
        target_action_batch = self.actor_network.target_actions(next_state_batch)
        target_reward_value = self.reward_critic_network.target_reward(next_state_batch, target_action_batch)
        target_cost_value = self.cost_critic_network.target_cost(next_state_batch, target_action_batch)
        y_batch, z_batch = [], []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
                z_batch.append(cost_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * target_reward_value[i])
                z_batch.append(cost_batch[i] + GAMMA * target_cost_value[i])

        y_batch = np.resize(y_batch, [self.batch_size, 1])
        z_batch = np.resize(z_batch, [self.batch_size, 1])

        # Update reward critic by minimizing the loss L
        reward_critic_loss, reward_action_grad_norm = self.reward_critic_network.train(y_batch, state_batch, action_batch)
        # q_value = self.critic_network.get_q_value(state_limit_batch, action_batch)

        # Update cost critic by minimizing the loss L
        cost_critic_loss, cost_action_grad_norm = self.cost_critic_network.train(z_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        if is_grad_inverter:
            action_batch_for_gradients = self.actor_network.actions(state_batch)
            action_batch_for_gradients = self.grad_inv.invert(action_batch_for_gradients, )
        else:
            action_batch_for_gradients = self.actor_network.actions(state_batch)
        print('action_batch_for_gradients', action_batch_for_gradients)
        reward_gradient_batch = self.reward_critic_network.gradients(state_batch, action_batch_for_gradients)
        cost_gradient_batch = self.cost_critic_network.gradients(state_batch, action_batch_for_gradients)
        q_gradient_batch = reward_gradient_batch - self.dual_lambda * cost_gradient_batch
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the dual variable using the sample gradient
        cost_value_batch = self.cost_critic_network.cost_value(state_batch, action_batch_for_gradients)
        cost_limit_batch = np.array([[COST_EPSILON] for _ in range(self.batch_size)])
        self.dual_gradients = np.mean(cost_value_batch - cost_limit_batch)
        self.dual_lambda += DUAL_STEP_SIZE * self.dual_gradients
        self.dual_lambda = np.max([EPSILON, self.dual_lambda])  # ensure dual >= 0

        if self.step % self.train_display_iter == 0:
            print("reward_critic: loss:{:.3f} action_grads_norm:{:.3f} "
                  "| cost_critic: loss:{:.3f} action_grads_norm:{:.3f}"
                  "| q_gradient:{:.3f}".format(
                reward_critic_loss, np.mean(reward_action_grad_norm),
                cost_critic_loss, np.mean(cost_action_grad_norm), np.mean(q_gradient_batch)))
            print("Dual lambda: {}".format(self.dual_lambda))


        # Update the target networks
        self.reward_critic_network.update_target()
        self.cost_critic_network.update_target()
        self.actor_network.update_target()
        self.step += 1

    def noise_action(self, state, episode):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        if episode % 10 == 0:
            self.exploration_noise.update_weight()
        noise_action = action + self.exploration_noise.noise()
        noise_action = np.minimum(np.maximum(noise_action, 0), 1)  # bound action to [0, 1]
        return noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def get_dual_lambda(self):
        return self.dual_lambda

    def perceive(self, state, action, reward, cost, next_state, done, mix_ratio):
        # Store transition (s_t,a_t,r_t,c_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, cost, next_state, done, mix_ratio)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def save_model(self):
        self.saver.save(sess=self.sess, save_path=self.save_path)  #global_step=10,会自动生成名字-10

In [34]:
SIM_REAL_RATIO = 1
MAX_EPISODES = 30000
MAX_EP_STEPS = 10

In [35]:
class input_config():
    batch_size = 32
    init_dual_lambda = 1
    state_dimension = 58
    action_dimension = 51
    clip_norm = 5.
    train_display_iter = 200
    model_save_path = './models/'
    # model_name = "sim_ddpg"
    # logdir = './logs/{}-{}-{}-{:.2f}/'.format(
    #     model_name, MAX_EP_STEPS, SIM_REAL_RATIO, init_dual_lambda)
    # log_path = logdir + 'saved_models/'
    log_path = "logs/nonpre_nonexp_" + str(SIM_REAL_RATIO) + "_pdddpg_summary"
    save_iter = 500
    log_iter = 100

In [36]:
def pre_train_actor_network(agent, epochs=3):
    replay_buffer = agent.replay_buffer

    for epoch in range(epochs):
        step = 0
        while step < 1000:
            minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size)
            step += 1
            state_batch, action_batch, _, _ = convert_to_tuple(minibatch)

            _, mse = agent.actor_network.pretrain(state=state_batch, label=action_batch)

        # display
        if epoch % 1 == 0:
            print('-----------------pre-train actor network-----------------')
            print('epoch = {} mse = {:.4f}'.format(epoch, mse))

# 预训练 reward critical 网络
def pre_train_reward_critic_network(agent, epochs=3):
    replay_buffer = agent.replay_buffer
    for train_times in range(epochs):
        step = 0
        while step < 1000:
            minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size)
            step += 1
            state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch)
            reward_batch = compute_reward(state_batch)

            y_batch = []
            target_action = agent.actor_network.target_actions(next_state_batch)
            target_value = agent.reward_critic_network.target_reward(next_state_batch, target_action)

            for i in range(len(minibatch)):
                y_batch.append(reward_batch[i] + agent.gamma * target_value[i])

            # update critic network
            reward_critic_loss = agent.reward_critic_network.pretrain(y_batch, state_batch, action_batch)

        # display
        if train_times % 1 == 0:
            print('-----------------pre-train reward critic network-----------------')
            print("reward_critic: loss:{:.3f}".format(reward_critic_loss))

# 预训练 cost critical 网络
def pre_train_cost_critic_network(agent, epochs=3):
    replay_buffer = agent.replay_buffer
    step = 0
    for train_times in range(epochs):
        step = 0
        while step < 1000:
            minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size)
            step += 1
            state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch)
            cost_batch = compute_cost(state_batch)

            z_batch = []
            target_action = agent.actor_network.target_actions(next_state_batch)
            target_value = agent.cost_critic_network.target_cost(next_state_batch, target_action)

            for i in range(len(minibatch)):
                z_batch.append(cost_batch[i] + agent.gamma * target_value[i])

            # update critic network
            cost_critic_loss = agent.cost_critic_network.pretrain(z_batch, state_batch, action_batch)

        # display
        if train_times % 1 == 0:
            print('-----------------pre-train cost critic network-----------------')
            print("reward_critic: loss:{:.3f}".format(cost_critic_loss))


def main():
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Set up summary writer
    summary_writer = tf.summary.FileWriter(input_config.log_path)

    tf.reset_default_graph()
    agent_graph = tf.Graph()
    agent_sess = tf.Session(config=config, graph=agent_graph)
    with agent_graph.as_default():
        agent = PrimalDualDDPG(sess=agent_sess, input_config=input_config, is_batch_norm=False, summ_writer=summary_writer)
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            # print(shape)
            # print(len(shape))
            variable_parameters = 1
            for dim in shape:
                # print(dim)
                variable_parameters *= dim.value
            # print(variable_parameters)
            total_parameters += variable_parameters
        print('total parameters: {}'.format(total_parameters))

    # build environment graph
    env_graph = tf.Graph()
    env_sess = tf.Session(config=config, graph=env_graph)
    with env_graph.as_default():
        env = SimulatorEnvironment(sess=env_sess)

    # pre_train
    # pre_train_actor_network(agent=agent, epochs=1)
    # pre_train_reward_critic_network(agent=agent, epochs=1)
    # pre_train_cost_critic_network(agent=agent, epochs=1)
    # agent.actor_network.update_target()
    # agent.reward_critic_network.update_target()
    # agent.cost_critic_network.update_target()

    for episode in range(MAX_EPISODES):
        dual_variable = input_config.init_dual_lambda
        ep_reward = 0
        ep_cost = 0
        state = env.reset()

        for step in range(MAX_EP_STEPS):
            # action = restrictive_action(agent.action(state), episode)
            action = agent.noise_action(state, episode)
            next_state, reward, cost, done = env.step(action)
            ep_reward += reward
            ep_cost += cost
            agent.perceive(state, action, reward, cost, next_state, done, mix_ratio=SIM_REAL_RATIO)
            dual_variable = agent.get_dual_lambda()
            state = next_state
        summary = tf.Summary()
        summary.value.add(tag='Steps_sum_Reward', simple_value=float(ep_reward/MAX_EP_STEPS))
        summary.value.add(tag='Steps_sum_Cost', simple_value=float(ep_cost/MAX_EP_STEPS))
        summary.value.add(tag='Dual_variable', simple_value=float(dual_variable))
        summary_writer.add_summary(summary, episode)

        summary_writer.flush()

        print('Episode:{} | Reward: {:.2f} | Cost: {:.2f}'.format(episode, ep_reward/MAX_EP_STEPS, ep_cost/MAX_EP_STEPS))

        if episode % 100 == 0 and episode >= 100:
            agent.save_model()

    print("-------------save model--------------------")
    agent.save_model()

    summary_writer.close()

In [37]:
main()

TypeError: weight_variable() missing 1 required positional argument: 'name'