# 问题设定

在小车倒立杆（CartPole）游戏中，我们希望通过强化学习训练一个智能体（agent），尽可能不断地左右移动小车，使得小车上的杆不倒，我们首先定义CartPole游戏：

CartPole游戏即是强化学习模型的enviorment，它与agent交互，实时更新state，内部定义了reward function，其中state有以下定义：

state每一个维度分别代表了：

- 小车位置，它的取值范围是-2.4到2.4
- 小车速度，它的取值范围是负无穷到正无穷
- 杆的角度，它的取值范围是-41.8°到41.8°
- 杆的角速，它的取值范围是负无穷到正无穷

action是一个2维向量，每一个维度分别代表向左和向右移动。

$$
action \in \mathbb{R}^2
$$

# DQN

我们将设计一个网络，作为状态-动作值函数（state-action value function），其输入是state，输出是对应各个action的value，并TD（Temporal Difference）进行迭代训练直至收敛。我们将定义两个这样的网络，分别记作$\theta$和$\theta^-$，分别代表估计网络与目标网络。

我们希望最小化：

$$
\left( y_j - Q \left( \phi_j, a_j; \theta \right) \right)^2
$$

其中，$a_j$具有以下形式：

$$
a_j = \mathrm{argmax}_{a} Q \left( \phi(s_j), a; \theta\right)
$$

其中，$y_j$具有以下形式：

$$
f(x)=
\begin{cases}
r_j & \text{if episode ends at j + 1}\\
r_j + \gamma \max_{a^{\prime}} \hat{Q} \left( \phi_{j+1}, a^{\prime}; \theta^{-} \right)& \text{otherwise}
\end{cases}$$



在最小化TD-Error时，我们将固定目标网络，只对估计网络做梯度反向传播，每次到达一定迭代次数后，将估计网络的权重复制到目标网络。在这个过程中，需要用到经验回放（Experience Replay）技术，即将每一次迭代观测到的$s_t, r_t, a_t, s_{t+1}$作为一个元组缓存，然后在这些缓存中随机抽取元组做批次梯度下降。

# 代码实现

In [2]:
# coding=utf-8

import tensorflow as tf
import numpy as np
import gym
import sys

sys.path.append('..')

from base.model import *

%matplotlib inline

  return f(*args, **kwds)


In [3]:
class Agent(BaseRLModel):

    def __init__(self, session, env, a_space, s_space, **options):
        super(Agent, self).__init__(session, env, a_space, s_space, **options)

        self._init_input()
        self._init_nn()
        self._init_op()
        self._init_saver()

        self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))
        self.buffer_count = 0

        self.total_train_step = 0

        self.update_target_net_step = 200

        self.session.run(tf.global_variables_initializer())

    def _init_input(self, *args):
        with tf.variable_scope('input'):
            self.s_n = tf.placeholder(tf.float32, [None, self.s_space])
            self.s = tf.placeholder(tf.float32,   [None, self.s_space])
            self.r = tf.placeholder(tf.float32,   [None, ])
            self.a = tf.placeholder(tf.int32,     [None, ])

    def _init_nn(self, *args):
        with tf.variable_scope('actor_net'):
            # w,b initializer
            w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)
            b_initializer = tf.constant_initializer(0.1)

            with tf.variable_scope('predict_q_net'):
                phi_state = tf.layers.dense(self.s,
                                            32,
                                            tf.nn.relu,
                                            kernel_initializer=w_initializer,
                                            bias_initializer=b_initializer)

                self.q_predict = tf.layers.dense(phi_state,
                                                 self.a_space,
                                                 kernel_initializer=w_initializer,
                                                 bias_initializer=b_initializer)

            with tf.variable_scope('target_q_net'):
                phi_state_next = tf.layers.dense(self.s_n,
                                                 32,
                                                 tf.nn.relu,
                                                 kernel_initializer=w_initializer,
                                                 bias_initializer=b_initializer)

                self.q_target = tf.layers.dense(phi_state_next,
                                                self.a_space,
                                                kernel_initializer=w_initializer,
                                                bias_initializer=b_initializer)

    def _init_op(self):
        with tf.variable_scope('q_real'):
            # size of q_value_real is [BATCH_SIZE, 1]
            max_q_value = tf.reduce_max(self.q_target, axis=1)
            q_next = self.r + self.gamma * max_q_value
            self.q_next = tf.stop_gradient(q_next)

        with tf.variable_scope('q_predict'):
            # size of q_value_predict is [BATCH_SIZE, 1]
            action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval = tf.gather_nd(self.q_predict, action_indices)

        with tf.variable_scope('loss'):
            self.loss_func = tf.reduce_mean(tf.squared_difference(self.q_next, self.q_eval, name='mse'))

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)

        with tf.variable_scope('update_target_net'):
            t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')
            p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')
            self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]

    def predict(self, s):
        if np.random.uniform() < self.epsilon:
            a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))
        else:
            a = np.random.randint(0, self.a_space)
        return a

    def snapshot(self, s, a, r, s_n):
        self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))
        self.buffer_count += 1

    def train(self):
        if self.total_train_step % self.update_target_net_step == 0:
            self.session.run(self.update_q_net)

        batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]

        s = batch[:, :self.s_space]
        s_n = batch[:, -self.s_space:]
        a = batch[:, self.s_space].reshape((-1))
        r = batch[:, self.s_space + 1]

        _, cost = self.session.run([self.train_op, self.loss_func], {
            self.s: s, self.a: a, self.r: r, self.s_n: s_n
        })

    def run(self):
        if self.mode == 'train':
            for episode in range(self.train_episodes):
                s, r_episode = self.env.reset(), 0
                while True:
                    # if episode > 400:
                    #     self.env.render()
                    a = self.predict(s)
                    s_n, r, done, _ = self.env.step(a)
                    if done:
                        r = -5
                    r_episode += r
                    self.snapshot(s, a, r_episode, s_n)
                    s = s_n
                    if done:
                        break
                if self.buffer_count > self.buffer_size:
                    self.train()
                if episode % 200 == 0:
                    self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))
                    self.save()
        else:
            for episode in range(self.eval_episodes):
                s, r_episode = self.env.reset()
                while True:
                    a = self.predict(s)
                    s_n, r, done, _ = self.env.step(a)
                    r_episode += r
                    s = s_n
                    if done:
                        break

In [4]:
def main(_):
    # Make env.
    env = gym.make('CartPole-v0')
    env.seed(1)
    env = env.unwrapped
    # Init session.
    session = tf.Session()
    # Init agent.
    agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{
        KEY_MODEL_NAME: 'DQN',
        KEY_TRAIN_EPISODE: 3000
    })
    agent.run()


In [None]:
main(_)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


Episode: 0 | Rewards: 3.0
Episode: 200 | Rewards: 4.0
Episode: 400 | Rewards: 4.0
Episode: 600 | Rewards: 4.0
Episode: 800 | Rewards: 3.0
Episode: 1000 | Rewards: 3.0
Episode: 1200 | Rewards: 36.0
Episode: 1400 | Rewards: 50.0
Episode: 1600 | Rewards: 31.0
Episode: 1800 | Rewards: 187.0
