In [10]:
!pip list | grep paddlepaddle
!pip list | grep parl

paddlepaddle        1.6.3
paddlepaddle-gpu    1.8.3.post107
parl                1.3.1


In [9]:
pip install --upgrade pip

Collecting pip
  Downloading pip-20.2-py2.py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 2.3 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.1.1
    Uninstalling pip-20.1.1:
      Successfully uninstalled pip-20.1.1
Successfully installed pip-20.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import gym
import numpy as np

import paddle.fluid as fluid
import parl
from parl import layers
from parl.utils import logger

In [3]:
# Learning rate handed to PolicyGradient (lr=LEARNING_RATE), which passes it
# to fluid.optimizer.Adam.
LEARNING_RATE = 1e-3

In [4]:
class Model(parl.Model):
    """Two-layer fully connected policy network.

    The hidden layer has ``act_dim * 10`` units with tanh activation and the
    output layer has ``act_dim`` units with softmax activation.
    """

    def __init__(self, act_dim):
        """Create the two fully connected layers.

        Args:
            act_dim (int): size of the softmax output layer.
        """
        hidden_units = act_dim * 10

        self.fc1 = layers.fc(size=hidden_units, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='softmax')

    def forward(self, obs):
        """Apply fc1 and then fc2 to ``obs`` and return the result.

        Per the original author's note, an instance can be called directly,
        e.g. ``model = Model(5); model(obs)``.
        """
        hidden = self.fc1(obs)
        return self.fc2(hidden)

In [5]:
class PolicyGradient(parl.Algorithm):
    """Policy-gradient algorithm built around a policy model."""

    def __init__(self, model, lr=None):
        """Store the policy network and the learning rate.

        Args:
            model (parl.Model): forward network of the policy.
            lr (float): learning rate; an AssertionError is raised when it
                is not a float (including the default ``None``).
        """
        self.model = model
        assert isinstance(lr, float)
        self.lr = lr

    def predict(self, obs):
        """Return the policy model's output (action probabilities) for obs."""
        return self.model(obs)

    def learn(self, obs, action, reward):
        """Add the policy-gradient update ops and return the mean cost.

        The cost is mean(-log(prob of the taken action) * reward), minimized
        with Adam at learning rate ``self.lr``.
        """
        act_prob = self.model(obs)  # action probabilities from the policy

        # Negative log-probability of the taken action: the one-hot mask
        # zeroes every other column before the row-wise sum.
        neg_log = -1.0 * layers.log(act_prob)
        taken_mask = layers.one_hot(action, act_prob.shape[1])
        neg_log_prob = layers.reduce_sum(neg_log * taken_mask, dim=1)

        cost = layers.reduce_mean(neg_log_prob * reward)

        fluid.optimizer.Adam(self.lr).minimize(cost)
        return cost

In [6]:
class Agent(parl.Agent):
    """Agent that builds and runs the Paddle programs for acting and learning.

    Two separate programs are built: ``pred_program`` maps an observation to
    action probabilities, and ``learn_program`` performs one policy update.
    ``self.alg`` and ``self.fluid_executor`` are not defined in this class;
    presumably they are provided by ``parl.Agent`` (confirm against PARL docs).
    """

    def __init__(self, algorithm, obs_dim, act_dim):
        """
        Args:
            algorithm: algorithm object forwarded to the parl.Agent constructor.
            obs_dim (int): length of one observation vector.
            act_dim (int): number of discrete actions to choose from.
        """
        # Dimensions are stored before the base constructor runs because
        # build_program() reads them (presumably it is invoked from
        # parl.Agent.__init__ -- TODO confirm).
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        """Define the prediction and learning programs and their inputs."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Graph used to predict action probabilities from an observation.
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs)

        # Graph used to update the policy network.
        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs, act, reward)

    def _infer_act_prob(self, obs):
        """Run pred_program on a single observation; return its action probabilities."""
        batch = np.expand_dims(obs, axis=0)  # add a leading batch axis
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batch.astype('float32')},
            fetch_list=[self.act_prob])[0]
        return np.squeeze(fetched, axis=0)  # drop the batch axis again

    def sample(self, obs):
        """Draw an action at random according to the predicted probabilities."""
        act_prob = self._infer_act_prob(obs)
        return np.random.choice(range(self.act_dim), p=act_prob)

    def predict(self, obs):
        """Return the index of the action with the highest predicted probability."""
        act_prob = self._infer_act_prob(obs)
        return np.argmax(act_prob)

    def learn(self, obs, act, reward):
        """Run one update of learn_program and return the fetched cost.

        Args:
            obs: batch of observations (array-like, cast to float32).
            act: batch of action indices (array-like, cast to int64).
            reward: per-step reward weights (array-like, cast to float32).
        """
        # Trailing axis so each action matches the 'act' input (shape=[1]).
        act = np.expand_dims(act, axis=-1)
        # np.asarray lets plain Python lists be fed as well as ndarrays.
        feed = {
            'obs': np.asarray(obs, dtype='float32'),
            'act': act.astype('int64'),
            'reward': np.asarray(reward, dtype='float32'),
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
        return cost

In [7]:
def run_episode(env, agent):
    """Play one episode, choosing actions with ``agent.sample``.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            must return a 4-tuple ``(obs, reward, done, info)``.
        agent: object exposing ``sample(obs)``.

    Returns:
        tuple: ``(obs_list, action_list, reward_list)`` with one entry per
        step. The observation stored for a step is the one the action was
        chosen from, so the final observation is not included.
    """
    observations, actions, rewards = [], [], []
    current_obs = env.reset()
    finished = False
    while not finished:
        observations.append(current_obs)
        chosen = agent.sample(current_obs)  # stochastic action
        actions.append(chosen)

        current_obs, step_reward, finished, _ = env.step(chosen)
        rewards.append(step_reward)
    return observations, actions, rewards

def evaluate(env, agent, render=False, episodes=5):
    """Evaluate the agent and return the mean total reward per episode.

    Actions are chosen with ``agent.predict``.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is true, ``render()``.
        agent: object exposing ``predict(obs)``.
        render (bool): call ``env.render()`` after every step.
        episodes (int): number of evaluation episodes to average over
            (defaults to 5, the previously hard-coded value).

    Returns:
        The mean of the per-episode reward sums (``np.mean``).

    Raises:
        ValueError: if ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError('episodes must be >= 1, got {}'.format(episodes))

    eval_reward = []
    for _ in range(episodes):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # greedy action
            obs, reward, is_over, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if is_over:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)

In [11]:
def calc_reward_to_go(reward_list, gamma=1.0):
    """Compute the reward-to-go G_t for every step of one episode.

    Uses the backward recursion G_t = r_t + gamma * G_{t+1}, starting from
    the last step (whose G_t is its own reward).

    Args:
        reward_list (list): per-step rewards of one episode, in time order.
            The list is not modified.
        gamma (float): discount factor.

    Returns:
        np.ndarray: G_t for each step, same length as ``reward_list``.
    """
    # Work on a copy so the caller keeps the raw per-step rewards.
    returns = list(reward_list)
    for t in range(len(returns) - 2, -1, -1):
        returns[t] += gamma * returns[t + 1]
    return np.array(returns)


# Create the environment and read the observation / action sizes from it
env = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

# Build the agent with the PARL framework: Model -> PolicyGradient -> Agent
model = Model(act_dim=act_dim)
alg = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

# Load a saved model (disabled).
# NOTE(review): the run_episode call below passes train_or_test/render
# keyword arguments that the run_episode(env, agent) defined in this
# notebook does not accept -- fix the call before re-enabling this block.
# if os.path.exists('./model.ckpt'):
#     agent.restore('./model.ckpt')
#     run_episode(env, agent, train_or_test='test', render=True)
#     exit()

# Train for 1000 episodes, with one policy update per sampled episode
for i in range(1000):
    obs_list, action_list, reward_list = run_episode(env, agent)
    # Log the summed reward of the sampled episode every 10 episodes
    if i % 10 == 0:
        logger.info("Episode {}, Reward Sum {}.".format(
            i, sum(reward_list)))

    # Turn the episode into arrays; the rewards become per-step reward-to-go
    batch_obs = np.array(obs_list)
    batch_action = np.array(action_list)
    batch_reward = calc_reward_to_go(reward_list)

    agent.learn(batch_obs, batch_action, batch_reward)
    # Evaluate with greedy actions every 100 episodes
    if (i + 1) % 100 == 0:
        total_reward = evaluate(env, agent, render=False) # render=True shows the rendering; it has to run locally, AIStudio cannot display it
        logger.info('Test reward: {}'.format(total_reward))

[32m[08-03 16:30:58 MainThread @<ipython-input-11-6b5a2e549c1a>:13][0m obs_dim 4, act_dim 2
[32m[08-03 16:30:58 MainThread @machine_info.py:86][0m nvidia-smi -L found gpu count: 1
[32m[08-03 16:30:58 MainThread @machine_info.py:86][0m nvidia-smi -L found gpu count: 1
[32m[08-03 16:30:58 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 0, Reward Sum 10.0.
[32m[08-03 16:30:59 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 10, Reward Sum 19.0.
[32m[08-03 16:30:59 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 20, Reward Sum 28.0.
[32m[08-03 16:30:59 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 30, Reward Sum 9.0.
[32m[08-03 16:30:59 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 40, Reward Sum 14.0.
[32m[08-03 16:30:59 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 50, Reward Sum 27.0.
[32m[08-03 16:31:00 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 60, Reward Sum 29.0.
[32m[08-03 16

[32m[08-03 16:33:23 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 710, Reward Sum 168.0.
[32m[08-03 16:33:25 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 720, Reward Sum 146.0.
[32m[08-03 16:33:27 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 730, Reward Sum 125.0.
[32m[08-03 16:33:29 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 740, Reward Sum 120.0.
[32m[08-03 16:33:31 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 750, Reward Sum 185.0.
[32m[08-03 16:33:32 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 760, Reward Sum 144.0.
[32m[08-03 16:33:34 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 770, Reward Sum 200.0.
[32m[08-03 16:33:36 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 780, Reward Sum 200.0.
[32m[08-03 16:33:38 MainThread @<ipython-input-11-6b5a2e549c1a>:30][0m Episode 790, Reward Sum 139.0.
[32m[08-03 16:33:54 MainThread @<ipython-input-11-6b5a2e549c1a>