In [20]:
# !export PARL_BACKEND=torch
import sys, os
os.environ['PARL_BACKEND'] = 'torch'
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import parl
# import paddle as torch
# import paddle.nn as nn
# import paddle.nn.functional as F
# import parl


In [21]:
# 第一层抽象，Model，是个策略网络(Policy Network)或者一个值函数网络(Value Function Network)
# 这里我们写的是一个策略网络，它的输入是一个4维的向量，输出是一个2维的向量，分别是：[向左的概率，向右的概率]
class CartpoleModel(parl.Model):
    def __init__(self, obs_dim=4, act_dim=2):
        super(CartpoleModel, self).__init__()
        hid1_size = act_dim * 10
        self.fc1 = nn.Linear(obs_dim, hid1_size)
        self.fc2 = nn.Linear(hid1_size, act_dim)

    def forward(self, x):
        # x是一个4维的向量，分别是：[cart_position, cart_velocity, pole_angle, pole_velocity]
        out = torch.tanh(self.fc1(x))
        prob = F.softmax(self.fc2(out))
        return prob 
model = CartpoleModel(act_dim=2)
# 定义Model的训练算法 Algorithm ，PolicyGradient是一个基于策略的强化学习算法。
algorithm = parl.algorithms.PolicyGradient(model, lr=1e-4)

In [22]:
# 第三层抽象，Agent，是一个智能体，它除了包含上面的一个策略网络，一个优化器
# 还增加了 一个学习算法，一个经验池，一个探索策略。 主要就是获得数据，用数据区训练上面的Algorithm。
# Agent 不包含 Environment，但是与 Environment交互
class CartpoleAgent(parl.Agent):
    def __init__(self, algorithm):
        # 存储外面定义的算法
        super().__init__(algorithm)

    def sample(self, obs):
        # 根据环境状态返回动作（action），一般用于训练时候采样action进行探索。
        # 这个是训练用的。
        obs = torch.Tensor(obs).to(torch.float32)
        prob = self.alg.predict(obs)
        prob = prob.detach().numpy()
        # 按照概率去选择一个动作，以便采样。
        act = np.random.choice(len(prob), 1, p=prob)[0]

        return act

    def predict(self, obs):
        # 根据环境状态返回预测动作（action），一般只是套一层算法的预测结果。
        # 这个是最后的部署用的。
        obs = torch.Tensor(obs).to(torch.float32)
        prob = self.alg.predict(obs)
        act = int(prob.argmax())
        return act

    def learn(self, obs, act, reward):
        # 给出一个loss，以便反向传播？
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = torch.Tensor(obs).to(torch.float32)
        act = torch.Tensor(act).to(torch.int32)
        reward = torch.Tensor(reward).to(torch.float32)
        loss = self.alg.learn(obs, act, reward)
        return float(loss)
agent = CartpoleAgent(algorithm)

In [25]:
import gym
env = gym.make("CartPole-v0")
from parl.utils import logger
# 训练代码
def run_train_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        action = agent.sample(obs)
        action_list.append(action)
        obs, reward, done, info = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list

# evaluate 5 episodes
def run_evaluate_episodes(env, agent, eval_episodes=200, render=False):
    eval_reward = []
    for i in range(eval_episodes):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.sum(eval_reward)

def calc_reward_to_go(reward_list, gamma=1.0):
    for i in range(len(reward_list) - 2, -1, -1):
        # G_i = r_i + γ·G_i+1
        reward_list[i] += gamma * reward_list[i + 1]  # Gt
    return np.array(reward_list)

for i in range(1000):
      obs_list, action_list, reward_list = run_train_episode(env, agent)
      if i % 10 == 0:
          logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))
      batch_obs = np.array(obs_list)
      batch_action = np.array(action_list)
      batch_reward = calc_reward_to_go(reward_list)

      agent.learn(batch_obs, batch_action, batch_reward)

      if (i + 1) % 100 == 0:
        #   _, _, reward_list = run_evaluate_episodes(env, agent)
          reward = run_evaluate_episodes(env, agent)

          logger.info('Test reward: {}'.format(reward))

[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 0, Reward Sum 18.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 10, Reward Sum 18.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 20, Reward Sum 11.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 30, Reward Sum 20.0.


  prob = F.softmax(self.fc2(out))


[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 40, Reward Sum 16.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 50, Reward Sum 15.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 60, Reward Sum 16.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 70, Reward Sum 28.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 80, Reward Sum 13.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 90, Reward Sum 17.0.
[32m[04-11 21:18:40 MainThread @4191327931.py:55][0m Test reward: 1879.0
[32m[04-11 21:18:40 MainThread @4191327931.py:44][0m Episode 100, Reward Sum 21.0.
[32m[04-11 21:18:41 MainThread @4191327931.py:44][0m Episode 110, Reward Sum 10.0.
[32m[04-11 21:18:41 MainThread @4191327931.py:44][0m Episode 120, Reward Sum 12.0.
[32m[04-11 21:18:41 MainThread @4191327931.py:44][0m Episode 130, Reward Sum 10.0.
[32m[04-11 21:18:41 MainThread @4191327931.py:44][0m Episode 140, Reward Sum 15