In [None]:
# Ref: https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf
# Ref: https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py?source=post_page---------------------------
# Ref: https://www.youtube.com/watch?v=y8UPGr36ccI (11分開始有說明loss)

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

In [2]:
env = gym.make('CartPole-v1')
env.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7f10f02b44b0>

### Policy Gradients
A policy gradient attempts to train an agent without explicitly mapping the value for every state-action pair in an environment by taking small steps and updating the policy based on the reward associated with that step. The agent can receive a reward immediately for an action or the agent can receive the award at a later time such as the end of the episode.  We’ll designate the policy function our agent is trying to learn as 

$\pi_\theta(a,s)$, where $\theta$ is the parameter vector, $s$ is a particular state, and $a$ is an action.

We'll apply a technique called Monte-Carlo Policy Gradient which means we will have the agent run through an entire episode and then update our policy based on the rewards obtained.

### Model Construction
Create Neural Network Model
We will use a simple feed forward neural network with one hidden layer of 128 neurons and a dropout of 0.6. We'll use Adam as our optimizer and a learning rate of 0.01. Using dropout will significantly improve the performance of our policy. I encourage you to compare results with and without dropout and experiment with other hyper-parameter values.

In [12]:
#Hyperparameters
learning_rate = 0.0002
gamma = 0.98

In [13]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.l1 = nn.Linear(self.state_space, 128, bias=True)
        self.l2 = nn.Linear(128, self.action_space, bias=True)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.policy_history = torch.Tensor()
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.l2,
            nn.Softmax(dim=-1)
        )
        return model(x)

In [14]:
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

### Select Action
The select_action function chooses an action based on our policy probability distribution using the PyTorch distributions package. Our policy returns a probability for each possible action in our action space (move left or move right) as an array of length two such as [0.7, 0.3]. We then choose an action based on these probabilities, record our history, and return our action.

功能：根据概率分布来产生sample，产生的sample是输入tensor的index
如：
- m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
- m.sample() 　　# equal probability of 0, 1, 2, 3
- tensor(3)

In [15]:
def select_action(state):
    # Select an action (0 or 1) by running policy model and choosing based on the probabilities in state
    state = torch.from_numpy(state).type(torch.FloatTensor)
    state = policy(state)
    c = Categorical(state)
    # 從機率分布中去抽樣action
    action = c.sample()
    
    # Add log probability of our chosen action to our history
    if policy.policy_history.dim() != 0:
        # 將機率記錄下來
        # print(c.log_prob(action))
        # print(torch.tensor([c.log_prob(action)]).dim())
        # policy.policy_history = torch.cat([policy.policy_history, torch.tensor([c.log_prob(action)])])
#         print(c.log_prob(action))
        policy.policy_history = torch.cat([policy.policy_history,  torch.tensor([c.log_prob(action)])])
    else:
        print(c.log_prob(action))
        policy.policy_history = torch.tensor([c.log_prob(action)])
    return action

### Reward $v_t$
We update our policy by taking a sample of the action value function $Q^{\pi_\theta} (s_t,a_t)$ by playing through episodes of the game.  $Q^{\pi_\theta} (s_t,a_t)$ is defined as the expected return by taking action $a$ in state $s$ following policy $\pi$.

We know that for every step the simulation continues we receive a reward of 1. We can use this to calculate the policy gradient at each time step, where $r$ is the reward for a particular state-action pair. Rather than using the instantaneous reward, $r$, we instead use a long term reward $ v_{t} $ where $v_t$ is the discounted sum of all future rewards for the length of the episode. In this way, the longer the episode runs into the future, the greater the reward for a particular state-action pair in the present. $v_{t}$ is then,

$$ v_{t} = \sum_{k=0}^{N} \gamma^{k}r_{t+k} $$
where $\gamma$ is the discount factor (0.99). For example, if an episode lasts 5 steps, the reward for each step will be [4.90, 3.94, 2.97, 1.99, 1]. Next we scale our reward vector by substracting the mean from each element and scaling to unit variance by dividing by the standard deviation. This practice is common for machine learning applications and the same operation as Scikit Learn's StandardScaler. It also has the effect of compensating for future uncertainty.

Update Policy
After each episode we apply Monte-Carlo Policy Gradient to improve our policy according to the equation:

$$\Delta\theta_t = \alpha\nabla_\theta \, \log \pi_\theta (s_t,a_t)v_t  $$
We will then feed our policy history multiplied by our rewards to our optimizer and update the weights of our neural network using stochastic gradent ascent. This should increase the likelihood of actions that got our agent a larger reward.

In [21]:
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        rewards.insert(0, R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    
    # Calculate loss
#     print(policy.policy_history)
#     print(policy.reward_episode)
    loss = torch.sum(torch.mul(policy.policy_history, rewards).mul(-1), -1) # -log
    loss = Variable(loss, requires_grad=True) ## 這個卡很久，loss要給grad，但不是depreciated?
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.item())  ## 不用data[0]
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = torch.Tensor()
    policy.reward_episode = []

In [22]:
a = torch.randn(3)
print(a)
print(torch.mul(a, 100))
print(torch.mul(a, 100).mul(-1))
print(torch.sum(torch.mul(a, 100).mul(-1), -1))

tensor([0.2844, 0.0922, 0.0966])
tensor([28.4384,  9.2221,  9.6575])
tensor([-28.4384,  -9.2221,  -9.6575])
tensor(-47.3180)


### Training
This is our main policy training loop. For each step in a training episode, we choose an action, take a step through the environment, and record the resulting new state and reward. We call update_policy() at the end of each episode to feed the episode history to our neural network and improve our policy.

In [23]:
# episode開始 -> 將state丟net，得到機率分布，選action (過程中要記錄action以及prob) -> 實際玩一整場episode，紀錄state、當下reward ->
# 將剛剛一整場的R加上discount處理、正規化後，乘上每一輪的action機率，乘上負號加總，當作這episode的loss -> 更新網路 -> 玩下一個episode
# 根據step與time制定玩到什麼程度算是訓練完畢

def main(episodes):
    running_reward = 10
    for episode in range(episodes):
        state = env.reset() # Reset environment and record the starting state
        done = False       
    
        for time in range(1000):
            action = select_action(state)
#              print(action.item())
            # Step through environment using chosen action
            state, reward, done, _ = env.step(action.item())

            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break
        
        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (time * 0.01)
        # https://stackoverflow.com/questions/54737990/how-does-one-determine-when-the-cartpole-environment-has-been-solved

        update_policy()

        if episode % 50 == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(episode, time, running_reward))

        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, time))
            break

In [24]:
env.reset()
env.spec.reward_threshold
env.action_space
env.step(1)

(array([-0.00259079,  0.21040909,  0.02555058, -0.27808369]), 1.0, False, {})

### Run Model

In [None]:
episodes = 10000
main(episodes)

Episode 0	Last length:    35	Average length: 10.25
Episode 50	Last length:    17	Average length: 16.03
Episode 100	Last length:    32	Average length: 19.37
Episode 150	Last length:    37	Average length: 21.29
Episode 200	Last length:    22	Average length: 23.41
Episode 250	Last length:    16	Average length: 24.76
Episode 300	Last length:    19	Average length: 25.46
Episode 350	Last length:    43	Average length: 25.54
Episode 400	Last length:    37	Average length: 24.37
Episode 450	Last length:    21	Average length: 25.11
Episode 500	Last length:    60	Average length: 25.91
Episode 550	Last length:    17	Average length: 25.75
Episode 600	Last length:    12	Average length: 25.77
Episode 650	Last length:    19	Average length: 24.47
Episode 700	Last length:    31	Average length: 24.20
Episode 750	Last length:    17	Average length: 24.36
Episode 800	Last length:    36	Average length: 23.60
Episode 850	Last length:    35	Average length: 24.00
Episode 900	Last length:    12	Average length: 23