In [43]:
import torch.nn as nn
import torch 
import random 
import gym
from torch import optim
import numpy as np
import matplotlib.pyplot as plt

from actor_critic import ActorCritic


def run_episodes_policy_gradient(env, num_episodes, max_steps, discount_factor, learn_rate, nstep=1, n_hidden=256):
    
    n_state_features = len(env.reset())
    n_actions = env.action_space.n
    
    model =  ActorCritic(n_state_features, n_actions, n_hidden)
    
    optimizer = optim.Adam(model.parameters(), learn_rate)
    
    episode_durations = []

    # loop for each episode
    for episode in range(num_episodes):
        
        # initialize S (first state of episode)
        s = env.reset()
        print('start state', s)
        I = 1
        step = 0
        
        T = float('inf')
        t = 0
        
        states = [s]
        rewards = []
        while True:
            if t < T:
            
                with torch.no_grad():
                    v_s, pi_s_a = model.forward(s)
            
                # select action
                a = torch.multinomial(pi_s_a, 1).item()  
                log_prob = torch.log(pi_s_a.squeeze(0)[a])
            
                # take action
                s_new, r, done, _ = env.step(a)
                rewards.append(r)
                
                if done:
                    T = t + 1
                else:
                    states.append(s_new)
                    
            tau = t - nstep + 1
            
            if tau >= 0:
                print('tau larger than 0')
                
                G = np.sum(gamma**(i-tau) * rewards[i] for i in range(tau, min(tau+nstep, T)))
                if tau + nstep < T:
                    with torch.no_grad():
                        # Look ahead one step (t+1)
                        v_s_new, _ = model.forward(s_new)
                        G += (gamma**nstep) + v_s_new

                state_tau = states[tau]
                v_tau, pi_s_a_tau = model.forward(state_tau)
                log_prob_tau = torch.log(pi_s_a_tau.squeeze(0)[a])
                
                delta = G - v_tau
                loss_a =  - delta * I * log_prob_tau
                loss_c =  - delta * v_tau # should this be negative or positive?
                loss = loss_a + loss_c
                print('loss', loss)

                # backprop
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                print('t:', t, 'v_s_t:',v_s.item(), 'v_s_t+1:', v_s_new.item(), 
                      'tau:', tau, 'G:', G, 'delta:', delta, 'done:', done)
            else:
                print('t:', t)
            # update episode and records
            if t < T:  # we took an action above
                s = s_new
                
            # episode step
            t += 1
            if tau == T - 1:
                break
            
        if episode % 10 == 0:
            #print("E{0}- Steps:{1} Loss:{2}".format(episode, step, loss))
            pass
                  
        episode_durations.append(step)
        
    return episode_durations

Loop for each step of the episode $t=0,1, \ldots, T-1:$
$$
\begin{array}{l}{G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}} \\ {\boldsymbol{\theta} \leftarrow \boldsymbol{\theta}+\alpha \gamma^{t} G \nabla \ln \pi\left(A_{t} | S_{t}, \boldsymbol{\theta}\right)}\end{array}
$$
* The REINFORCE loss is defined as $- \sum_t \log \pi_\theta(a_t|s_t) G_t$, which means that you should compute the (discounted) return $G_t$ for all $t$



Here we have
Take action $A,$ observe $S^{\prime}, R$ 
$$
\delta \leftarrow R+\gamma \hat{v}\left(S^{\prime}, \mathbf{w}\right)-\hat{v}(S, \mathbf{w}) \\
\mathbf{w} \leftarrow \mathbf{w}+\alpha^{\mathbf{w}} \delta \nabla \hat{v}(S, \mathbf{w}) \\ \boldsymbol{\theta} \leftarrow \boldsymbol{\theta}+\alpha^{\boldsymbol{\theta}} I \delta \nabla \ln \pi(A | S, \boldsymbol{\theta})
$$

In [44]:
def smooth(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

num_episodes = 400
discount_factor = 0.99
learn_rate = 0.001
seed = 42
random.seed(seed)
torch.manual_seed(seed)

env = gym.envs.make("CartPole-v0")
env.seed(seed)

max_steps =  300 # max steps per episode

episode_durations_policy_gradient = run_episodes_policy_gradient(env, 
                                                                 num_episodes, 
                                                                 max_steps,
                                                                 discount_factor, 
                                                                 learn_rate,
                                                                 nstep=3,
                                                                 n_hidden=256)
plt.plot(smooth(episode_durations_policy_gradient, 20))
plt.title('Episode durations per episode')
plt.legend(['Policy gradient'])

start state [ 0.00560942  0.01842265 -0.03590751 -0.0120678 ]
t: 0
t: 1
tau larger than 0
loss tensor([[2.3530]])


  input = module(input)


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
import torch.nn as nn
import torch 
import random 
import gym
from torch import optim
import numpy as np
import matplotlib.pyplot as plt

from actor_critic import ActorCritic


def run_episodes_policy_gradient(env, num_episodes, max_steps, discount_factor, learn_rate, nstep=1, n_hidden=256):
    
    n_state_features = len(env.reset())
    n_actions = env.action_space.n
    
    model =  ActorCritic(n_state_features, n_actions, n_hidden)
    
    optimizer = optim.Adam(model.parameters(), learn_rate)
    
    episode_durations = []

    # loop for each episode
    for episode in range(num_episodes):
        
        # initialize S (first state of episode)
        s = env.reset()
        print('start state', s)
        I = 1
        step = 0
        
        T = float('inf')
        t = 0
        
        states = [s]
        rewards = []
        while True:
            print('t', t)
            if t < T:
            
                with torch.no_grad():
                    v_s, pi_s_a = model.forward(s)
                    print('v_s', v_s)
                    print('pi_s_a', pi_s_a)
            
                # select action
                a = torch.multinomial(pi_s_a, 1).item()  
                print('a', a)
                log_prob = torch.log(pi_s_a.squeeze(0)[a])
            
                # take action
                s_new, r, done, _ = env.step(a)
                print('s_new', s_new)
                print('r', r)
                print('done', done)
                rewards.append(r)
                
                if done:
                    T = t + 1
                else:
                    states.append(s_new)
                
                        
            # compute delta 
#             v_s_new = 0 
#             with torch.no_grad():
#                 if not done:
#                     v_s_new, _ = model.forward(s_new)
#                     v_s_new = v_s_new.item() 
#                 delta = r + (gamma * v_s_new) - v_s.item()
            
            tau = t - nstep + 1
            print('tau', tau)
            if tau >= 0:
                G = np.sum(gamma**(i-tau) * rewards[i] for i in range(tau, min(tau+nstep, T)))
                if tau + nstep < T:
                    with torch.no_grad():
                        # Look ahead one step (t+1)
                        v_s_new, _ = model.forward(s_new)
                        G += (gamma**nstep) + v_s_new
                print('G', G)

                state_tau = states[tau]
                print('state_tau', state_tau)
                v_tau, pi_s_a_tau = model.forward(state_tau)
                print('v_tau', v_tau)
                log_prob_tau = torch.log(pi_s_a_tau.squeeze(0)[a])
                
                delta = G - v_tau
                print('delta', delta)

                # from another github where the whole thing works:
                # adv = r - v_s.item()
                # loss_a = -log_prob * adv
                # loss_c = torch.nn.functional.smooth_l1_loss(v_s, torch.Tensor([r]).float())

                # compute gradient
                # minus
                loss_a =  - delta * I * log_prob_tau
                loss_c =  - delta * v_tau # should this be negative or positive?
                print('loss_a', loss_a)
                print('loss_c', loss_c)
                loss = loss_a + loss_c

                # backprop
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            # update episode and records
            if t < T:  # we took an action above
                s = s_new
                
            # episode step
            t += 1
            if tau == T - 1:
                break

#             I = discount_factor*I
#             s = s_new
#             step += 1

            # until s is a terminal state or we used the max steps
            if done or step > max_steps:
                break
            
        if episode % 10 == 0:
            #print("E{0}- Steps:{1} Loss:{2}".format(episode, step, loss))
            pass
                  
        episode_durations.append(step)
        
    return episode_durations

