# Pendulum

In [1]:
# custom utilies for displaying animation, collecting rollouts and more
import pong_utils
from parallelEnv import parallelEnv
import numpy as np
import gym
import time
%matplotlib inline

# check which device is being used. 
# I recommend disabling gpu until you've made sure that the code runs
device = pong_utils.device
print("using device: ",device)

using device:  cpu


## Environment

In [2]:
nb_agent=4
envs = parallelEnv('Pendulum-v0', n=nb_agent)
envs.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


array([[-0.70963291,  0.70457159, -0.29201214],
       [-0.88121757,  0.47271091, -0.30510363],
       [-0.24428608,  0.96970321, -0.41104684],
       [-0.33505524, -0.94219849, -0.81791723]])

  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]


## Policy

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class Policy(nn.Module):
    def __init__(self, state_size, action_size, fc1_units,fc11_units, std=0.0):
        super(Policy, self).__init__()
        self.fc1  = nn.Linear(state_size, fc1_units)
        self.fc11 = nn.Linear(fc1_units, fc11_units)
        self.fc2v = nn.Linear(fc11_units, 1)
        self.fc2a = nn.Linear(fc11_units, action_size)
        self.reset_parameters()
        self.log_std = nn.Parameter(torch.ones(1, action_size) * std)   
   

    def reset_parameters(self):
        self.fc1.weight.data.normal_(mean=0., std=0.1)
        self.fc11.weight.data.normal_(mean=0., std=0.1)
        self.fc2a.weight.data.normal_(mean=0., std=0.1)
        self.fc2v.weight.data.normal_(mean=0., std=0.1)
       
    def forward(self, state):
 
        value = F.relu(self.fc1(state))
        value = F.relu(self.fc11(value))
        value = self.fc2v(value)

        mu = F.relu(self.fc1(state))
        mu = F.relu(self.fc11(mu))
        mu = self.fc2a(mu)


        std   = self.log_std.exp().expand_as(mu)
        distribution  = Normal(mu, std)
        return distribution, value
    
 
# run your own policy!
# policy=Policy().to(device)
policy=Policy(state_size=3, action_size=1,fc1_units=128,fc11_units=64).to(device)

# we use the adam optimizer with learning rate 2e-4
# optim.SGD is also possible
import torch.optim as optim
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

## Function Definitions

In [4]:
def collect_trajectories(envs, policy, tmax=200, nrand=5):
        
    n=len(envs.ps)
    #initialize returning lists and start the game!
    state_list =[]
    reward_list=[]
    prob_list  =[]
    action_list=[]
    value_list =[]

    envs.reset()
    
    #start all parallel agents

#    envs.step([[1]]*n)
    
    # perform nrand random steps
    for i in range(nrand):
        action = [[np.random.uniform(-2, 2)] for _ in range(n)]
        next_states, rewards, dones, _ = envs.step(action)

    log_probs=[]
    
    for t in range(tmax):
        actions=[]
        state_list.append(next_states)
        values=[]
        for state in next_states:
 
            state=torch.tensor([state], dtype=torch.float, device=device)
            distribution, value = policy(state)
            action = distribution.sample()
            log_prob = distribution.log_prob(action).detach()
 #           entropy  = distribution.entropy()
            actions.append(action.squeeze())
            values.append(value.squeeze().data)
            log_probs.append(log_prob.squeeze()) 
 #            entropies.append(entropy.squeeze())
        next_states, rewards, is_dones,_= envs.step(actions)

        # store the result
 #       print("rewards:",rewards)
 #       print("values:",values)
 #       print("rewards_values:",rewards-values)

        reward_list.append(rewards)
        action_list.append(actions)
        value_list.append(actions)        
        # stop if any of the trajectories is done
        # we want all the lists to be retangular
        if is_dones.any():
            break


    # return pi_theta, states, actions, rewards, probability
    return log_probs, state_list,action_list, reward_list

In [5]:
old_probs, states, actions, rewards, values = collect_trajectories(envs, policy, tmax=4,nrand=2)
print("actions",actions)
print("probs",old_probs)
print("values",values)

ValueError: not enough values to unpack (expected 5, got 4)

## Clipped surrogate
In PPO algorithm the scalar function is given by
$\frac{1}{T}\sum^T_t \min\left\{R_{t}^{\rm future}\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)},R_{t}^{\rm future}{\rm clip}_{\epsilon}\!\left(\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}\right)\right\}$

the ${\rm clip}_\epsilon$ function is implemented in pytorch as ```torch.clamp(ratio, 1-epsilon, 1+epsilon)```

In [27]:
# clipped surrogate function
# similar as -policy_loss for REINFORCE, but for PPO
def clipped_surrogate(policy, old_log_probs, states, actions, rewards,values,discount=0.995,epsilon=0.1, beta=0.01):
    actions=[]
    new_log_probs=[]
    
    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards)*discount[:,np.newaxis]
    
    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]-values
  #  print("rewards F:",rewards)
 #   print("values ",values)
    # normalize the reward: (x-mean)/std 
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (rewards_future - mean[:,np.newaxis])/std[:,np.newaxis]
    
    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)
    # convert states to policy (or probability)
    # evaluate the new prob: this is what we have:(s,a,r,s') for old prob. 
    new_log_probs=[]
 
    for state in states:
        for s in state:
            s=torch.tensor([s], dtype=torch.float, device=device)
            distribution, value = policy(s)   
            action = distribution.sample()
            log_prob = distribution.log_prob(action)
            new_log_probs.append(log_prob.squeeze())

    new_log_probs=torch.stack(new_log_probs)
    old_log_probs=torch.stack(old_log_probs)
    
    # ratio for clipping
    ratio=(new_log_probs-old_log_probs).exp()

    # clipped function
    clip = torch.clamp(ratio,min= 1-epsilon,max= 1+epsilon)
    rewards=rewards.view(1,-1).squeeze()
    clipped_surrogate = torch.min(ratio*rewards, clip*rewards)
    entropy = -(new_log_probs.exp()*torch.log(old_log_probs.exp()+1.e-10)+ \
        (1.0-new_log_probs.exp())*torch.log(1.0-old_log_probs.exp()+1.e-10))
    # this returns an average of all the entries of the tensor
    # effective computing L_sur^clip / T
    # averaged over time-step and number of trajectories
    # this is desirable because we have normalized our rewards
    return torch.mean(clipped_surrogate + beta*entropy)



In [28]:
old_probs, states, actions, rewards,values = collect_trajectories(envs, policy, tmax=4)

## Training

In [29]:

# keep track of how long training takes
# WARNING: running through all 800 episodes will take 30-45 minutes

# training loop max iterations
episode = 1

# widget bar to display progress
!pip install progressbar
import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', 
          pb.Bar(), ' ', pb.ETA() ]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


envs = parallelEnv('Pendulum-v0', n=nb_agent, seed=1234)

discount_rate = .99
epsilon = 0.1
beta = .00001
tmax = 15
SGD_epoch = 4

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    old_probs, states, actions, rewards,values = collect_trajectories(envs, policy, tmax=tmax)
        
    total_rewards = np.sum(rewards, axis=0)


    # gradient ascent step
    for _ in range(SGD_epoch):
        
        # uncomment to utilize your own clipped function!
        # L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

        L = -clipped_surrogate(policy, old_probs, states, actions, rewards,values,epsilon=epsilon, beta=beta)
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L
    
    # the clipping parameter reduces as time goes on
    epsilon*=.999
    
    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.995
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%5==0 :
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)
        
    # update progress widget bar
    timer.update(e+1)
    
timer.finish()
torch.save(policy, 'PPO.policy')


/bin/sh: 1: pip: not found


training loop:   0% |                                          | ETA:  --:--:--

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]
  u = np.clip(u, -self.max_torque, self.max_torque)[0]


rewards F: [[ -3.75804079  -3.97351025  -7.23574814  -0.11624235]
 [ -4.92076418  -5.11010118  -8.11144059  -0.14097114]
 [ -6.68368676  -6.5909249   -9.14566223  -0.1816862 ]
 [ -8.67663005  -8.52035806 -10.32038886  -0.21980922]
 [-10.94420271 -10.49861404 -10.29599292  -0.37125198]
 [-12.9833666  -12.56264345  -9.28055054  -0.65921392]
 [-13.38678613 -12.00565198  -8.22751792  -0.93328958]
 [-10.96269897 -10.1828688   -7.19661239  -1.24173368]
 [ -8.86831915  -8.51365152  -6.32883441  -1.58341299]
 [ -6.95468268  -6.71823082  -5.61744979  -2.42042804]
 [ -5.31555995  -5.22873563  -5.00551909  -3.14466467]
 [ -4.03368941  -4.10770787  -4.64158874  -4.1877801 ]
 [ -3.0668535   -3.12393453  -4.44884767  -5.57785857]
 [ -2.27463859  -2.32897973  -4.49262009  -7.09503327]
 [ -1.74724868  -1.91805086  -4.78751228  -9.17950146]]
values  [[tensor(-0.2342), tensor(-0.1999), tensor(-0.1758), tensor(-0.1711)], [tensor(-0.2635), tensor(-0.2115), tensor(-0.1779), tensor(-0.1724)], [tensor(-0.313

RuntimeError: The size of tensor a (60) must match the size of tensor b (4) at non-singleton dimension 0

# Testing

In [None]:
def play(env, policy, time=2000, nrand=5,n=1):

    env.reset()
    # perform nrand random steps in the beginning

    for i in range(nrand):
        next_state, reward, done, _ = env.step([np.random.uniform(-2, 2)])
    
    for _ in range(time):
        next_state=torch.tensor([next_state], dtype=torch.float, device=device)   
        distribution, value = policy(next_state)
        action = distribution.sample()
        env.render()
        next_state, reward, done, _ = env.step(action)
        if done:
            break 
    
    env.close()
    return 

In [None]:
# save your policy!
policy = torch.load('PPO.policy')

In [None]:
#policy_solution = torch.load('PPO_solution.policy')
env = gym.make('Pendulum-v0')
env.reset() 

In [None]:
play(env, policy, time=2000, nrand=5,n=1)