In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal
import numpy as np
import gym

In [9]:
class policynet(nn.Module):
    """Fully connected network: two ReLU hidden layers followed by a linear head.

    Constructor arguments, in order:
        s  -- size of the input (state space)
        h  -- width of the first hidden layer
        h2 -- width of the second hidden layer
        a  -- size of the output (action space)
    """

    def __init__(self, s, h, h2, a):
        super().__init__()
        # layers are declared in input -> output order
        self.hl = nn.Linear(s, h)
        self.hl2 = nn.Linear(h, h2)
        self.out = nn.Linear(h2, a)

    def forward(self, x):
        """Return the raw output of the last linear layer for input ``x``."""
        # both hidden layers are followed by a ReLU; the head has no activation
        for hidden in (self.hl, self.hl2):
            x = F.relu(hidden(x))
        return self.out(x)
            
        

In [11]:
## hyper-parameters (everything a reader might want to tune)
env_name = 'Pendulum-v0'       # id passed to gym.make
hidden_sizes = [32]            # not referenced by the rest of the visible code
lr = 0.005                     # learning rate shared by both Adam optimizers
total_time = 10 ** 6           # stop training once this many env steps were collected
time_per_batch = 3000          # keep starting episodes until a batch has this many steps
max_steps_episode = 1500       # upper bound on the number of steps in one episode
gamma = 0.95                   # discount factor
updates_per_iteration = 5      # gradient updates performed on each collected batch
clip = 0.2                     # the probability ratio is clamped to [1 - clip, 1 + clip]


In [None]:
# --- environment ---------------------------------------------------------
env = gym.make(env_name)

# the networks are sized from the first dimension of the env's spaces
obs_dim = env.observation_space.shape[0]   # observation dimensions
n_acts = env.action_space.shape[0]         # action dimensions

# --- actor / critic ------------------------------------------------------
# the actor proposes (mean) actions, the critic outputs a single value estimate
actor = policynet(obs_dim, 50, 50, n_acts)
critic = policynet(obs_dim, 50, 50, 1)

# one Adam optimizer per network, both using the same learning rate
a_optimizer, c_optimizer = (
    torch.optim.Adam(net.parameters(), lr=lr) for net in (actor, critic)
)

# --- action distribution -------------------------------------------------
# fixed diagonal covariance (0.5 on the diagonal) for the normal distribution
cov_vec = torch.full((n_acts,), 0.5)
cov_mat = torch.diag(cov_vec)


# make action selection function (outputs continuous actions, sampled from the policy's normal distribution)
def get_action(obs):
    """Sample an action for ``obs`` from the current policy.

    The output of ``actor`` is used as the mean of a ``MultivariateNormal``
    whose covariance is the fixed matrix ``cov_mat``.

    Args:
        obs: observation tensor, passed unchanged to ``actor``.

    Returns:
        tuple ``(act, log_prob)``: the sampled action as a NumPy array and
        the log-probability of that action as a tensor that carries no
        gradient.
    """
    # Neither return value is ever differentiated, so do not record an
    # autograd graph for the forward pass at all (instead of building one on
    # every environment step and detaching afterwards).
    with torch.no_grad():
        # mean action -> distribution
        dist = MultivariateNormal(actor(obs), cov_mat)

        # sample the action and evaluate its log probability
        act = dist.sample()
        log_prob = dist.log_prob(act)

    return act.numpy(), log_prob
    

# compute rewards to go, rewards gained after taking an action
def compute_rtg(batch_rew, discount=None):
    """Compute discounted rewards-to-go for every step of every episode.

    For one episode with rewards ``r_0 .. r_T`` the value at step ``t`` is
    ``r_t + discount * r_{t+1} + discount**2 * r_{t+2} + ...``.

    Args:
        batch_rew: sequence of episodes, each a sequence of per-step rewards.
        discount: discount factor; when omitted the module-level ``gamma``
            is used, which matches the previous behaviour.

    Returns:
        1-D float tensor holding the rewards-to-go of all episodes
        concatenated in the order the episodes (and their steps) were given.
        An empty batch yields an empty tensor.
    """
    if discount is None:
        discount = gamma

    batch_rtg = []
    for ep_rews in batch_rew:
        # Walk the episode backwards: the last step only sees its own reward,
        # each earlier step adds the discounted value of the step after it.
        discounted_reward = 0
        ep_rtg = []
        for rew in reversed(ep_rews):
            discounted_reward = rew + discounted_reward * discount
            ep_rtg.append(discounted_reward)

        # Values were produced last-step-first; flip them once per episode.
        # (Appending + one reverse is linear, unlike insert(0, ...) per step.)
        batch_rtg.extend(reversed(ep_rtg))

    return torch.tensor(batch_rtg, dtype=torch.float)


def train():
    """Collect one batch of experience by running the current policy in ``env``.

    Despite its name this function performs no optimisation step: it only
    rolls out episodes until at least ``time_per_batch`` environment steps
    were taken (each episode is capped at ``max_steps_episode`` steps).

    Returns:
        tuple of
        - observations seen before each step (float tensor),
        - actions taken (float tensor),
        - log-probabilities of those actions (float tensor),
        - rewards-to-go as produced by ``compute_rtg`` (float tensor),
        - list with the length of every episode,
        - list with the per-step rewards of every episode.
    """
    obs_buf, act_buf, logp_buf = [], [], []   # one entry per environment step
    rew_buf, len_buf = [], []                 # one entry per episode

    steps = 0
    while steps < time_per_batch:
        # start a new episode
        obs = env.reset()
        ep_rews = []

        for ep_t in range(max_steps_episode):
            steps += 1

            # store the observation the action is based on
            obs_buf.append(obs.copy())

            # query the policy and advance the environment
            act, log_prob = get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, rew, done, _ = env.step(act)

            act_buf.append(act)
            ep_rews.append(rew)
            logp_buf.append(log_prob)

            # the environment signalled the end of the episode
            if done:
                break

        # per-episode bookkeeping
        rew_buf.append(ep_rews)
        len_buf.append(ep_t + 1)

    return (
        torch.tensor(np.array(obs_buf), dtype=torch.float),
        torch.tensor(np.array(act_buf), dtype=torch.float),
        torch.tensor(logp_buf, dtype=torch.float),
        compute_rtg(rew_buf),
        len_buf,
        rew_buf,
    )

# environment steps collected so far
t_yet = 0

# training iterations (batches) completed so far
i_yet = 0

while t_yet < total_time:
    # roll out the current policy to get a fresh batch
    batch_obs, batch_acts, batch_log_probs, batch_rtg, batch_lens, batch_rew = train()

    t_yet += np.sum(batch_lens)
    i_yet += 1

    # Advantage = rewards-to-go minus the critic's value estimate. The
    # advantage is a constant during the updates below, so no graph is needed.
    # squeeze(-1) removes only the critic's trailing size-1 output dimension.
    with torch.no_grad():
        Val = critic(batch_obs).squeeze(-1)
    A_k = batch_rtg - Val

    # normalize advantage values for faster convergence
    A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

    for _ in range(updates_per_iteration):
        # current value estimates (with gradient, used for the critic loss)
        Val = critic(batch_obs).squeeze(-1)

        # log probs of the batch actions according to the current actor
        mean = actor(batch_obs)
        dist = MultivariateNormal(mean, cov_mat)
        log_prob = dist.log_prob(batch_acts)

        # subtracting logs is like dividing, then exponentiate to get out of log space:
        # ratio of new to old action probability
        ratios = torch.exp(log_prob - batch_log_probs)

        # PPO clipped surrogate objective: take the element-wise minimum of the
        # unclipped and the clipped term so the ratio cannot profit from moving
        # further than `clip` away from 1
        surr1 = ratios * A_k
        surr2 = torch.clamp(ratios, 1 - clip, 1 + clip) * A_k

        # negative sign because the optimizer minimizes while we maximize the objective
        actor_loss = (-torch.min(surr1, surr2)).mean()
        critic_loss = F.mse_loss(Val, batch_rtg)

        # backprop actor; the actor and critic losses come from separate
        # networks and therefore separate graphs, so nothing has to be retained
        a_optimizer.zero_grad()
        actor_loss.backward()
        a_optimizer.step()

        # backprop critic
        c_optimizer.zero_grad()
        critic_loss.backward()
        c_optimizer.step()

    # mean undiscounted episode return of this batch
    avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in batch_rew])

    print('time: %3d \t reward: %.3f \t ep_len: %.3f'%
            (t_yet, avg_ep_rews, np.mean(batch_lens)))



time: 3000 	 reward: -1218.923 	 ep_len: 200.000
time: 6000 	 reward: -1341.225 	 ep_len: 200.000
time: 9000 	 reward: -1289.657 	 ep_len: 200.000
time: 12000 	 reward: -1254.246 	 ep_len: 200.000
time: 15000 	 reward: -1219.515 	 ep_len: 200.000
time: 18000 	 reward: -1184.793 	 ep_len: 200.000
time: 21000 	 reward: -1183.111 	 ep_len: 200.000
time: 24000 	 reward: -1187.248 	 ep_len: 200.000
time: 27000 	 reward: -1244.862 	 ep_len: 200.000
time: 30000 	 reward: -1213.684 	 ep_len: 200.000
time: 33000 	 reward: -1204.177 	 ep_len: 200.000
time: 36000 	 reward: -1215.105 	 ep_len: 200.000
time: 39000 	 reward: -1141.333 	 ep_len: 200.000
time: 42000 	 reward: -1218.544 	 ep_len: 200.000
time: 45000 	 reward: -1179.629 	 ep_len: 200.000
time: 48000 	 reward: -1000.417 	 ep_len: 200.000
time: 51000 	 reward: -1036.261 	 ep_len: 200.000
time: 54000 	 reward: -1036.914 	 ep_len: 200.000
time: 57000 	 reward: -1145.020 	 ep_len: 200.000
time: 60000 	 reward: -1077.608 	 ep_len: 200.000
tim

time: 498000 	 reward: -224.439 	 ep_len: 200.000
time: 501000 	 reward: -180.687 	 ep_len: 200.000
time: 504000 	 reward: -182.592 	 ep_len: 200.000
time: 507000 	 reward: -171.317 	 ep_len: 200.000
time: 510000 	 reward: -141.821 	 ep_len: 200.000
time: 513000 	 reward: -167.635 	 ep_len: 200.000
time: 516000 	 reward: -182.513 	 ep_len: 200.000
time: 519000 	 reward: -170.660 	 ep_len: 200.000
time: 522000 	 reward: -110.802 	 ep_len: 200.000
time: 525000 	 reward: -142.170 	 ep_len: 200.000
time: 528000 	 reward: -153.640 	 ep_len: 200.000
time: 531000 	 reward: -217.211 	 ep_len: 200.000
time: 534000 	 reward: -158.668 	 ep_len: 200.000
time: 537000 	 reward: -136.337 	 ep_len: 200.000
time: 540000 	 reward: -93.810 	 ep_len: 200.000
time: 543000 	 reward: -167.153 	 ep_len: 200.000
time: 546000 	 reward: -144.865 	 ep_len: 200.000
time: 549000 	 reward: -162.022 	 ep_len: 200.000
time: 552000 	 reward: -178.888 	 ep_len: 200.000
time: 555000 	 reward: -146.996 	 ep_len: 200.000
t

In [7]:
# test our model

# The Monitor wrapper saves video output to './'; replace the next line with
# `env = gym.make(env_name)` if no recording is wanted.
# env_name is reused so the evaluation runs on the environment the policy was trained on.
env = gym.wrappers.Monitor(gym.make(env_name), './', force = True)

try:
    for i_episode in range(1):

        observation = env.reset()

        for t in range(200):
            env.render()
            # get_action returns (action, log_prob); only the action is needed here
            action, _ = get_action(torch.as_tensor(observation, dtype=torch.float32))
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # close the environment even when an episode raises, so the wrapper is always shut down
    env.close()

Episode finished after 200 timesteps
