In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from src.PPO.PPO import PPO

# --- Reproducibility ---------------------------------------------------------
# Seed every random number generator in play (Python, NumPy, PyTorch and the
# environment) rather than the environment alone, so that anything drawing from
# them (e.g. PyTorch weight initialisation when the agent is built below) is
# repeatable from a fresh kernel.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Environment -------------------------------------------------------------
env = gym.make('CartPole-v1')
# env = gym.make('LunarLander-v2')
env.seed(SEED)

# Dimensions handed to the agent: length of the observation vector and the
# number of discrete actions.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# --- PPO settings ------------------------------------------------------------
update_every = 128  # This is the roll out length (environment steps between updates)
num_learn = 20      # passed to agent.learn_gae on every update
# Training stops once the average score over the last 100 episodes exceeds this.
# NOTE(review): gym registers CartPole-v1 with a reward threshold of 475; 200 is
# closer to CartPole-v0's threshold -- confirm this is the intended target.
win_condition = 200

# --- Agent settings ----------------------------------------------------------
hidden_size = 512
epsilon = 0.2
entropy_beta = 0.01
gamma = 0.99
lr = 0.002

agent = PPO(state_size, action_size, hidden_size=hidden_size, epsilon=epsilon, entropy_beta=entropy_beta, gamma=gamma, lr=lr)

In [5]:
def train(n_episodes=2000, max_t=700):
  """Train the module-level ``agent`` on the module-level ``env``.

  Transitions are stored in ``agent.mem``; every ``update_every`` environment
  steps (counted across episode boundaries) ``agent.learn_gae`` is called and
  the memory is cleared.

  Relies on these notebook globals: ``env``, ``agent``, ``update_every``,
  ``num_learn`` and ``win_condition``.

  Args:
    n_episodes: Maximum number of episodes to run.
    max_t: Maximum number of steps per episode.

  Returns:
    A tuple ``(scores, average_scores)``: the total reward of every episode,
    and the running mean of the most recent (up to 100) episode scores after
    each episode.
  """
  steps = 0  # environment steps collected since the last update
  scores_deque = deque(maxlen=100)
  scores = []
  average_scores = []

  for episode in range(1, n_episodes+1):
    state = env.reset()
    score = 0

    for t in range(max_t):
      steps += 1

      state_tensor = torch.FloatTensor(state)
      action, log_prob = agent.act(state_tensor)
      next_state, reward, done, _ = env.step(action.item())

      agent.mem.add(state_tensor, action, reward, log_prob, done)

      # Update
      state = next_state
      score += reward

      if steps >= update_every:
        # The bootstrap value must belong to the state that FOLLOWS the last
        # stored transition (now held in `state`), not to the state the last
        # action was taken from. It is only needed here, so it is computed once
        # per update instead of on every step, and without building an autograd
        # graph because it serves as a target rather than something to
        # differentiate through.
        # NOTE(review): assumes the value returned by evaluate() does not depend
        # on the action argument -- confirm against the model implementation.
        with torch.no_grad():
          _, _, last_value, _ = agent.model.evaluate(torch.FloatTensor(state), action)
        agent.learn_gae(num_learn, last_value=last_value, last_done=done)
        agent.mem.clear()
        steps = 0

      if done:
        break

    # Book Keeping
    scores_deque.append(score)
    scores.append(score)
    average_score = np.mean(scores_deque)
    average_scores.append(average_score)

    if episode % 10 == 0:
      print("\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(episode, average_score, score), end="")
    if episode % 100 == 0:
      print("\rEpisode {}\tAverage Score: {:.2f}".format(episode, average_score))

    if average_score > win_condition:
      print("\rEnvironment Solved in {} episodes!".format(episode))
      break

  return scores, average_scores

In [None]:
# Train for at most 500 episodes; train() returns early once the running average
# score exceeds win_condition.
scores, average_scores = train(n_episodes=500)

> /Users/darylrodrigo/Documents/Reinforcement Learning/rl_lib/Policy Gradient/src/PPO/PPO.py(113)learn_gae()
-> discounted_returns = advantage + values
(Pdb) advantage
[tensor([10.4267], grad_fn=<AddBackward0>), tensor([9.4507], grad_fn=<AddBackward0>), tensor([8.4430], grad_fn=<AddBackward0>), tensor([7.4207], grad_fn=<AddBackward0>), tensor([6.3915], grad_fn=<AddBackward0>), tensor([5.3481], grad_fn=<AddBackward0>), tensor([4.2965], grad_fn=<AddBackward0>), tensor([3.2385], grad_fn=<AddBackward0>), tensor([2.1669], grad_fn=<AddBackward0>), tensor([1.0906], grad_fn=<AddBackward0>), tensor([22.4086], grad_fn=<AddBackward0>), tensor([21.6100], grad_fn=<AddBackward0>), tensor([20.7486], grad_fn=<AddBackward0>), tensor([19.8932], grad_fn=<AddBackward0>), tensor([19.0156], grad_fn=<AddBackward0>), tensor([18.1094], grad_fn=<AddBackward0>), tensor([17.1856], grad_fn=<AddBackward0>), tensor([16.2617], grad_fn=<AddBackward0>), tensor([15.3185], grad_fn=<AddBackward0>), tensor([14.3679], grad_

(Pdb) values
tensor([[-0.0705],
        [-0.0922],
        [-0.0965],
        [-0.0931],
        [-0.0968],
        [-0.0946],
        [-0.0904],
        [-0.0932],
        [-0.0873],
        [-0.0906],
        [-0.0829],
        [-0.0689],
        [-0.0543],
        [-0.0678],
        [-0.0873],
        [-0.0956],
        [-0.0866],
        [-0.0963],
        [-0.0943],
        [-0.0970],
        [-0.0946],
        [-0.0978],
        [-0.0949],
        [-0.0982],
        [-0.0956],
        [-0.0981],
        [-0.0870],
        [-0.0982],
        [-0.0967],
        [-0.0933],
        [-0.0892],
        [-0.0898],
        [-0.0940],
        [-0.0872],
        [-0.0703],
        [-0.0538],
        [-0.0698],
        [-0.0880],
        [-0.0955],
        [-0.0882],
        [-0.0962],
        [-0.0952],
        [-0.0933],
        [-0.0891],
        [-0.0863],
        [-0.0855],
        [-0.0884],
        [-0.0821],
        [-0.0692],
        [-0.0881],
        [-0.0948],
        [-0.0939],

(Pdb) advantage + values
*** TypeError: can only concatenate list (not "Tensor") to list
(Pdb) values.unsqueeze()
*** TypeError: unsqueeze() missing 1 required positional arguments: "dim"
(Pdb) values.unsqueeze(dim=0)
tensor([[[-0.0705],
         [-0.0922],
         [-0.0965],
         [-0.0931],
         [-0.0968],
         [-0.0946],
         [-0.0904],
         [-0.0932],
         [-0.0873],
         [-0.0906],
         [-0.0829],
         [-0.0689],
         [-0.0543],
         [-0.0678],
         [-0.0873],
         [-0.0956],
         [-0.0866],
         [-0.0963],
         [-0.0943],
         [-0.0970],
         [-0.0946],
         [-0.0978],
         [-0.0949],
         [-0.0982],
         [-0.0956],
         [-0.0981],
         [-0.0870],
         [-0.0982],
         [-0.0967],
         [-0.0933],
         [-0.0892],
         [-0.0898],
         [-0.0940],
         [-0.0872],
         [-0.0703],
         [-0.0538],
         [-0.0698],
         [-0.0880],
         [-0.0955],
  

(Pdb) values.squeeze()
tensor([-0.0705, -0.0922, -0.0965, -0.0931, -0.0968, -0.0946, -0.0904, -0.0932,
        -0.0873, -0.0906, -0.0829, -0.0689, -0.0543, -0.0678, -0.0873, -0.0956,
        -0.0866, -0.0963, -0.0943, -0.0970, -0.0946, -0.0978, -0.0949, -0.0982,
        -0.0956, -0.0981, -0.0870, -0.0982, -0.0967, -0.0933, -0.0892, -0.0898,
        -0.0940, -0.0872, -0.0703, -0.0538, -0.0698, -0.0880, -0.0955, -0.0882,
        -0.0962, -0.0952, -0.0933, -0.0891, -0.0863, -0.0855, -0.0884, -0.0821,
        -0.0692, -0.0881, -0.0948, -0.0939, -0.0962, -0.0880, -0.0968, -0.0947,
        -0.0953, -0.0932, -0.0910, -0.0882, -0.0870, -0.0854, -0.0700, -0.0887,
        -0.0694, -0.0886, -0.0690, -0.0529, -0.0685, -0.0881, -0.0680, -0.0530,
        -0.0502, -0.0525, -0.0667, -0.0523, -0.0510, -0.0525, -0.0649, -0.0536,
        -0.0525, -0.0551, -0.0604, -0.0567, -0.0586, -0.0732, -0.0891, -0.0706,
        -0.0874, -0.0951, -0.0925, -0.0948, -0.0849, -0.0949, -0.0831, -0.0957,
        -0.0819, 

(Pdb) valyes
*** NameError: name 'valyes' is not defined
(Pdb) values
tensor([[-0.0705],
        [-0.0922],
        [-0.0965],
        [-0.0931],
        [-0.0968],
        [-0.0946],
        [-0.0904],
        [-0.0932],
        [-0.0873],
        [-0.0906],
        [-0.0829],
        [-0.0689],
        [-0.0543],
        [-0.0678],
        [-0.0873],
        [-0.0956],
        [-0.0866],
        [-0.0963],
        [-0.0943],
        [-0.0970],
        [-0.0946],
        [-0.0978],
        [-0.0949],
        [-0.0982],
        [-0.0956],
        [-0.0981],
        [-0.0870],
        [-0.0982],
        [-0.0967],
        [-0.0933],
        [-0.0892],
        [-0.0898],
        [-0.0940],
        [-0.0872],
        [-0.0703],
        [-0.0538],
        [-0.0698],
        [-0.0880],
        [-0.0955],
        [-0.0882],
        [-0.0962],
        [-0.0952],
        [-0.0933],
        [-0.0891],
        [-0.0863],
        [-0.0855],
        [-0.0884],
        [-0.0821],
        [-0.0692],

In [None]:
# Learning curve: raw per-episode score together with its running mean over the
# most recent (up to 100) episodes, as returned by train().
fig, ax = plt.subplots()
ax.plot(scores, label="Episode score")
ax.plot(average_scores, label="Average score (last 100 episodes)")
ax.set(title="PPO training progress", xlabel="Episode", ylabel="Score")
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

In [None]:
# torch.save(agent.model.state_dict(), "lunar_lander_ppo_model.pth")
# torch.save(agent.model_old.state_dict(), "lunar_lander_ppo_model_old.pth")

In [None]:
# Restore previously saved weights into the current agent's two networks.
# NOTE(review): these checkpoints are named for LunarLander, while the notebook
# builds its agent for CartPole-v1. load_state_dict() raises on mismatched layer
# shapes, so confirm the agent was constructed for the matching environment.
model_path="trained_models/ppo/lunar_lander_ppo_model.pth"
model_old_path="trained_models/ppo/lunar_lander_ppo_model_old.pth"

# map_location="cpu" lets a checkpoint that was saved on a GPU be read on a
# CPU-only machine; load_state_dict() then copies the values into the agent's
# existing parameters.
agent.model.load_state_dict(torch.load(model_path, map_location="cpu"))
agent.model_old.load_state_dict(torch.load(model_old_path, map_location="cpu"))

In [None]:
# Wrap the environment so episodes are recorded under ./vid; video_callable
# returns True for every episode id, so every episode is recorded.
env = gym.wrappers.Monitor(env, "./vid", video_callable=lambda episode_id: True,force=True)

# NOTE(review): the loop header had been corrupted by model-loading code pasted
# inside `range(1...)`. It is restored here as a single evaluation episode;
# raise this if more were intended.
n_eval_episodes = 1

try:
    for episode in range(n_eval_episodes):
        state = env.reset()
        score = 0
        input()  # block until Enter is pressed before the episode starts

        for t in range(700):
            action, log_prob = agent.act(torch.FloatTensor(state))
            next_state, reward, done, _ = env.step(action.item())
            env.render()

            score += reward

            if done:
                print(score)
                break

            state = next_state
finally:
    # Always close the monitored environment, even if an episode raises.
    env.close()