In [1]:
import gym
import time
from tqdm import tqdm
import random
import os

import numpy as np
import time
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.distributions import MultivariateNormal

Fix environment

In [2]:
seed = 666 # Do not change this
device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
def fix(env, seed):
    env.seed(seed)
    env.action_space.seed(seed)
    T.manual_seed(seed)
    T.cuda.manual_seed(seed)
    T.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
  #torch.set_deterministic(True)
    T.backends.cudnn.benchmark = False
    T.backends.cudnn.deterministic = True

Construct the (linear) network structure--

In [3]:
class FeedForwardNN(nn.Module):
    def __init__(self, in_dim, out_dim, name, chkpt_dir):
        super(FeedForwardNN, self).__init__()
    
        self.chkpt_file = os.path.join(chkpt_dir, name)
    
        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)
    
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
    
        self.to(self.device)
    
    def forward(self, obs):
        x = F.relu(self.layer1(obs))
        x = F.relu(self.layer2(x))
        output = self.layer3(x)
        return output

    def save_checkpoint(self):
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))

Create buffer

Construct our PPO class (which is also our agent) --

In [4]:
class PPO:
    def __init__(self, network, obs_dim, act_dim,lr = 0.005, gamma = 0.99, clip = 0.2, n_updates_per_iteration = 5, \
                max_timesteps_per_episode = 200, timesteps_per_batch = 2048, chkpt_dir = 'model/'):
        self.chkpt_dir = chkpt_dir
        
        self.timesteps_per_batch = timesteps_per_batch                 # Number of timesteps to run per batch
        self.max_timesteps_per_episode = max_timesteps_per_episode          # Max number of timesteps per episode
        self.n_updates_per_iteration = n_updates_per_iteration                # Number of times to update actor/critic per iteration
        self.lr = lr                                 # Learning rate of actor optimizer
        self.gamma = gamma                               # Discount factor to be applied when calculating Rewards-To-Go
        self.clip = clip                                 # Recommended 0.2, helps define the threshold to clip the ratio during SGA
        
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.actor = network(self.obs_dim, self.act_dim, 'ppo_actor.pth', self.chkpt_dir) 
        self.critic = network(self.obs_dim, 1, 'ppo_critic.pth', self.chkpt_dir)
        
        # Initialize optimizers for actor and critic
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
        
        # Initialize the covariance matrix used to query the actor for actions
        self.cov_var = T.full(size=(self.act_dim,), fill_value=0.5)
        self.cov_mat = T.diag(self.cov_var)
        
    def learn(self, batch_obs, batch_acts, batch_rews, batch_log_probs):
        
        device = self.actor.device
        
        batch_rtgs = self.compute_rtgs(batch_rews)
        batch_rtgs = T.tensor(np.array(batch_rtgs), dtype=T.float).to(device)
        batch_obs = T.tensor(np.array(batch_obs), dtype=T.float).to(device)
        batch_acts = T.tensor(np.array(batch_acts), dtype=T.float).to(device)
        batch_log_probs = T.tensor(np.array(batch_log_probs), dtype=T.float).to(device)
        
        # Calculate advantage at k-th iteration
        V, _ = self.evaluate(batch_obs, batch_acts)
        A_k = batch_rtgs - V.detach()          
        A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10) # Normalizing the advantage
        
        for _ in range(self.n_updates_per_iteration):
            V, curr_log_probs = self.evaluate(batch_obs, batch_acts)
            
            ratios = T.exp(curr_log_probs - batch_log_probs)
            
            # Calculate surrogate losses.
            surr1 = ratios * A_k
            surr2 = T.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k
            
            actor_loss = (-T.min(surr1, surr2)).mean()
            critic_loss = nn.MSELoss()(V, batch_rtgs)
            
            # Calculate gradients and perform backward propagation for actor network
            self.actor_optim.zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor_optim.step()
            
            # Calculate gradients and perform backward propagation for critic network
            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()    
            
    def compute_rtgs(self, batch_rews):
        batch_rtgs = []
        
        for epoch_rews in reversed(batch_rews):
            discounted_reward = 0
            for rew in reversed(epoch_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.insert(0, discounted_reward)
                
        
        return batch_rtgs
        
    def get_action(self, obs):
        state = T.tensor(obs, dtype=T.float).to(self.actor.device)
        mean = self.actor.forward(state)
        
        distr = MultivariateNormal(mean, self.cov_mat)
        action = distr.sample()
        log_prob = distr.log_prob(action)
        return action.detach().cpu().numpy(), log_prob.detach().cpu().numpy()
    
        
    def evaluate(self, batch_obs, batch_acts):
        # Estimate the values of each observation, and the log probs of
        # each action in the most recent batch with the most recent
        # iteration of the actor network. Should be called from learn.
        
        # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
        V = self.critic.forward(batch_obs).squeeze()
        
        # Calculate the log probabilities of batch actions using most recent actor network.
        mean = self.actor.forward(batch_obs)
        dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)

        # V - the predicted values of batch_obs
        # log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
        return V, log_probs
    
        

Define the function to collect the on-policy trajectories--

In [5]:
def rollout(policy, env, timesteps_per_batch, max_timesteps_per_episode):
    # Batch data.
    batch_obs = []
    batch_acts = []
    batch_log_probs = []
    batch_rews = []
    batch_lens = []
    
    epoch_rews = []
    t=0
    while t < timesteps_per_batch:
        epoch_rews = [] # rewards collected per episode
        obs = env.reset()
        done = False
        for h in range(max_timesteps_per_episode):
            batch_obs.append(obs)
            action, log_prob = policy.get_action(obs)
            obs, rew, done, _ = env.step(action)
            t +=1
            
            epoch_rews.append(rew)
            batch_acts.append(action)
            batch_log_probs.append(log_prob)
            
            if done:
                break
            
        batch_lens.append(h+1)
        batch_rews.append(epoch_rews)
    
    
    
    return batch_obs, batch_acts, batch_rews, batch_log_probs, batch_lens

In [6]:
hyperparameters = {
    'timesteps_per_batch': 2048, 
    'max_timesteps_per_episode': 200, 
    'gamma': 0.99, 
    'n_updates_per_iteration': 10,
    'lr': 3e-4, 
    'clip': 0.2,
    'seed': 666
    }

The training loop----

In [7]:
env = gym.make('Pendulum-v0')
fix(env, seed)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

model = PPO(network=FeedForwardNN, obs_dim = obs_dim, act_dim = act_dim)
model.actor.load_checkpoint()
model.critic.load_checkpoint()
model.actor.train()
model.critic.train()


FeedForwardNN(
  (layer1): Linear(in_features=3, out_features=64, bias=True)
  (layer2): Linear(in_features=64, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=1, bias=True)
)

Train--

In [8]:
ITERATIONS = 100
tmp = 0.0

prg_bar = tqdm(range(ITERATIONS))
for i in prg_bar:
    batch_obs, batch_acts, batch_rews, batch_log_probs, batch_lens = \
        rollout(model, env, hyperparameters['timesteps_per_batch'], hyperparameters['max_timesteps_per_episode'])

    avg_ep_lens = np.mean(batch_lens)
    avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in batch_rews])
    
    # Round decimal places for more aesthetic logging messages
    avg_ep_lens = str(round(avg_ep_lens, 2))
    avg_ep_rews_1 = str(round(avg_ep_rews, 2))

    print(f"-------------------- Iteration --------------------", flush=True)
    print(f"Average Episodic Length: {avg_ep_lens}", flush=True)
    print(f"Average Episodic Return: {avg_ep_rews_1}", flush=True)
    
    model.learn(batch_obs, batch_acts, batch_rews, batch_log_probs)
    if avg_ep_rews > tmp:
        model.actor.save_checkpoint()
        model.critic.save_checkpoint()
        print("Successfully save the model!",flush=True)
    tmp = avg_ep_rews
    

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -150.86


  1%|▊                                                                                 | 1/100 [00:01<02:58,  1.81s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -181.43


  2%|█▋                                                                                | 2/100 [00:03<02:56,  1.80s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -197.6


  3%|██▍                                                                               | 3/100 [00:05<03:01,  1.87s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -191.29


  4%|███▎                                                                              | 4/100 [00:07<03:02,  1.90s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -150.25


  5%|████                                                                              | 5/100 [00:09<02:58,  1.88s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -131.23


  6%|████▉                                                                             | 6/100 [00:11<02:54,  1.85s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -129.51


  7%|█████▋                                                                            | 7/100 [00:12<02:50,  1.83s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -130.54


  8%|██████▌                                                                           | 8/100 [00:14<02:51,  1.86s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -148.67


  9%|███████▍                                                                          | 9/100 [00:16<02:51,  1.89s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -162.22


 10%|████████                                                                         | 10/100 [00:18<02:50,  1.89s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -193.43


 11%|████████▉                                                                        | 11/100 [00:20<02:51,  1.92s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -140.24


 12%|█████████▋                                                                       | 12/100 [00:22<02:45,  1.88s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -131.89


 13%|██████████▌                                                                      | 13/100 [00:24<02:42,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -149.66


 14%|███████████▎                                                                     | 14/100 [00:26<02:46,  1.93s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -176.95


 15%|████████████▏                                                                    | 15/100 [00:28<02:45,  1.95s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -183.58


 16%|████████████▉                                                                    | 16/100 [00:30<02:45,  1.97s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -185.47


 17%|█████████████▊                                                                   | 17/100 [00:32<02:41,  1.95s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -119.24


 18%|██████████████▌                                                                  | 18/100 [00:34<02:35,  1.90s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -228.4


 19%|███████████████▍                                                                 | 19/100 [00:36<02:36,  1.93s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -169.62


 20%|████████████████▏                                                                | 20/100 [00:38<02:36,  1.96s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -181.09


 21%|█████████████████                                                                | 21/100 [00:40<02:36,  1.98s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -198.75


 22%|█████████████████▊                                                               | 22/100 [00:42<02:36,  2.01s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -188.86


 23%|██████████████████▋                                                              | 23/100 [00:44<02:33,  1.99s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -166.68


 24%|███████████████████▍                                                             | 24/100 [00:46<02:31,  1.99s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -203.69


 25%|████████████████████▎                                                            | 25/100 [00:48<02:26,  1.96s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -187.64


 26%|█████████████████████                                                            | 26/100 [00:50<02:26,  1.99s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -133.08


 27%|█████████████████████▊                                                           | 27/100 [00:51<02:21,  1.94s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -168.4


 28%|██████████████████████▋                                                          | 28/100 [00:53<02:17,  1.90s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -169.52


 29%|███████████████████████▍                                                         | 29/100 [00:55<02:14,  1.89s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -197.12


 30%|████████████████████████▎                                                        | 30/100 [00:57<02:11,  1.88s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -206.28


 31%|█████████████████████████                                                        | 31/100 [00:59<02:09,  1.87s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -185.16


 32%|█████████████████████████▉                                                       | 32/100 [01:01<02:07,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -191.45


 33%|██████████████████████████▋                                                      | 33/100 [01:03<02:04,  1.86s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -200.61


 34%|███████████████████████████▌                                                     | 34/100 [01:04<02:02,  1.85s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -99.21


 35%|████████████████████████████▎                                                    | 35/100 [01:06<02:01,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -142.38


 36%|█████████████████████████████▏                                                   | 36/100 [01:08<01:57,  1.84s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -172.29


 37%|█████████████████████████████▉                                                   | 37/100 [01:10<01:56,  1.86s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -130.78


 38%|██████████████████████████████▊                                                  | 38/100 [01:12<01:56,  1.88s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -109.51


 39%|███████████████████████████████▌                                                 | 39/100 [01:14<02:00,  1.97s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -201.99


 40%|████████████████████████████████▍                                                | 40/100 [01:16<01:58,  1.97s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -174.84


 41%|█████████████████████████████████▏                                               | 41/100 [01:18<01:53,  1.92s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -154.14


 42%|██████████████████████████████████                                               | 42/100 [01:20<01:49,  1.89s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -141.97


 43%|██████████████████████████████████▊                                              | 43/100 [01:21<01:46,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -141.6


 44%|███████████████████████████████████▋                                             | 44/100 [01:24<01:47,  1.93s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -196.22


 45%|████████████████████████████████████▍                                            | 45/100 [01:26<01:48,  1.97s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -151.37


 46%|█████████████████████████████████████▎                                           | 46/100 [01:28<01:46,  1.98s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -211.98


 47%|██████████████████████████████████████                                           | 47/100 [01:30<01:45,  1.99s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -155.03


 48%|██████████████████████████████████████▉                                          | 48/100 [01:32<01:44,  2.00s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -129.95


 49%|███████████████████████████████████████▋                                         | 49/100 [01:34<01:40,  1.96s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -193.5


 50%|████████████████████████████████████████▌                                        | 50/100 [01:35<01:34,  1.89s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -164.48


 51%|█████████████████████████████████████████▎                                       | 51/100 [01:37<01:31,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -129.24


 52%|██████████████████████████████████████████                                       | 52/100 [01:39<01:30,  1.88s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -163.19


 53%|██████████████████████████████████████████▉                                      | 53/100 [01:41<01:27,  1.86s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -128.19


 54%|███████████████████████████████████████████▋                                     | 54/100 [01:43<01:24,  1.84s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -195.32


 55%|████████████████████████████████████████████▌                                    | 55/100 [01:44<01:21,  1.82s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -125.86


 56%|█████████████████████████████████████████████▎                                   | 56/100 [01:46<01:19,  1.82s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -142.73


 57%|██████████████████████████████████████████████▏                                  | 57/100 [01:48<01:18,  1.83s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -98.42


 58%|██████████████████████████████████████████████▉                                  | 58/100 [01:50<01:16,  1.83s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -160.87


 59%|███████████████████████████████████████████████▊                                 | 59/100 [01:52<01:14,  1.83s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -120.88


 60%|████████████████████████████████████████████████▌                                | 60/100 [01:53<01:12,  1.82s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -120.66


 61%|█████████████████████████████████████████████████▍                               | 61/100 [01:56<01:16,  1.97s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -128.32


 62%|██████████████████████████████████████████████████▏                              | 62/100 [01:58<01:14,  1.96s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -160.93


 63%|███████████████████████████████████████████████████                              | 63/100 [02:00<01:14,  2.01s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -154.44


 64%|███████████████████████████████████████████████████▊                             | 64/100 [02:02<01:11,  1.98s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -162.66


 65%|████████████████████████████████████████████████████▋                            | 65/100 [02:04<01:11,  2.04s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -173.48


 66%|█████████████████████████████████████████████████████▍                           | 66/100 [02:06<01:10,  2.07s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -170.07


 67%|██████████████████████████████████████████████████████▎                          | 67/100 [02:08<01:08,  2.07s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -141.38


 68%|███████████████████████████████████████████████████████                          | 68/100 [02:10<01:05,  2.05s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -108.66


 69%|███████████████████████████████████████████████████████▉                         | 69/100 [02:12<01:02,  2.03s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -167.62


 70%|████████████████████████████████████████████████████████▋                        | 70/100 [02:14<01:00,  2.02s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -122.5


 71%|█████████████████████████████████████████████████████████▌                       | 71/100 [02:16<00:58,  2.03s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -139.41


 72%|██████████████████████████████████████████████████████████▎                      | 72/100 [02:18<00:58,  2.08s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -183.37


 73%|███████████████████████████████████████████████████████████▏                     | 73/100 [02:20<00:54,  2.02s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -153.85


 74%|███████████████████████████████████████████████████████████▉                     | 74/100 [02:22<00:50,  1.96s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -155.25


 75%|████████████████████████████████████████████████████████████▊                    | 75/100 [02:24<00:46,  1.87s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -131.81


 76%|█████████████████████████████████████████████████████████████▌                   | 76/100 [02:26<00:44,  1.87s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -146.84


 77%|██████████████████████████████████████████████████████████████▎                  | 77/100 [02:27<00:42,  1.85s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -167.11


 78%|███████████████████████████████████████████████████████████████▏                 | 78/100 [02:29<00:41,  1.88s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -130.16


 79%|███████████████████████████████████████████████████████████████▉                 | 79/100 [02:31<00:39,  1.89s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -150.41


 80%|████████████████████████████████████████████████████████████████▊                | 80/100 [02:33<00:39,  1.95s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -198.07


 81%|█████████████████████████████████████████████████████████████████▌               | 81/100 [02:35<00:37,  1.97s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -99.81


 82%|██████████████████████████████████████████████████████████████████▍              | 82/100 [02:37<00:34,  1.94s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -161.4


 83%|███████████████████████████████████████████████████████████████████▏             | 83/100 [02:39<00:32,  1.92s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -152.06


 84%|████████████████████████████████████████████████████████████████████             | 84/100 [02:41<00:30,  1.93s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -174.83


 85%|████████████████████████████████████████████████████████████████████▊            | 85/100 [02:43<00:29,  1.95s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -175.87


 86%|█████████████████████████████████████████████████████████████████████▋           | 86/100 [02:45<00:27,  1.95s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -194.76


 87%|██████████████████████████████████████████████████████████████████████▍          | 87/100 [02:47<00:25,  1.96s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -193.3


 88%|███████████████████████████████████████████████████████████████████████▎         | 88/100 [02:49<00:23,  1.93s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -177.65


 89%|████████████████████████████████████████████████████████████████████████         | 89/100 [02:51<00:21,  1.96s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -155.28


 90%|████████████████████████████████████████████████████████████████████████▉        | 90/100 [02:53<00:19,  1.99s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -190.77


 91%|█████████████████████████████████████████████████████████████████████████▋       | 91/100 [02:55<00:17,  1.99s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -131.9


 92%|██████████████████████████████████████████████████████████████████████████▌      | 92/100 [02:57<00:15,  1.92s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -205.07


 93%|███████████████████████████████████████████████████████████████████████████▎     | 93/100 [02:59<00:13,  1.89s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -150.97


 94%|████████████████████████████████████████████████████████████████████████████▏    | 94/100 [03:01<00:11,  1.93s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -132.82


 95%|████████████████████████████████████████████████████████████████████████████▉    | 95/100 [03:02<00:09,  1.91s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -139.37


 96%|█████████████████████████████████████████████████████████████████████████████▊   | 96/100 [03:04<00:07,  1.93s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -174.22


 97%|██████████████████████████████████████████████████████████████████████████████▌  | 97/100 [03:06<00:05,  1.90s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -180.63


 98%|███████████████████████████████████████████████████████████████████████████████▍ | 98/100 [03:08<00:03,  1.87s/it]


-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -167.67


 99%|████████████████████████████████████████████████████████████████████████████████▏| 99/100 [03:10<00:01,  1.91s/it]

Successfully save the model!

-------------------- Iteration --------------------
Average Episodic Length: 200.0
Average Episodic Return: -154.52


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:12<00:00,  1.92s/it]

Successfully save the model!





Test----

In [9]:
env = gym.make('Pendulum-v0')
fix(env, seed)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

model = PPO(network=FeedForwardNN, obs_dim = obs_dim, act_dim = act_dim)
model.actor.load_checkpoint()
model.critic.load_checkpoint()
model.actor.eval()  # turn network to evaluation mode
model.critic.eval()
NUM_OF_TEST = 5 # Do not revise it !!!!!
test_total_reward = []
action_list = []
for i in range(NUM_OF_TEST):
    actions = []
    obs = env.reset()

  #img = plt.imshow(env.render(mode='rgb_array'))
    env.render()
  
    total_reward = 0

    done = False
    while not done:
        action, _ = model.get_action(obs)
        actions.append(action)
        obs, reward, done, _ = env.step(action)

        total_reward += reward

      #img.set_data(env.render(mode='rgb_array'))
        env.render()
      #display.display(plt.gcf())
      #display.clear_output(wait=True)
    print(total_reward)
    test_total_reward.append(total_reward)


env.close()

print(f"Your final reward is : %.2f"%np.mean(test_total_reward))

-120.21181737412071
-125.41116676883169
-243.96041123861386
-338.05871844153194
-242.14872698933092
Your final reward is : -213.96
