# Proximal Policy Optimization (PPO) Tutorial

## 1. Environment Preparation




### 1.1 Download Packages for BipedalWalker-v3

In [30]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install colabgymrender==1.0.2
!pip install swig
!pip install box2d-py
!pip install 'gym[Box2D]'



### 1.2 Mount Drive and Set Project Path

In [31]:
from google.colab import drive
drive.mount('/content/drive')

import sys
project_root = '/content/drive/My Drive/ppo_tutorial/'
sys.path.append(project_root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.3 Test the BipedalWalker-v3 Environment

In [32]:
import os
import gym
import torch
import torch.nn as nn
import numpy as np
from colabgymrender.recorder import Recorder

env = gym.make('BipedalWalker-v3')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
print(s_dim)
print(a_dim)

  deprecation(

  deprecation(



24
4


## 2. Policy Network & Value Network Construction

### 2.1 Diagonal Gaussian Distribution Module

In [33]:
#AddBias module
class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))

    def forward(self, x):
        bias = self._bias.t().view(1, -1)
        return x + bias

#Gaussian distribution with given mean & std.
class FixedNormal(torch.distributions.Normal):
    def log_probs(self, x):
        return super().log_prob(x).sum(-1)

    def entropy(self):
        return super().entropy().sum(-1)

    def mode(self):
        return self.mean

#Diagonal Gaussian module
class DiagGaussian(nn.Module):
    def __init__(self, inp_dim, out_dim):
        super(DiagGaussian, self).__init__()
        self.fc_mean = nn.Linear(inp_dim, out_dim)
        self.b_logstd = AddBias(torch.zeros(out_dim))

    def forward(self, x):
        mean = self.fc_mean(x)
        logstd = self.b_logstd(torch.zeros_like(mean))
        return FixedNormal(mean, logstd.exp())

### 2.2 Policy Network & Value Network Module

In [34]:
#Policy Network
class PolicyNet(nn.Module):
    #Constructor
    def __init__(self, s_dim, a_dim):
        super(PolicyNet, self).__init__()
        #TODO(Lab-1): Policy Network Architecture
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU()
        )
        self.dist = DiagGaussian(128, a_dim)

    #Forward pass
    def forward(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            action = dist.mode()
        else:
            action = dist.sample()

        return action, dist.log_probs(action)

    #Choose an action (stochastically or deterministically)
    def choose_action(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            return dist.mode()

        return dist.sample()

    #Evaluate a state-action pair (output log-prob. & entropy)
    def evaluate(self, state, action):
        feature = self.main(state)
        dist = self.dist(feature)
        return dist.log_probs(action), dist.entropy()

#Value Network
class ValueNet(nn.Module):
    #Constructor
    def __init__(self, s_dim):
        super(ValueNet, self).__init__()
        #TODO(Lab-2): Value Network Architecture
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    #Forward pass
    def forward(self, state):
        return self.main(state)[:, 0]

### 2.3 Create Policy Network & Value Network

In [35]:
policy_net = PolicyNet(s_dim, a_dim)
value_net = ValueNet(s_dim)
print(policy_net)
print(value_net)

PolicyNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
  )
  (dist): DiagGaussian(
    (fc_mean): Linear(in_features=128, out_features=4, bias=True)
    (b_logstd): AddBias()
  )
)
ValueNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


## 3. Environment Runner Construction

### 3.1 EnvRunner Class

In [36]:
class EnvRunner:
    #Constructor
    def __init__(self, s_dim, a_dim, gamma=0.99, lamb=0.95, max_step=2048, device='cpu'):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.gamma = gamma
        self.lamb = lamb
        self.max_step = max_step
        self.device = device

        #Storages (state, action, value, reward, a_logp)
        self.mb_states = np.zeros((self.max_step, self.s_dim), dtype=np.float32)
        self.mb_actions = np.zeros((self.max_step, self.a_dim), dtype=np.float32)
        self.mb_values = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_rewards = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_a_logps = np.zeros((self.max_step,), dtype=np.float32)

    #Compute discounted return
    def compute_discounted_return(self, rewards, last_value):
        #TODO(Lab-3): Compute discounted return
        returns = np.zeros_like(rewards)
        n_step = len(rewards)

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                returns[t] = rewards[t] + self.gamma * last_value
            else:
                returns[t] = rewards[t] + self.gamma * returns[t+1]

        return returns

    #Compute generalized advantage estimation (Optional)
    def compute_gae(self, rewards, values, last_value):
        advs = np.zeros_like(rewards)
        n_step = len(rewards)
        last_gae_lam = 0.0

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                next_value = last_value
            else:
                next_value = values[t+1]

            delta = rewards[t] + self.gamma*next_value - values[t]
            advs[t] = last_gae_lam = delta + self.gamma*self.lamb*last_gae_lam

        return advs + values

    #Run an episode using the policy net & value net
    def run(self, env, policy_net, value_net):
        #TODO(Lab-4): Run an episode to collect data
        state = env.reset()   #Initial state
        episode_len = self.max_step

        for step in range(self.max_step):
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
            action, a_logp = policy_net(state_tensor)
            value = value_net(state_tensor)

            action = action.cpu().numpy()[0]
            a_logp = a_logp.cpu().numpy()
            value  = value.cpu().numpy()

            self.mb_states[step] = state
            self.mb_actions[step] = action
            self.mb_a_logps[step] = a_logp
            self.mb_values[step] = value

            state, reward, done, info = env.step(action)
            self.mb_rewards[step] = reward

            if done:
                episode_len = step + 1
                break
        #Compute returns
        last_value = value_net(
            torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
        ).cpu().numpy()

        mb_returns = self.compute_discounted_return(self.mb_rewards[:episode_len], last_value)
        '''
        mb_returns = self.compute_gae(
            self.mb_rewards[:episode_len],
            self.mb_values[:episode_len],
            last_value
        )
        '''
        return self.mb_states[:episode_len], \
                self.mb_actions[:episode_len], \
                self.mb_a_logps[:episode_len], \
                self.mb_values[:episode_len], \
                mb_returns, \
                self.mb_rewards[:episode_len]

### 3.2 Create EnvRunner

In [37]:
runner = EnvRunner(s_dim, a_dim)

## 4. PPO Algorithm

### 4.1 PPO Class

In [38]:
class PPO:
    #Constructor
    def __init__(self, policy_net, value_net, lr=1e-4, max_grad_norm=0.5, ent_weight=0.01, clip_val=0.2, sample_n_epoch=4, sample_mb_size=64, device='cpu'):
        self.policy_net = policy_net
        self.value_net = value_net
        self.max_grad_norm = max_grad_norm
        self.ent_weight = ent_weight
        self.clip_val = clip_val
        self.sample_n_epoch = sample_n_epoch
        self.sample_mb_size = sample_mb_size
        self.device = device
        self.opt_polcy = torch.optim.Adam(policy_net.parameters(), lr)
        self.opt_value = torch.optim.Adam(value_net.parameters(), lr)

    #Train the policy net & value net using PPO
    def train(self, mb_states, mb_actions, mb_old_values, mb_advs, mb_returns, mb_old_a_logps):
        #Convert numpy array to tensor
        mb_states = torch.from_numpy(mb_states).to(self.device)
        mb_actions = torch.from_numpy(mb_actions).to(self.device)
        mb_old_values = torch.from_numpy(mb_old_values).to(self.device)
        mb_advs = torch.from_numpy(mb_advs).to(self.device)
        mb_returns = torch.from_numpy(mb_returns).to(self.device)
        mb_old_a_logps = torch.from_numpy(mb_old_a_logps).to(self.device)
        episode_length = len(mb_states)
        rand_idx = np.arange(episode_length)
        sample_n_mb = episode_length // self.sample_mb_size

        if sample_n_mb <= 0:
            sample_mb_size = episode_length
            sample_n_mb = 1
        else:
            sample_mb_size = self.sample_mb_size

        for i in range(self.sample_n_epoch):
            np.random.shuffle(rand_idx)

            for j in range(sample_n_mb):
                #Randomly sample a batch for training
                sample_idx = rand_idx[j*sample_mb_size : (j+1)*sample_mb_size]
                sample_states = mb_states[sample_idx]
                sample_actions = mb_actions[sample_idx]
                sample_old_values = mb_old_values[sample_idx]
                sample_advs = mb_advs[sample_idx]
                sample_returns = mb_returns[sample_idx]
                sample_old_a_logps = mb_old_a_logps[sample_idx]

                sample_a_logps, sample_ents = self.policy_net.evaluate(sample_states, sample_actions)
                sample_values = self.value_net(sample_states)
                ent = sample_ents.mean()

                #TODO(Lab-5): Compute value loss & policy gradient loss
                v_pred_clip = sample_old_values + torch.clamp(sample_values - sample_old_values, -self.clip_val, self.clip_val)
                v_loss1 = (sample_returns - sample_values).pow(2)
                v_loss2 = (sample_returns - v_pred_clip).pow(2)
                v_loss = torch.max(v_loss1, v_loss2).mean()


                ratio = (sample_a_logps - sample_old_a_logps).exp()
                pg_loss1 = -sample_advs * ratio
                pg_loss2 = -sample_advs * torch.clamp(ratio, 1.0-self.clip_val, 1.0+self.clip_val)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean() - self.ent_weight*ent

                #Train actor
                self.opt_polcy.zero_grad()
                pg_loss.backward()
                nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_grad_norm)
                self.opt_polcy.step()

                #Train critic
                self.opt_value.zero_grad()
                v_loss.backward()
                nn.utils.clip_grad_norm_(self.value_net.parameters(), self.max_grad_norm)
                self.opt_value.step()

        return pg_loss.item(), v_loss.item(), ent.item()

### 4.2 Create PPO Agent

In [39]:
agent = PPO(policy_net, value_net)

## 5. Training and Testing Process

### 5.1 Play an Episode for Evaluation

In [40]:
def play(policy_net):
    render_env = Recorder(gym.make('BipedalWalker-v3'), project_root + 'video')

    with torch.no_grad():
        #TODO(Lab-6): Play an episode and evaluate the performance
        state = render_env.reset()
        total_reward = 0
        length = 0

        while True:
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device='cpu')
            action = policy_net.choose_action(state_tensor, deterministic=True).cpu().numpy()
            state, reward, done, info = render_env.step(action[0])
            total_reward += reward
            length += 1

            if done:
                print("[Evaluation] Total reward = {:.6f}, length = {:d}".format(total_reward, length), flush=True)
                break

    render_env.play()
    render_env.close()

In [41]:
play(policy_net)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = -93.606786, length = 110
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




### 5.2 Train the Networks using PPO

In [42]:
def train(env, runner, policy_net, value_net, agent, max_episode=5000):
    mean_total_reward = 0
    mean_length = 0
    save_dir = project_root + '/save'

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    for i in range(max_episode):
        #TODO(Lab-7): Run an episode to collect data and then train the model
        with torch.no_grad():
            mb_states, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        pg_loss, v_loss, ent = agent.train(mb_states, mb_actions, mb_values, mb_advs, mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_states)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(i, mb_rewards.sum(), len(mb_states)))
        #Show the current result & save the model
        if i % 200 == 0:
            print("\n[{:5d} / {:5d}]".format(i, max_episode))
            print("----------------------------------")
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / 200))
            print("mean length = {:.2f}".format(mean_length / 200))
            print("\nSaving the model ... ", end="")
            torch.save({
                "it": i,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "model.pt"))
            print("Done.")
            print()
            play(policy_net)
            mean_total_reward = 0
            mean_length = 0

  and should_run_async(code)



In [43]:
train(env, runner, policy_net, value_net, agent)
env.close()

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode    0] total reward = -103.598045, length = 72

[    0 /  5000]
----------------------------------
actor loss = -0.074274
critic loss = 5421.066895
entropy = 5.676342
mean return = -0.517990
mean length = 0.36

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = -93.455244, length = 111
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode    1] total reward = -118.160950, length = 63
[Episode    2] total reward = -117.706200, length = 72
[Episode    3] total reward = -116.483932, length = 77
[Episode    4] total reward = -117.556396, length = 1600
[Episode    5] total reward = -122.286667, length = 89
[Episode    6] total reward = -99.642181, length = 81
[Episode    7] total reward = -120.423820, length = 1600
[Episode    8] total reward = -100.709152, length = 84
[Episode    9] total reward = -131.500320, length = 104
[Episode   10] total reward = -103.816338, length = 46
[Episode   11] total reward = -126.600876, length = 1600
[Episode   12] total reward = -104.180794, length = 43
[Episode   13] total reward = -103.128967, length = 75
[Episode   14] total reward = -126.965851, length = 1600
[Episode   15] total reward = -115.601425, length = 1600
[Episode   16] total reward = -124.467926, length = 1600
[Episode   17] total reward = -119.188805, length = 114


  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode  200] total reward = -25.257645, length = 1600

[  200 /  5000]
----------------------------------
actor loss = -0.016049
critic loss = 8.293098
entropy = 5.622211
mean return = -100.947085
mean length = 644.03

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = -76.798800, length = 1600
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode  201] total reward = -125.333191, length = 650
[Episode  202] total reward = -39.089767, length = 1600
[Episode  203] total reward = -120.397858, length = 99
[Episode  204] total reward = -32.595627, length = 1600
[Episode  205] total reward = -109.428932, length = 45
[Episode  206] total reward = -120.337280, length = 130
[Episode  207] total reward = -27.939465, length = 1600
[Episode  208] total reward = -109.580505, length = 38
[Episode  209] total reward = -21.250378, length = 1600
[Episode  210] total reward = -17.750156, length = 1600
[Episode  211] total reward = -118.095222, length = 85
[Episode  212] total reward = -12.181019, length = 1600
[Episode  213] total reward = -108.926231, length = 87
[Episode  214] total reward = -100.395905, length = 262
[Episode  215] total reward = -21.908672, length = 1600
[Episode  216] total reward = -5.949561, length = 1600
[Episode  217] total reward = -111.452507, length = 223
[E

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode  400] total reward = 77.420334, length = 1600

[  400 /  5000]
----------------------------------
actor loss = -0.063462
critic loss = 29.572426
entropy = 5.644776
mean return = -37.887783
mean length = 860.49

Saving the model ... Done.

[Evaluation] Total reward = 192.666888, length = 1600
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode  401] total reward = 58.909210, length = 1600
[Episode  402] total reward = 130.409424, length = 1600
[Episode  403] total reward = 100.050339, length = 1600
[Episode  404] total reward = 122.665848, length = 1600
[Episode  405] total reward = 146.802017, length = 1600
[Episode  406] total reward = -55.245609, length = 709
[Episode  407] total reward = 138.501450, length = 1600
[Episode  408] total reward = -126.349823, length = 135
[Episode  409] total reward = -105.190079, length = 95
[Episode  410] total reward = 100.449921, length = 1600
[Episode  411] total reward = 131.164627, length = 1600
[Episode  412] total reward = 140.767899, length = 1600
[Episode  413] total reward = 141.908997, length = 1600
[Episode  414] total reward = 121.887390, length = 1600
[Episode  415] total reward = 153.096817, length = 1600
[Episode  416] total reward = 142.639740, length = 1600
[Episode  417] total reward = -101.527473, length = 145

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode  600] total reward = 153.274353, length = 1600

[  600 /  5000]
----------------------------------
actor loss = -0.132086
critic loss = 30.292105
entropy = 5.765182
mean return = -10.147544
mean length = 805.94

Saving the model ... Done.

[Evaluation] Total reward = 189.284608, length = 1600
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode  601] total reward = 141.780609, length = 1600
[Episode  602] total reward = 150.629852, length = 1600
[Episode  603] total reward = 144.914398, length = 1600
[Episode  604] total reward = 137.229523, length = 1600
[Episode  605] total reward = 125.154366, length = 1600
[Episode  606] total reward = -120.436142, length = 200
[Episode  607] total reward = 115.069855, length = 1600
[Episode  608] total reward = 135.380676, length = 1600
[Episode  609] total reward = -112.417763, length = 83
[Episode  610] total reward = 127.746475, length = 1600
[Episode  611] total reward = 142.251572, length = 1600
[Episode  612] total reward = 129.558960, length = 1600
[Episode  613] total reward = 121.286392, length = 1600
[Episode  614] total reward = 141.220642, length = 1600
[Episode  615] total reward = 154.339630, length = 1600
[Episode  616] total reward = 148.402527, length = 1600
[Episode  617] total reward = 148.647980, length = 16

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode  800] total reward = 141.748688, length = 1600

[  800 /  5000]
----------------------------------
actor loss = -0.111813
critic loss = 20.239407
entropy = 5.798512
mean return = 30.850663
mean length = 978.62

Saving the model ... Done.

[Evaluation] Total reward = 197.187477, length = 1600
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode  801] total reward = -106.467674, length = 57
[Episode  802] total reward = 19.528160, length = 1153
[Episode  803] total reward = -42.808117, length = 838
[Episode  804] total reward = 165.722565, length = 1600
[Episode  805] total reward = 153.527893, length = 1600
[Episode  806] total reward = -89.132629, length = 263
[Episode  807] total reward = -131.212784, length = 144
[Episode  808] total reward = 138.067413, length = 1600
[Episode  809] total reward = -107.160500, length = 71
[Episode  810] total reward = 150.875793, length = 1600
[Episode  811] total reward = -101.348686, length = 98
[Episode  812] total reward = 156.750610, length = 1600
[Episode  813] total reward = 27.713120, length = 1253
[Episode  814] total reward = -107.962509, length = 69
[Episode  815] total reward = 156.765717, length = 1600
[Episode  816] total reward = 15.723949, length = 1333
[Episode  817] total reward = 16.988693, length = 1444
[Episo

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 1000] total reward = -18.277428, length = 948

[ 1000 /  5000]
----------------------------------
actor loss = -0.024497
critic loss = 12468.790039
entropy = 5.917737
mean return = 5.823344
mean length = 872.77

Saving the model ... Done.

[Evaluation] Total reward = -32.084762, length = 913
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 1001] total reward = 163.530014, length = 1600
[Episode 1002] total reward = 125.220215, length = 1600
[Episode 1003] total reward = 131.115723, length = 1600
[Episode 1004] total reward = -134.189636, length = 265
[Episode 1005] total reward = 31.003716, length = 1460
[Episode 1006] total reward = 148.536545, length = 1600
[Episode 1007] total reward = -31.082115, length = 924
[Episode 1008] total reward = 137.183075, length = 1600
[Episode 1009] total reward = -136.900818, length = 175
[Episode 1010] total reward = 163.861755, length = 1600
[Episode 1011] total reward = -104.299026, length = 658
[Episode 1012] total reward = -126.086586, length = 235
[Episode 1013] total reward = -94.579277, length = 143
[Episode 1014] total reward = -113.583946, length = 99
[Episode 1015] total reward = -104.569817, length = 150
[Episode 1016] total reward = -132.697144, length = 88
[Episode 1017] total reward = -130.205261, length = 89
[E

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 1200] total reward = 23.140205, length = 1266

[ 1200 /  5000]
----------------------------------
actor loss = -0.344435
critic loss = 1979.959961
entropy = 6.028644
mean return = 38.687748
mean length = 1019.52

Saving the model ... Done.

[Evaluation] Total reward = -55.721525, length = 424
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 1201] total reward = 15.679482, length = 1174
[Episode 1202] total reward = -57.081226, length = 471
[Episode 1203] total reward = 158.164124, length = 1600
[Episode 1204] total reward = 176.469482, length = 1600
[Episode 1205] total reward = -146.401627, length = 358
[Episode 1206] total reward = -58.172119, length = 441
[Episode 1207] total reward = 149.502762, length = 1600
[Episode 1208] total reward = -68.688667, length = 633
[Episode 1209] total reward = -128.367752, length = 181
[Episode 1210] total reward = -51.662556, length = 726
[Episode 1211] total reward = -122.284492, length = 87
[Episode 1212] total reward = 61.105316, length = 1528
[Episode 1213] total reward = -108.807259, length = 48
[Episode 1214] total reward = -122.830688, length = 196
[Episode 1215] total reward = -129.306183, length = 135
[Episode 1216] total reward = -130.475204, length = 245
[Episode 1217] total reward = -116.173599, length = 83
[Epis

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 1400] total reward = -0.142262, length = 1075

[ 1400 /  5000]
----------------------------------
actor loss = -0.111235
critic loss = 5959.371582
entropy = 6.145634
mean return = 18.032604
mean length = 930.50

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 226.782592, length = 1586
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 1401] total reward = -104.311272, length = 128
[Episode 1402] total reward = 170.938171, length = 1600
[Episode 1403] total reward = 11.004482, length = 1048
[Episode 1404] total reward = -107.503098, length = 100
[Episode 1405] total reward = 170.415176, length = 1600
[Episode 1406] total reward = 16.215473, length = 1191
[Episode 1407] total reward = 159.132401, length = 1600
[Episode 1408] total reward = -120.394203, length = 85
[Episode 1409] total reward = -104.562057, length = 72
[Episode 1410] total reward = 165.470627, length = 1600
[Episode 1411] total reward = 139.434586, length = 1600
[Episode 1412] total reward = 158.361450, length = 1600
[Episode 1413] total reward = 182.560211, length = 1600
[Episode 1414] total reward = 146.571442, length = 1600
[Episode 1415] total reward = 49.486977, length = 1347
[Episode 1416] total reward = -27.068245, length = 717
[Episode 1417] total reward = 166.096252, length = 1600
[E

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 1600] total reward = -34.758369, length = 602

[ 1600 /  5000]
----------------------------------
actor loss = -0.053839
critic loss = 27931.859375
entropy = 6.175501
mean return = 55.660912
mean length = 1076.31

Saving the model ... Done.

[Evaluation] Total reward = -9.433673, length = 845
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 1601] total reward = -99.099968, length = 173
[Episode 1602] total reward = 185.290909, length = 1600
[Episode 1603] total reward = -92.392120, length = 174
[Episode 1604] total reward = 179.027985, length = 1600
[Episode 1605] total reward = 185.516388, length = 1600
[Episode 1606] total reward = -110.101028, length = 90
[Episode 1607] total reward = -6.811958, length = 830
[Episode 1608] total reward = 60.318420, length = 1331
[Episode 1609] total reward = 1.149681, length = 904
[Episode 1610] total reward = 190.239502, length = 1600
[Episode 1611] total reward = 184.326508, length = 1600
[Episode 1612] total reward = -89.938942, length = 277
[Episode 1613] total reward = 194.948242, length = 1600
[Episode 1614] total reward = 182.582611, length = 1600
[Episode 1615] total reward = -102.597046, length = 135
[Episode 1616] total reward = 17.330734, length = 991
[Episode 1617] total reward = 186.320801, length = 1600
[Episode

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 1800] total reward = 187.900696, length = 1600

[ 1800 /  5000]
----------------------------------
actor loss = 0.034627
critic loss = 0.413292
entropy = 6.311312
mean return = 91.616568
mean length = 1228.12

Saving the model ... Done.

[Evaluation] Total reward = -93.298714, length = 209
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 1801] total reward = 180.936401, length = 1600
[Episode 1802] total reward = 171.432999, length = 1600
[Episode 1803] total reward = 80.931831, length = 1547
[Episode 1804] total reward = 171.959015, length = 1600
[Episode 1805] total reward = 14.207771, length = 1180
[Episode 1806] total reward = 46.578823, length = 1330
[Episode 1807] total reward = 40.031067, length = 1210
[Episode 1808] total reward = 186.605957, length = 1600
[Episode 1809] total reward = 184.757065, length = 1600
[Episode 1810] total reward = 189.952805, length = 1600
[Episode 1811] total reward = 20.188107, length = 1126
[Episode 1812] total reward = -76.611908, length = 386
[Episode 1813] total reward = 194.841095, length = 1600
[Episode 1814] total reward = -102.048996, length = 134
[Episode 1815] total reward = 183.150711, length = 1600
[Episode 1816] total reward = -105.097443, length = 191
[Episode 1817] total reward = 47.420227, length = 1428
[Ep

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 2000] total reward = 162.737717, length = 1600

[ 2000 /  5000]
----------------------------------
actor loss = -0.036674
critic loss = 904.472717
entropy = 6.498082
mean return = 94.822649
mean length = 1248.36

Saving the model ... Done.

[Evaluation] Total reward = 216.419529, length = 1574
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 2001] total reward = 47.804840, length = 1468
[Episode 2002] total reward = 188.897217, length = 1600
[Episode 2003] total reward = -123.513985, length = 101
[Episode 2004] total reward = 43.148384, length = 1257
[Episode 2005] total reward = 2.216396, length = 932
[Episode 2006] total reward = 160.497406, length = 1600
[Episode 2007] total reward = 180.671143, length = 1600
[Episode 2008] total reward = 18.355522, length = 1089
[Episode 2009] total reward = 192.114304, length = 1600
[Episode 2010] total reward = -95.114342, length = 153
[Episode 2011] total reward = 184.487000, length = 1600
[Episode 2012] total reward = 188.240112, length = 1600
[Episode 2013] total reward = 179.279419, length = 1600
[Episode 2014] total reward = 173.366028, length = 1600
[Episode 2015] total reward = 180.791901, length = 1600
[Episode 2016] total reward = 185.932983, length = 1600
[Episode 2017] total reward = -37.307583, length = 650
[Epi

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 2200] total reward = 173.567200, length = 1600

[ 2200 /  5000]
----------------------------------
actor loss = 0.089825
critic loss = 8.854609
entropy = 6.685821
mean return = 112.676045
mean length = 1290.85

Saving the model ... Done.

[Evaluation] Total reward = -44.529887, length = 466
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 2201] total reward = 197.483490, length = 1600
[Episode 2202] total reward = 167.405334, length = 1600
[Episode 2203] total reward = 200.713989, length = 1600
[Episode 2204] total reward = 187.029724, length = 1600
[Episode 2205] total reward = 202.383255, length = 1600
[Episode 2206] total reward = 207.113312, length = 1580
[Episode 2207] total reward = 20.246731, length = 1045
[Episode 2208] total reward = 201.716812, length = 1600
[Episode 2209] total reward = 44.320511, length = 1323
[Episode 2210] total reward = -59.430878, length = 422
[Episode 2211] total reward = 196.477707, length = 1600
[Episode 2212] total reward = 193.517471, length = 1600
[Episode 2213] total reward = 201.780853, length = 1600
[Episode 2214] total reward = -3.374687, length = 798
[Episode 2215] total reward = 207.712280, length = 1562
[Episode 2216] total reward = 198.733063, length = 1600
[Episode 2217] total reward = 197.255814, length = 1600
[

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 2400] total reward = 195.377014, length = 1600

[ 2400 /  5000]
----------------------------------
actor loss = -0.170729
critic loss = 68.026337
entropy = 6.992589
mean return = 91.959275
mean length = 1201.54

Saving the model ... Done.

[Evaluation] Total reward = 208.891284, length = 1487
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 2401] total reward = -115.395081, length = 70
[Episode 2402] total reward = -105.633827, length = 139
[Episode 2403] total reward = 139.114716, length = 1600
[Episode 2404] total reward = 177.639130, length = 1600
[Episode 2405] total reward = 202.352295, length = 1600
[Episode 2406] total reward = 186.682785, length = 1600
[Episode 2407] total reward = 206.335144, length = 1572
[Episode 2408] total reward = 197.917755, length = 1600
[Episode 2409] total reward = 204.345413, length = 1584
[Episode 2410] total reward = 203.099091, length = 1600
[Episode 2411] total reward = 174.728058, length = 1600
[Episode 2412] total reward = 195.676575, length = 1600
[Episode 2413] total reward = 197.584106, length = 1600
[Episode 2414] total reward = 185.777527, length = 1600
[Episode 2415] total reward = -104.276779, length = 113
[Episode 2416] total reward = 186.053528, length = 1600
[Episode 2417] total reward = 57.975105, length = 147

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 2600] total reward = 202.866089, length = 1564

[ 2600 /  5000]
----------------------------------
actor loss = -0.042558
critic loss = 3.273355
entropy = 7.183928
mean return = 112.022149
mean length = 1244.73

Saving the model ... Done.

[Evaluation] Total reward = 205.055504, length = 1531
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 2601] total reward = -38.108444, length = 759
[Episode 2602] total reward = 70.132935, length = 1457
[Episode 2603] total reward = 201.777618, length = 1574
[Episode 2604] total reward = -73.534668, length = 333
[Episode 2605] total reward = -107.090836, length = 218
[Episode 2606] total reward = 194.684784, length = 1600
[Episode 2607] total reward = 169.997574, length = 1600
[Episode 2608] total reward = 195.917572, length = 1600
[Episode 2609] total reward = 190.251495, length = 1600
[Episode 2610] total reward = 204.460052, length = 1546
[Episode 2611] total reward = 164.770660, length = 1600
[Episode 2612] total reward = 207.171570, length = 1529
[Episode 2613] total reward = 198.679718, length = 1600
[Episode 2614] total reward = 203.734360, length = 1545
[Episode 2615] total reward = 203.831665, length = 1556
[Episode 2616] total reward = 209.112671, length = 1510
[Episode 2617] total reward = 188.225006, length = 1600

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 2800] total reward = 203.876968, length = 1499

[ 2800 /  5000]
----------------------------------
actor loss = 0.001281
critic loss = 4.081909
entropy = 7.452927
mean return = 115.108210
mean length = 1223.99

Saving the model ... Done.

[Evaluation] Total reward = 204.005602, length = 1493
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 2801] total reward = 168.946564, length = 1600
[Episode 2802] total reward = 171.440170, length = 1600
[Episode 2803] total reward = -118.028458, length = 68
[Episode 2804] total reward = 198.808899, length = 1575
[Episode 2805] total reward = 196.966309, length = 1573
[Episode 2806] total reward = 198.440643, length = 1559
[Episode 2807] total reward = 35.406101, length = 1154
[Episode 2808] total reward = 155.469315, length = 1600
[Episode 2809] total reward = 203.421539, length = 1526
[Episode 2810] total reward = 196.621399, length = 1583
[Episode 2811] total reward = 192.194397, length = 1600
[Episode 2812] total reward = 202.557831, length = 1535
[Episode 2813] total reward = 201.101990, length = 1533
[Episode 2814] total reward = 197.955154, length = 1569
[Episode 2815] total reward = 200.369629, length = 1566
[Episode 2816] total reward = 192.518768, length = 1600
[Episode 2817] total reward = 188.186707, length = 160

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 3000] total reward = 63.243465, length = 1453

[ 3000 /  5000]
----------------------------------
actor loss = -0.131905
critic loss = 96335.601562
entropy = 7.756992
mean return = 124.122156
mean length = 1286.40

Saving the model ... Done.

[Evaluation] Total reward = -106.154819, length = 109
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 3001] total reward = -10.240917, length = 775
[Episode 3002] total reward = 196.498260, length = 1580
[Episode 3003] total reward = -103.247971, length = 190
[Episode 3004] total reward = 182.837006, length = 1600
[Episode 3005] total reward = 197.964539, length = 1564
[Episode 3006] total reward = 200.031967, length = 1545
[Episode 3007] total reward = 59.717602, length = 1479
[Episode 3008] total reward = 198.521484, length = 1568
[Episode 3009] total reward = 195.187500, length = 1591
[Episode 3010] total reward = 193.884033, length = 1591
[Episode 3011] total reward = 200.282898, length = 1534
[Episode 3012] total reward = -85.051773, length = 306
[Episode 3013] total reward = -120.503502, length = 98
[Episode 3014] total reward = 200.190277, length = 1523
[Episode 3015] total reward = 196.503647, length = 1568
[Episode 3016] total reward = 198.496063, length = 1544
[Episode 3017] total reward = 198.372559, length = 1550


  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 3200] total reward = 200.857208, length = 1523

[ 3200 /  5000]
----------------------------------
actor loss = -0.130612
critic loss = 6.684066
entropy = 7.966743
mean return = 132.318074
mean length = 1305.51

Saving the model ... Done.

[Evaluation] Total reward = 209.244612, length = 1436
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 3201] total reward = 201.308456, length = 1512
[Episode 3202] total reward = 56.889328, length = 1273
[Episode 3203] total reward = 165.529633, length = 1600
[Episode 3204] total reward = 197.924530, length = 1562
[Episode 3205] total reward = 204.705765, length = 1490
[Episode 3206] total reward = 193.851273, length = 1600
[Episode 3207] total reward = 202.797592, length = 1514
[Episode 3208] total reward = 196.347519, length = 1577
[Episode 3209] total reward = 198.806213, length = 1550
[Episode 3210] total reward = 195.972778, length = 1584
[Episode 3211] total reward = 194.745392, length = 1592
[Episode 3212] total reward = 194.444397, length = 1600
[Episode 3213] total reward = -100.305344, length = 110
[Episode 3214] total reward = -103.606873, length = 133
[Episode 3215] total reward = 203.397797, length = 1521
[Episode 3216] total reward = 200.113052, length = 1551
[Episode 3217] total reward = 206.334839, length = 14

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 3400] total reward = -59.926979, length = 382

[ 3400 /  5000]
----------------------------------
actor loss = 0.019180
critic loss = 422992.937500
entropy = 8.363580
mean return = 135.626432
mean length = 1278.63

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 25.050481, length = 1114
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 3401] total reward = 200.990692, length = 1520
[Episode 3402] total reward = 199.354156, length = 1539
[Episode 3403] total reward = 206.273636, length = 1468
[Episode 3404] total reward = 202.907776, length = 1506
[Episode 3405] total reward = 199.581848, length = 1539
[Episode 3406] total reward = 199.712250, length = 1534
[Episode 3407] total reward = 202.743591, length = 1502
[Episode 3408] total reward = 203.951660, length = 1491
[Episode 3409] total reward = 210.632217, length = 1433
[Episode 3410] total reward = 193.851883, length = 1590
[Episode 3411] total reward = 197.368210, length = 1559
[Episode 3412] total reward = -128.718140, length = 165
[Episode 3413] total reward = 202.726196, length = 1513
[Episode 3414] total reward = 201.969040, length = 1512
[Episode 3415] total reward = 197.278915, length = 1572
[Episode 3416] total reward = 202.439606, length = 1503
[Episode 3417] total reward = -35.619904, length = 7

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 3599] total reward = 200.595108, length = 1509
[Episode 3600] total reward = -111.965607, length = 48

[ 3600 /  5000]
----------------------------------
actor loss = -0.106636
critic loss = 915806.750000
entropy = 8.669530
mean return = 91.586298
mean length = 1077.68

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 204.143983, length = 1471
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 3601] total reward = 191.785004, length = 1600
[Episode 3602] total reward = 168.832001, length = 1600
[Episode 3603] total reward = 196.524231, length = 1576
[Episode 3604] total reward = -94.010971, length = 226
[Episode 3605] total reward = -131.127716, length = 223
[Episode 3606] total reward = 172.031754, length = 1600
[Episode 3607] total reward = 27.605263, length = 988
[Episode 3608] total reward = -5.388195, length = 807
[Episode 3609] total reward = 192.587891, length = 1600
[Episode 3610] total reward = 95.107750, length = 1528
[Episode 3611] total reward = 201.330688, length = 1525
[Episode 3612] total reward = 181.689117, length = 1600
[Episode 3613] total reward = 203.731323, length = 1495
[Episode 3614] total reward = 203.802292, length = 1499
[Episode 3615] total reward = -96.328720, length = 169
[Episode 3616] total reward = 203.423492, length = 1512
[Episode 3617] total reward = -62.051537, length = 369
[Epi

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 3800] total reward = 191.189117, length = 1600

[ 3800 /  5000]
----------------------------------
actor loss = -0.049205
critic loss = 254.072189
entropy = 8.986384
mean return = 62.750076
mean length = 1005.04

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 203.071507, length = 1460
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 3801] total reward = 183.792755, length = 1600
[Episode 3802] total reward = 187.863388, length = 1600
[Episode 3803] total reward = 203.206238, length = 1512
[Episode 3804] total reward = 172.893753, length = 1600
[Episode 3805] total reward = 66.523277, length = 1327
[Episode 3806] total reward = 20.928883, length = 1331
[Episode 3807] total reward = 186.477509, length = 1600
[Episode 3808] total reward = -120.315475, length = 109
[Episode 3809] total reward = 191.018051, length = 1600
[Episode 3810] total reward = 207.010818, length = 1481
[Episode 3811] total reward = 83.148087, length = 1532
[Episode 3812] total reward = 180.716232, length = 1600
[Episode 3813] total reward = 6.712021, length = 1122
[Episode 3814] total reward = 196.934006, length = 1585
[Episode 3815] total reward = 197.485641, length = 1583
[Episode 3816] total reward = 190.120911, length = 1600
[Episode 3817] total reward = -110.700439, length = 63
[E

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 4000] total reward = 192.807892, length = 1579

[ 4000 /  5000]
----------------------------------
actor loss = -0.050755
critic loss = 9.442036
entropy = 9.213426
mean return = 136.787683
mean length = 1301.81

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 196.244866, length = 1513
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 4001] total reward = 212.791840, length = 1390
[Episode 4002] total reward = -15.166370, length = 788
[Episode 4003] total reward = 199.993591, length = 1519
[Episode 4004] total reward = 194.113190, length = 1552
[Episode 4005] total reward = 196.894928, length = 1540
[Episode 4006] total reward = 203.581085, length = 1478
[Episode 4007] total reward = 197.304993, length = 1529
[Episode 4008] total reward = 194.557266, length = 1569
[Episode 4009] total reward = 201.572586, length = 1500
[Episode 4010] total reward = -64.753777, length = 385
[Episode 4011] total reward = 197.441910, length = 1543
[Episode 4012] total reward = 199.147430, length = 1528
[Episode 4013] total reward = 18.021656, length = 887
[Episode 4014] total reward = 197.516541, length = 1525
[Episode 4015] total reward = 190.311249, length = 1593
[Episode 4016] total reward = 205.782272, length = 1473
[Episode 4017] total reward = 195.908386, length = 1560


  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



Done.

[Evaluation] Total reward = 209.058035, length = 1406
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 4201] total reward = -75.454300, length = 386
[Episode 4202] total reward = 207.061996, length = 1445
[Episode 4203] total reward = 77.031425, length = 1248
[Episode 4204] total reward = 206.496246, length = 1446
[Episode 4205] total reward = -71.117874, length = 354
[Episode 4206] total reward = 203.454269, length = 1461
[Episode 4207] total reward = 209.878571, length = 1412
[Episode 4208] total reward = 194.694046, length = 1568
[Episode 4209] total reward = 44.368187, length = 1086
[Episode 4210] total reward = -34.008804, length = 575
[Episode 4211] total reward = -114.142395, length = 208
[Episode 4212] total reward = 207.338272, length = 1451
[Episode 4213] total reward = -50.415794, length = 390
[Episode 4214] total reward = 211.278839, length = 1397
[Episode 4215] total reward = 207.397827, length = 1451
[Episode 4216] total reward = 208.033997, length = 1419
[Episode 4217] total reward = 200.311188, length = 1497
[E

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 4400] total reward = 206.552063, length = 1455

[ 4400 /  5000]
----------------------------------
actor loss = -0.001754
critic loss = 1.607131
entropy = 9.979031
mean return = 113.678910
mean length = 1172.10

Saving the model ... Done.

[Evaluation] Total reward = 205.876456, length = 1434
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 4401] total reward = 194.832413, length = 1551
[Episode 4402] total reward = 200.210587, length = 1504
[Episode 4403] total reward = 192.978287, length = 1577
[Episode 4404] total reward = 191.853485, length = 1581
[Episode 4405] total reward = 180.015945, length = 1600
[Episode 4406] total reward = 202.838928, length = 1488
[Episode 4407] total reward = -58.804359, length = 407
[Episode 4408] total reward = 54.561394, length = 1309
[Episode 4409] total reward = 197.530731, length = 1526
[Episode 4410] total reward = 201.479141, length = 1495
[Episode 4411] total reward = 202.711151, length = 1477
[Episode 4412] total reward = 199.659882, length = 1508
[Episode 4413] total reward = 195.159637, length = 1558
[Episode 4414] total reward = 201.551086, length = 1486
[Episode 4415] total reward = 201.598755, length = 1493
[Episode 4416] total reward = 203.169159, length = 1483
[Episode 4417] total reward = 207.597717, length = 143

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 4600] total reward = -74.032265, length = 386

[ 4600 /  5000]
----------------------------------
actor loss = -0.014021
critic loss = 878135.562500
entropy = 10.338975
mean return = 139.787406
mean length = 1244.88

Saving the model ... Done.



See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Evaluation] Total reward = 203.201477, length = 1484
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 4601] total reward = 214.726974, length = 1372
[Episode 4602] total reward = -37.161861, length = 422
[Episode 4603] total reward = 85.986809, length = 1241
[Episode 4604] total reward = -52.528030, length = 463
[Episode 4605] total reward = 89.555695, length = 1289
[Episode 4606] total reward = -64.969131, length = 332
[Episode 4607] total reward = -130.533524, length = 110
[Episode 4608] total reward = 93.512238, length = 1156
[Episode 4609] total reward = 60.944229, length = 1048
[Episode 4610] total reward = -65.998283, length = 365
[Episode 4611] total reward = -74.211136, length = 280
[Episode 4612] total reward = 96.474640, length = 1437
[Episode 4613] total reward = -65.921890, length = 376
[Episode 4614] total reward = -21.937355, length = 622
[Episode 4615] total reward = 49.839531, length = 1083
[Episode 4616] total reward = -43.140545, length = 478
[Episode 4617] total reward = 57.497890, length = 1175
[Episode 46

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



[Episode 4800] total reward = 211.615295, length = 1401

[ 4800 /  5000]
----------------------------------
actor loss = 0.068177
critic loss = 34.540314
entropy = 10.660034
mean return = 134.739448
mean length = 1229.86

Saving the model ... Done.

[Evaluation] Total reward = -79.423875, length = 252
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4





Moviepy - Done !
Moviepy - video ready __temp__.mp4
[Episode 4801] total reward = 47.018066, length = 962
[Episode 4802] total reward = 217.801071, length = 1331
[Episode 4803] total reward = -24.949982, length = 887
[Episode 4804] total reward = 37.475883, length = 953
[Episode 4805] total reward = -29.852745, length = 630
[Episode 4806] total reward = 78.999680, length = 1223
[Episode 4807] total reward = -4.841831, length = 687
[Episode 4808] total reward = 211.722717, length = 1411
[Episode 4809] total reward = 210.600739, length = 1409
[Episode 4810] total reward = 210.891754, length = 1424
[Episode 4811] total reward = 201.300507, length = 1517
[Episode 4812] total reward = 201.715729, length = 1491
[Episode 4813] total reward = 211.887405, length = 1411
[Episode 4814] total reward = 210.868759, length = 1415
[Episode 4815] total reward = 210.244019, length = 1419
[Episode 4816] total reward = -38.320621, length = 590
[Episode 4817] total reward = 207.826080, length = 1447
[Episo

### 5.3 Load the Model and Play

In [44]:
save_dir = project_root + '/save'
model_path = os.path.join(save_dir, "model.pt")

if os.path.exists(model_path):
    print("Loading the model ... ", end="")
    checkpoint = torch.load(model_path)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")
else:
    print('ERROR: No model saved')

play(policy_net)

  and should_run_async(code)

  deprecation(

  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(



Loading the model ... Done.
[Evaluation] Total reward = 212.249088, length = 1390
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
