# Proximal Policy Optimization (PPO) Tutorial

## 1. Environment Preparation




### 1.1 Download Packages for BipedalWalker-v3

In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install colabgymrender==1.0.2
!pip install box2d-py
!pip install 'gym[Box2D]'



### 1.2 Mount Drive and Set Project Path

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import sys
project_root = '/content/drive/My Drive/ppo_tutorial/'
sys.path.append(project_root)

Mounted at /content/drive


### 1.3 Test the BipedalWalker-v3 Environment 

In [3]:
import os
import gym
import torch
import torch.nn as nn
import numpy as np
from colabgymrender.recorder import Recorder

env = gym.make('BipedalWalker-v3')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
print(s_dim)
print(a_dim)

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)1810432/45929032 bytes (3.9%)4513792/45929032 bytes (9.8%)7520256/45929032 bytes (16.4%)10125312/45929032 bytes (22.0%)12976128/45929032 bytes (28.3%)16023552/45929032 bytes (34.9%)18882560/45929032 bytes (41.1%)21757952/45929032 bytes (47.4%)24657920/45929032 bytes (53.7%)27631616/45929032 bytes (60.2%)30466048/45929032 bytes (66.3%)33300480/45929032 bytes (72.5%)3

## 2. Policy Network & Value Network Construction

### 2.1 Diagonal Gaussian Distribution Module 

In [4]:
#AddBias module
class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))
    
    def forward(self, x):
        bias = self._bias.t().view(1, -1)
        return x + bias

#Gaussian distribution with given mean & std.
class FixedNormal(torch.distributions.Normal):
    def log_probs(self, x):
        return super().log_prob(x).sum(-1)
    
    def entropy(self):
        return super().entropy().sum(-1)

    def mode(self):
        return self.mean

#Diagonal Gaussian module
class DiagGaussian(nn.Module):
    def __init__(self, inp_dim, out_dim):
        super(DiagGaussian, self).__init__()
        self.fc_mean = nn.Linear(inp_dim, out_dim)
        self.b_logstd = AddBias(torch.zeros(out_dim))
    
    def forward(self, x):
        mean = self.fc_mean(x)
        logstd = self.b_logstd(torch.zeros_like(mean))
        return FixedNormal(mean, logstd.exp())

### 2.2 Policy Network & Value Network Module

In [5]:
#Policy Network
class PolicyNet(nn.Module):
    #Constructor
    def __init__(self, s_dim, a_dim):
        super(PolicyNet, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU()
        )
        self.dist = DiagGaussian(128, a_dim)
    
    #Forward pass
    def forward(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            action = dist.mode()
        else:
            action = dist.sample()
        
        return action, dist.log_probs(action)
    
    #Choose an action (stochastically or deterministically)
    def choose_action(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            return dist.mode()

        return dist.sample()
    
    #Evaluate a state-action pair (output log-prob. & entropy)
    def evaluate(self, state, action):
        feature = self.main(state)
        dist = self.dist(feature)
        return dist.log_probs(action), dist.entropy()

#Value Network
class ValueNet(nn.Module):
    #Constructor
    def __init__(self, s_dim):
        super(ValueNet, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    
    #Forward pass
    def forward(self, state):
        return self.main(state)[:, 0]

### 2.3 Create Policy Network & Value Network

In [6]:
policy_net = PolicyNet(s_dim, a_dim)
value_net = ValueNet(s_dim)
print(policy_net)
print(value_net)

PolicyNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
  )
  (dist): DiagGaussian(
    (fc_mean): Linear(in_features=128, out_features=4, bias=True)
    (b_logstd): AddBias()
  )
)
ValueNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


## 3. Environment Runner Construction

### 3.1 EnvRunner Class

In [7]:
class EnvRunner:
    #Constructor
    def __init__(self, s_dim, a_dim, gamma=0.99, lamb=0.95, max_step=2048, device='cpu'):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.gamma = gamma
        self.lamb = lamb
        self.max_step = max_step
        self.device = device

        #Storages (state, action, value, reward, a_logp)
        self.mb_states = np.zeros((self.max_step, self.s_dim), dtype=np.float32)
        self.mb_actions = np.zeros((self.max_step, self.a_dim), dtype=np.float32)
        self.mb_values = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_rewards = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_a_logps = np.zeros((self.max_step,), dtype=np.float32)
    
    #Compute discounted return
    def compute_discounted_return(self, rewards, last_value, gamma=0.99):
        returns = np.zeros_like(rewards)
        n_step = len(rewards)

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                returns[t] = rewards[t] + gamma * last_value
            else:
                returns[t] = rewards[t] + gamma * returns[t+1]

        return returns
    
    #Compute generalized advantage estimation
    def compute_gae(self, rewards, values, last_value, gamma=0.99, lamb=0.95):
        advs = np.zeros_like(rewards)
        n_step = len(rewards)
        last_gae_lam = 0.0

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                next_value = last_value
            else:
                next_value = values[t+1]

            delta = rewards[t] + gamma*next_value - values[t]
            advs[t] = last_gae_lam = delta + gamma*lamb*last_gae_lam

        return advs + values

    #Run an episode using the policy net & value net
    def run(self, env, policy_net, value_net):
        #Run an episode
        state = env.reset()   #Initial state
        episode_len = self.max_step

        for step in range(self.max_step):
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
            action, a_logp = policy_net(state_tensor)
            value = value_net(state_tensor)

            action = action.cpu().numpy()[0]
            a_logp = a_logp.cpu().numpy()
            value  = value.cpu().numpy()

            self.mb_states[step] = state
            self.mb_actions[step] = action
            self.mb_a_logps[step] = a_logp
            self.mb_values[step] = value

            state, reward, done, info = env.step(action)
            self.mb_rewards[step] = reward

            if done:
                episode_len = step + 1
                break
        
        #Compute returns
        last_value = value_net(
            torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
        ).cpu().numpy()

        mb_returns = self.compute_discounted_return(
            self.mb_rewards[:episode_len],
            last_value,
            self.lamb
        )
        '''
        mb_returns = self.compute_gae(
            self.mb_rewards[:episode_len],
            self.mb_values[:episode_len],
            last_value,
            self.gamma,
            self.lamb
        )
        '''

        return self.mb_states[:episode_len], \
                self.mb_actions[:episode_len], \
                self.mb_a_logps[:episode_len], \
                self.mb_values[:episode_len], \
                mb_returns, \
                self.mb_rewards[:episode_len]

### 3.2 Create EnvRunner

In [8]:
runner = EnvRunner(s_dim, a_dim)

## 4. PPO Algorithm

### 4.1 PPO Class

In [9]:
class PPO:
    #Constructor
    def __init__(self, policy_net, value_net, lr=1e-4, max_grad_norm=0.5, ent_weight=0.01, clip_val=0.2, sample_n_epoch=4, sample_mb_size=64, device='cpu'):
        self.policy_net = policy_net
        self.value_net = value_net
        self.max_grad_norm = max_grad_norm
        self.ent_weight = ent_weight
        self.clip_val = clip_val
        self.sample_n_epoch = sample_n_epoch
        self.sample_mb_size = sample_mb_size
        self.device = device
        self.opt_polcy = torch.optim.Adam(policy_net.parameters(), lr)
        self.opt_value = torch.optim.Adam(value_net.parameters(), lr)
    
    #Train the policy net & value net using PPO
    def train(self, mb_states, mb_actions, mb_old_values, mb_advs, mb_returns, mb_old_a_logps):
        #Convert numpy array to tensor
        mb_states = torch.from_numpy(mb_states).to(self.device)
        mb_actions = torch.from_numpy(mb_actions).to(self.device)
        mb_old_values = torch.from_numpy(mb_old_values).to(self.device)
        mb_advs = torch.from_numpy(mb_advs).to(self.device)
        mb_returns = torch.from_numpy(mb_returns).to(self.device)
        mb_old_a_logps = torch.from_numpy(mb_old_a_logps).to(self.device)
        episode_length = len(mb_states)
        rand_idx = np.arange(episode_length)
        sample_n_mb = episode_length // self.sample_mb_size

        if sample_n_mb <= 0:
            sample_mb_size = episode_length
            sample_n_mb = 1
        else:
            sample_mb_size = self.sample_mb_size

        for i in range(self.sample_n_epoch):
            np.random.shuffle(rand_idx)

            for j in range(sample_n_mb):
                #Randomly sample a batch for training
                sample_idx = rand_idx[j*sample_mb_size : (j+1)*sample_mb_size]
                sample_states = mb_states[sample_idx]
                sample_actions = mb_actions[sample_idx]
                sample_old_values = mb_old_values[sample_idx]
                sample_advs = mb_advs[sample_idx]
                sample_returns = mb_returns[sample_idx]
                sample_old_a_logps = mb_old_a_logps[sample_idx]

                sample_a_logps, sample_ents = self.policy_net.evaluate(sample_states, sample_actions)
                sample_values = self.value_net(sample_states)
                ent = sample_ents.mean()

                #Compute value loss
                v_pred_clip = sample_old_values + torch.clamp(sample_values - sample_old_values, -self.clip_val, self.clip_val)
                v_loss1 = (sample_returns - sample_values).pow(2)
                v_loss2 = (sample_returns - v_pred_clip).pow(2)
                v_loss = torch.max(v_loss1, v_loss2).mean()

                #Compute policy gradient loss
                ratio = (sample_a_logps - sample_old_a_logps).exp()
                pg_loss1 = -sample_advs * ratio
                pg_loss2 = -sample_advs * torch.clamp(ratio, 1.0-self.clip_val, 1.0+self.clip_val)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean() - self.ent_weight*ent

                #Train actor
                self.opt_polcy.zero_grad()
                pg_loss.backward()
                nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_grad_norm)
                self.opt_polcy.step()

                #Train critic
                self.opt_value.zero_grad()
                v_loss.backward()
                nn.utils.clip_grad_norm_(self.value_net.parameters(), self.max_grad_norm)
                self.opt_value.step()

        return pg_loss.item(), v_loss.item(), ent.item()

### 4.2 Create PPO Agent

In [10]:
agent = PPO(policy_net, value_net)

## 5. Training and Testing Process

### 5.1 Play an Episode for Evaluation

In [11]:
def play(policy_net):
    render_env = Recorder(gym.make('BipedalWalker-v3'), project_root + '/video')

    with torch.no_grad():
        state = render_env.reset()
        total_reward = 0
        length = 0

        while True:
            #render_env.render()
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device='cpu')
            action = policy_net.choose_action(state_tensor, deterministic=True).cpu().numpy()
            state, reward, done, info = render_env.step(action[0])
            total_reward += reward
            length += 1

            if done:
                print("[Evaluation] Total reward = {:.6f}, length = {:d}".format(total_reward, length), flush=True)
                break
    render_env.play()
    render_env.close()

In [12]:
play(policy_net)

[Evaluation] Total reward = -111.058200, length = 73


100%|██████████| 75/75 [00:00<00:00, 259.79it/s]


### 5.2 Train the Networks using PPO

In [13]:
def train(env, runner, policy_net, value_net, agent, max_episode=5000):
    mean_total_reward = 0
    mean_length = 0
    save_dir = project_root + '/save'

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    for i in range(max_episode):
        #Run an episode to collect data
        with torch.no_grad():
            mb_states, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)
        
        #Train the model using the collected data
        pg_loss, v_loss, ent = agent.train(mb_states, mb_actions, mb_values, mb_advs, mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_states)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(i, mb_rewards.sum(), len(mb_states)))

        #Show the current result & save the model
        if i % 200 == 0:
            print("\n[{:5d} / {:5d}]".format(i, max_episode))
            print("----------------------------------")
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / 100))
            print("mean length = {:.2f}".format(mean_length / 100))
            print("\nSaving the model ... ", end="")
            torch.save({
                "it": i,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "model.pt"))
            print("Done.")
            print()
            play(policy_net)
            mean_total_reward = 0
            mean_length = 0

In [14]:
train(env, runner, policy_net, value_net, agent)
env.close()

[Episode    0] total reward = -101.405266, length = 1600

[    0 /  5000]
----------------------------------
actor loss = 0.056528
critic loss = 1.490289
entropy = 5.671003
mean return = -1.014053
mean length = 16.00

Saving the model ... Done.

[Evaluation] Total reward = -111.704622, length = 80


 99%|█████████▉| 81/82 [00:00<00:00, 262.01it/s]


[Episode    1] total reward = -114.313126, length = 1600
[Episode    2] total reward = -109.681152, length = 55
[Episode    3] total reward = -95.253601, length = 1600
[Episode    4] total reward = -120.235931, length = 102
[Episode    5] total reward = -115.665123, length = 106
[Episode    6] total reward = -110.038933, length = 100
[Episode    7] total reward = -113.464943, length = 174
[Episode    8] total reward = -110.747955, length = 57
[Episode    9] total reward = -114.234375, length = 76
[Episode   10] total reward = -109.079987, length = 48
[Episode   11] total reward = -108.891266, length = 43
[Episode   12] total reward = -103.560921, length = 78
[Episode   13] total reward = -111.903427, length = 71
[Episode   14] total reward = -101.470703, length = 1600
[Episode   15] total reward = -113.386108, length = 94
[Episode   16] total reward = -117.731384, length = 76
[Episode   17] total reward = -100.212234, length = 1600
[Episode   18] total reward = -113.922935, length = 65

 99%|█████████▊| 69/70 [00:00<00:00, 250.29it/s]


[Episode  201] total reward = -114.780693, length = 45
[Episode  202] total reward = -74.661919, length = 1600
[Episode  203] total reward = -116.780289, length = 106
[Episode  204] total reward = -115.490883, length = 123
[Episode  205] total reward = -107.334503, length = 44
[Episode  206] total reward = -115.129921, length = 69
[Episode  207] total reward = -108.348572, length = 88
[Episode  208] total reward = -116.296417, length = 92
[Episode  209] total reward = -113.881073, length = 108
[Episode  210] total reward = -113.793602, length = 125
[Episode  211] total reward = -122.296768, length = 180
[Episode  212] total reward = -116.237579, length = 74
[Episode  213] total reward = -114.821907, length = 91
[Episode  214] total reward = -113.709229, length = 170
[Episode  215] total reward = -118.929878, length = 118
[Episode  216] total reward = -115.741150, length = 164
[Episode  217] total reward = -114.143173, length = 79
[Episode  218] total reward = -116.044754, length = 74
[

100%|██████████| 1602/1602 [00:05<00:00, 282.39it/s]


[Episode  401] total reward = -97.323479, length = 209
[Episode  402] total reward = -106.035347, length = 347
[Episode  403] total reward = -127.819344, length = 1348
[Episode  404] total reward = -104.311462, length = 65
[Episode  405] total reward = -106.309433, length = 69
[Episode  406] total reward = -104.712723, length = 87
[Episode  407] total reward = -122.692680, length = 137
[Episode  408] total reward = -57.118557, length = 1600
[Episode  409] total reward = -126.016846, length = 608
[Episode  410] total reward = -113.879257, length = 287
[Episode  411] total reward = -113.287262, length = 62
[Episode  412] total reward = -124.217232, length = 101
[Episode  413] total reward = -108.233711, length = 85
[Episode  414] total reward = -110.613312, length = 76
[Episode  415] total reward = -115.312607, length = 76
[Episode  416] total reward = -125.421692, length = 795
[Episode  417] total reward = -148.787994, length = 1320
[Episode  418] total reward = -112.943146, length = 87

100%|██████████| 132/132 [00:00<00:00, 262.32it/s]


[Episode  601] total reward = -101.599030, length = 70
[Episode  602] total reward = -126.616035, length = 102
[Episode  603] total reward = -100.580605, length = 61
[Episode  604] total reward = -101.444206, length = 86
[Episode  605] total reward = -30.638226, length = 1600
[Episode  606] total reward = -21.123402, length = 1600
[Episode  607] total reward = -101.847023, length = 77
[Episode  608] total reward = -104.334465, length = 72
[Episode  609] total reward = -103.027351, length = 69
[Episode  610] total reward = -32.036919, length = 1600
[Episode  611] total reward = -100.932854, length = 60
[Episode  612] total reward = -101.684235, length = 60
[Episode  613] total reward = -102.710251, length = 78
[Episode  614] total reward = -99.627411, length = 88
[Episode  615] total reward = -106.355431, length = 111
[Episode  616] total reward = -107.997169, length = 101
[Episode  617] total reward = -122.544640, length = 93
[Episode  618] total reward = -102.227211, length = 74
[Epis

100%|██████████| 1602/1602 [00:06<00:00, 262.53it/s]


[Episode  801] total reward = -83.372337, length = 908
[Episode  802] total reward = -109.947975, length = 298
[Episode  803] total reward = -120.294868, length = 82
[Episode  804] total reward = -72.793877, length = 460
[Episode  805] total reward = 97.848083, length = 1600
[Episode  806] total reward = -120.514351, length = 105
[Episode  807] total reward = -81.004890, length = 606
[Episode  808] total reward = -91.148796, length = 339
[Episode  809] total reward = -98.045578, length = 132
[Episode  810] total reward = 98.031242, length = 1600
[Episode  811] total reward = -108.155602, length = 90
[Episode  812] total reward = 81.192062, length = 1600
[Episode  813] total reward = -87.391319, length = 482
[Episode  814] total reward = -117.835686, length = 102
[Episode  815] total reward = -120.907082, length = 134
[Episode  816] total reward = -45.413712, length = 684
[Episode  817] total reward = -80.282288, length = 801
[Episode  818] total reward = 61.341778, length = 1600
[Episo

 99%|█████████▊| 72/73 [00:00<00:00, 250.52it/s]


[Episode 1001] total reward = -115.602440, length = 74
[Episode 1002] total reward = -114.514282, length = 84
[Episode 1003] total reward = -108.126732, length = 112
[Episode 1004] total reward = -108.473160, length = 55
[Episode 1005] total reward = -110.327660, length = 75
[Episode 1006] total reward = -85.011810, length = 161
[Episode 1007] total reward = -117.154129, length = 63
[Episode 1008] total reward = -76.694801, length = 406
[Episode 1009] total reward = -36.677559, length = 1028
[Episode 1010] total reward = -108.281258, length = 76
[Episode 1011] total reward = 123.165497, length = 1600
[Episode 1012] total reward = -26.406765, length = 623
[Episode 1013] total reward = 144.923737, length = 1600
[Episode 1014] total reward = -36.247360, length = 1099
[Episode 1015] total reward = 179.816193, length = 1600
[Episode 1016] total reward = 175.680573, length = 1600
[Episode 1017] total reward = -114.212166, length = 71
[Episode 1018] total reward = -120.864029, length = 103
[E

100%|██████████| 66/66 [00:00<00:00, 254.90it/s]


[Episode 1201] total reward = -83.127510, length = 210
[Episode 1202] total reward = -125.051491, length = 118
[Episode 1203] total reward = -111.991928, length = 74
[Episode 1204] total reward = -113.317047, length = 65
[Episode 1205] total reward = -114.563507, length = 81
[Episode 1206] total reward = -23.805084, length = 708
[Episode 1207] total reward = -116.428940, length = 68
[Episode 1208] total reward = -114.620323, length = 87
[Episode 1209] total reward = 183.933090, length = 1600
[Episode 1210] total reward = -119.112610, length = 85
[Episode 1211] total reward = 207.223450, length = 1600
[Episode 1212] total reward = -111.848473, length = 66
[Episode 1213] total reward = -118.075577, length = 77
[Episode 1214] total reward = -108.021751, length = 85
[Episode 1215] total reward = -115.208984, length = 77
[Episode 1216] total reward = 188.126099, length = 1600
[Episode 1217] total reward = 175.810135, length = 1600
[Episode 1218] total reward = -106.047295, length = 110
[Epi

 99%|█████████▊| 75/76 [00:00<00:00, 255.86it/s]


[Episode 1401] total reward = -114.498474, length = 65
[Episode 1402] total reward = -83.741745, length = 278
[Episode 1403] total reward = -82.944565, length = 174
[Episode 1404] total reward = -124.180382, length = 124
[Episode 1405] total reward = 67.378746, length = 1198
[Episode 1406] total reward = -119.731247, length = 95
[Episode 1407] total reward = -118.355927, length = 73
[Episode 1408] total reward = -113.420738, length = 128
[Episode 1409] total reward = -116.055946, length = 70
[Episode 1410] total reward = 65.715958, length = 1244
[Episode 1411] total reward = 203.226990, length = 1600
[Episode 1412] total reward = -116.400040, length = 78
[Episode 1413] total reward = -48.179703, length = 500
[Episode 1414] total reward = 172.531296, length = 1600
[Episode 1415] total reward = -112.470932, length = 62
[Episode 1416] total reward = -115.803482, length = 76
[Episode 1417] total reward = -56.828049, length = 410
[Episode 1418] total reward = -46.645878, length = 428
[Episo

100%|██████████| 1324/1324 [00:05<00:00, 256.72it/s]


[Episode 1601] total reward = -117.016579, length = 62
[Episode 1602] total reward = -59.450417, length = 480
[Episode 1603] total reward = 113.509827, length = 1397
[Episode 1604] total reward = -51.642662, length = 451
[Episode 1605] total reward = 225.473770, length = 1558
[Episode 1606] total reward = 232.019348, length = 1477
[Episode 1607] total reward = -77.887466, length = 248
[Episode 1608] total reward = -44.434174, length = 479
[Episode 1609] total reward = 44.493370, length = 891
[Episode 1610] total reward = -62.026962, length = 381
[Episode 1611] total reward = -61.190598, length = 378
[Episode 1612] total reward = 76.606842, length = 1249
[Episode 1613] total reward = -89.966339, length = 126
[Episode 1614] total reward = -116.883690, length = 100
[Episode 1615] total reward = -113.124397, length = 63
[Episode 1616] total reward = -50.620605, length = 430
[Episode 1617] total reward = -86.291794, length = 162
[Episode 1618] total reward = -121.034065, length = 83
[Episod

100%|█████████▉| 1179/1180 [00:04<00:00, 256.10it/s]


[Episode 1801] total reward = -19.598480, length = 562
[Episode 1802] total reward = 219.998245, length = 1547
[Episode 1803] total reward = -37.048019, length = 505
[Episode 1804] total reward = -81.289009, length = 305
[Episode 1805] total reward = -111.777634, length = 95
[Episode 1806] total reward = -6.226067, length = 675
[Episode 1807] total reward = 186.790649, length = 1600
[Episode 1808] total reward = 225.612732, length = 1518
[Episode 1809] total reward = 218.453766, length = 1600
[Episode 1810] total reward = -117.676483, length = 71
[Episode 1811] total reward = -31.170992, length = 900
[Episode 1812] total reward = 211.593094, length = 1600
[Episode 1813] total reward = -37.581284, length = 506
[Episode 1814] total reward = -16.890118, length = 618
[Episode 1815] total reward = 77.410919, length = 1100
[Episode 1816] total reward = 104.148468, length = 1176
[Episode 1817] total reward = 232.562378, length = 1435
[Episode 1818] total reward = 59.292675, length = 1135
[Epi

100%|█████████▉| 885/886 [00:03<00:00, 238.22it/s]


[Episode 2001] total reward = -116.531448, length = 80
[Episode 2002] total reward = -114.856445, length = 60
[Episode 2003] total reward = -117.682800, length = 74
[Episode 2004] total reward = -118.377930, length = 79
[Episode 2005] total reward = -1.210793, length = 599
[Episode 2006] total reward = -71.459267, length = 186
[Episode 2007] total reward = -54.800758, length = 421
[Episode 2008] total reward = -120.284096, length = 77
[Episode 2009] total reward = -82.510452, length = 166
[Episode 2010] total reward = -121.377754, length = 108
[Episode 2011] total reward = -58.059597, length = 334
[Episode 2012] total reward = -106.617737, length = 110
[Episode 2013] total reward = -71.417244, length = 216
[Episode 2014] total reward = -82.168777, length = 206
[Episode 2015] total reward = 39.105824, length = 672
[Episode 2016] total reward = 145.246552, length = 1165
[Episode 2017] total reward = -117.255051, length = 121
[Episode 2018] total reward = 239.730408, length = 1329
[Episod

100%|██████████| 340/340 [00:01<00:00, 236.38it/s]


[Episode 2201] total reward = 10.416653, length = 509
[Episode 2202] total reward = -116.288872, length = 62
[Episode 2203] total reward = 238.402603, length = 1325
[Episode 2204] total reward = -26.680746, length = 507
[Episode 2205] total reward = 235.328461, length = 1342
[Episode 2206] total reward = 237.765656, length = 1346
[Episode 2207] total reward = -100.662552, length = 88
[Episode 2208] total reward = 240.109497, length = 1328
[Episode 2209] total reward = 90.284760, length = 921
[Episode 2210] total reward = -110.758087, length = 131
[Episode 2211] total reward = -111.522942, length = 74
[Episode 2212] total reward = -2.507076, length = 477
[Episode 2213] total reward = -78.203583, length = 143
[Episode 2214] total reward = -112.592911, length = 80
[Episode 2215] total reward = -115.203812, length = 75
[Episode 2216] total reward = -52.964806, length = 268
[Episode 2217] total reward = -104.299240, length = 124
[Episode 2218] total reward = -111.134224, length = 77
[Episod

 99%|█████████▉| 141/142 [00:00<00:00, 235.40it/s]


[Episode 2401] total reward = -119.399826, length = 176
[Episode 2402] total reward = 242.648712, length = 1269
[Episode 2403] total reward = 231.672791, length = 1391
[Episode 2404] total reward = -14.649719, length = 710
[Episode 2405] total reward = 222.250656, length = 1495
[Episode 2406] total reward = 239.076187, length = 1318
[Episode 2407] total reward = 56.905910, length = 924
[Episode 2408] total reward = 115.649437, length = 1152
[Episode 2409] total reward = -113.664078, length = 67
[Episode 2410] total reward = 35.977482, length = 860
[Episode 2411] total reward = 240.301758, length = 1306
[Episode 2412] total reward = -114.220360, length = 78
[Episode 2413] total reward = 236.248718, length = 1349
[Episode 2414] total reward = 244.857681, length = 1260
[Episode 2415] total reward = -106.927658, length = 123
[Episode 2416] total reward = 238.966309, length = 1317
[Episode 2417] total reward = -63.090752, length = 281
[Episode 2418] total reward = 234.098053, length = 1378


100%|██████████| 1140/1140 [00:04<00:00, 231.13it/s]


[Episode 2601] total reward = 68.340057, length = 871
[Episode 2602] total reward = 118.378647, length = 1367
[Episode 2603] total reward = 253.846649, length = 1149
[Episode 2604] total reward = -60.307293, length = 205
[Episode 2605] total reward = 55.872074, length = 742
[Episode 2606] total reward = 106.704247, length = 1230
[Episode 2607] total reward = -3.137451, length = 700
[Episode 2608] total reward = 20.359398, length = 611
[Episode 2609] total reward = -113.830742, length = 95
[Episode 2610] total reward = -69.049438, length = 259
[Episode 2611] total reward = 7.933125, length = 584
[Episode 2612] total reward = -30.172659, length = 394
[Episode 2613] total reward = 232.950836, length = 1391
[Episode 2614] total reward = -58.871105, length = 212
[Episode 2615] total reward = 236.167160, length = 1348
[Episode 2616] total reward = -80.735374, length = 151
[Episode 2617] total reward = 47.226475, length = 1014
[Episode 2618] total reward = 86.397110, length = 1223
[Episode 26

100%|█████████▉| 891/892 [00:03<00:00, 238.41it/s]


[Episode 2801] total reward = 2.297466, length = 545
[Episode 2802] total reward = -17.126160, length = 496
[Episode 2803] total reward = 98.688660, length = 996
[Episode 2804] total reward = 68.469315, length = 811
[Episode 2805] total reward = 223.093750, length = 1463
[Episode 2806] total reward = 1.972412, length = 626
[Episode 2807] total reward = 242.070068, length = 1247
[Episode 2808] total reward = 25.088448, length = 633
[Episode 2809] total reward = 242.451721, length = 1246
[Episode 2810] total reward = 26.894730, length = 639
[Episode 2811] total reward = -45.135975, length = 287
[Episode 2812] total reward = 229.881195, length = 1366
[Episode 2813] total reward = 54.455437, length = 816
[Episode 2814] total reward = 39.098267, length = 842
[Episode 2815] total reward = 55.811348, length = 1007
[Episode 2816] total reward = 250.137466, length = 1167
[Episode 2817] total reward = -61.959568, length = 245
[Episode 2818] total reward = -60.299713, length = 271
[Episode 2819] 

100%|██████████| 364/364 [00:01<00:00, 240.38it/s]


[Episode 3001] total reward = 237.410400, length = 1325
[Episode 3002] total reward = -29.184704, length = 382
[Episode 3003] total reward = -22.878670, length = 517
[Episode 3004] total reward = 103.844040, length = 1261
[Episode 3005] total reward = -29.678143, length = 370
[Episode 3006] total reward = 235.991028, length = 1350
[Episode 3007] total reward = 237.356644, length = 1312
[Episode 3008] total reward = -116.596306, length = 118
[Episode 3009] total reward = 107.333786, length = 1190
[Episode 3010] total reward = 48.702030, length = 1069
[Episode 3011] total reward = -79.184097, length = 150
[Episode 3012] total reward = 232.977890, length = 1365
[Episode 3013] total reward = 245.889526, length = 1231
[Episode 3014] total reward = -0.931465, length = 632
[Episode 3015] total reward = 244.329987, length = 1263
[Episode 3016] total reward = -21.311272, length = 427
[Episode 3017] total reward = 229.948120, length = 1415
[Episode 3018] total reward = -71.513016, length = 151
[

100%|█████████▉| 288/289 [00:01<00:00, 246.64it/s]


[Episode 3201] total reward = -113.883469, length = 81
[Episode 3202] total reward = 104.016403, length = 1184
[Episode 3203] total reward = 132.143005, length = 1225
[Episode 3204] total reward = 244.058243, length = 1251
[Episode 3205] total reward = -121.860634, length = 119
[Episode 3206] total reward = -48.960194, length = 298
[Episode 3207] total reward = 122.541496, length = 1045
[Episode 3208] total reward = 111.834328, length = 1103
[Episode 3209] total reward = -116.118866, length = 86
[Episode 3210] total reward = 249.048355, length = 1193
[Episode 3211] total reward = -116.298698, length = 97
[Episode 3212] total reward = -61.968651, length = 209
[Episode 3213] total reward = -114.932800, length = 76
[Episode 3214] total reward = -38.964699, length = 411
[Episode 3215] total reward = -78.742569, length = 165
[Episode 3216] total reward = -114.877029, length = 88
[Episode 3217] total reward = 17.808460, length = 718
[Episode 3218] total reward = -118.516815, length = 90
[Epi

100%|██████████| 120/120 [00:00<00:00, 232.13it/s]


[Episode 3401] total reward = -114.617882, length = 66
[Episode 3402] total reward = 12.464344, length = 571
[Episode 3403] total reward = 51.916748, length = 733
[Episode 3404] total reward = 239.938919, length = 1287
[Episode 3405] total reward = 123.151108, length = 1196
[Episode 3406] total reward = 91.129700, length = 962
[Episode 3407] total reward = 251.223633, length = 1172
[Episode 3408] total reward = -112.170715, length = 60
[Episode 3409] total reward = -108.302391, length = 147
[Episode 3410] total reward = -80.001747, length = 169
[Episode 3411] total reward = -39.232502, length = 389
[Episode 3412] total reward = -98.960503, length = 144
[Episode 3413] total reward = -7.737564, length = 506
[Episode 3414] total reward = 1.637459, length = 605
[Episode 3415] total reward = -112.635841, length = 103
[Episode 3416] total reward = -11.082756, length = 485
[Episode 3417] total reward = -100.834770, length = 130
[Episode 3418] total reward = -112.461395, length = 85
[Episode 3

100%|██████████| 289/289 [00:01<00:00, 243.61it/s]


[Episode 3601] total reward = 30.402168, length = 730
[Episode 3602] total reward = 236.806137, length = 1343
[Episode 3603] total reward = 242.143677, length = 1278
[Episode 3604] total reward = -130.043640, length = 189
[Episode 3605] total reward = 243.004456, length = 1277
[Episode 3606] total reward = -118.002678, length = 144
[Episode 3607] total reward = 255.348785, length = 1132
[Episode 3608] total reward = -1.542942, length = 602
[Episode 3609] total reward = 231.949677, length = 1393
[Episode 3610] total reward = 243.006439, length = 1265
[Episode 3611] total reward = 120.582214, length = 1085
[Episode 3612] total reward = 241.011658, length = 1290
[Episode 3613] total reward = 122.372726, length = 1201
[Episode 3614] total reward = 236.566101, length = 1331
[Episode 3615] total reward = 238.772339, length = 1301
[Episode 3616] total reward = -115.659935, length = 103
[Episode 3617] total reward = 243.367920, length = 1265
[Episode 3618] total reward = -28.948132, length = 3

100%|██████████| 301/301 [00:01<00:00, 239.93it/s]


[Episode 3801] total reward = 109.832138, length = 949
[Episode 3802] total reward = -40.979004, length = 348
[Episode 3803] total reward = -126.606789, length = 194
[Episode 3804] total reward = 31.248829, length = 580
[Episode 3805] total reward = 55.083588, length = 846
[Episode 3806] total reward = -18.861752, length = 427
[Episode 3807] total reward = 100.167435, length = 903
[Episode 3808] total reward = 5.452709, length = 560
[Episode 3809] total reward = -59.801929, length = 244
[Episode 3810] total reward = 130.640900, length = 1030
[Episode 3811] total reward = -52.909161, length = 285
[Episode 3812] total reward = 1.796497, length = 551
[Episode 3813] total reward = 62.191536, length = 946
[Episode 3814] total reward = -43.578491, length = 236
[Episode 3815] total reward = -52.469658, length = 314
[Episode 3816] total reward = 233.218384, length = 1384
[Episode 3817] total reward = -41.999989, length = 309
[Episode 3818] total reward = 11.670109, length = 696
[Episode 3819] 

100%|██████████| 121/121 [00:00<00:00, 240.37it/s]


[Episode 4001] total reward = -118.908600, length = 92
[Episode 4002] total reward = -121.127800, length = 152
[Episode 4003] total reward = -121.597885, length = 122
[Episode 4004] total reward = -119.579773, length = 174
[Episode 4005] total reward = 10.013477, length = 488
[Episode 4006] total reward = 252.503235, length = 1150
[Episode 4007] total reward = 237.022552, length = 1327
[Episode 4008] total reward = -113.077911, length = 84
[Episode 4009] total reward = -64.540970, length = 300
[Episode 4010] total reward = 3.065887, length = 567
[Episode 4011] total reward = -80.611160, length = 161
[Episode 4012] total reward = 57.061806, length = 873
[Episode 4013] total reward = 227.008224, length = 1418
[Episode 4014] total reward = 12.505753, length = 623
[Episode 4015] total reward = -112.752472, length = 73
[Episode 4016] total reward = 98.806717, length = 1236
[Episode 4017] total reward = -118.256004, length = 83
[Episode 4018] total reward = 51.534653, length = 747
[Episode 4

100%|█████████▉| 1005/1006 [00:04<00:00, 240.39it/s]


[Episode 4201] total reward = -16.575878, length = 403
[Episode 4202] total reward = 51.158081, length = 854
[Episode 4203] total reward = 255.660614, length = 1114
[Episode 4204] total reward = 120.262299, length = 991
[Episode 4205] total reward = 255.422913, length = 1119
[Episode 4206] total reward = 145.008179, length = 1074
[Episode 4207] total reward = 260.215363, length = 1066
[Episode 4208] total reward = 76.527374, length = 756
[Episode 4209] total reward = 1.732151, length = 499
[Episode 4210] total reward = -106.085251, length = 131
[Episode 4211] total reward = -128.334320, length = 149
[Episode 4212] total reward = 254.933899, length = 1114
[Episode 4213] total reward = 257.290771, length = 1078
[Episode 4214] total reward = -1.405212, length = 497
[Episode 4215] total reward = 252.785187, length = 1128
[Episode 4216] total reward = 130.328278, length = 1093
[Episode 4217] total reward = -100.890808, length = 212
[Episode 4218] total reward = 115.816757, length = 1002
[Ep

100%|█████████▉| 573/574 [00:02<00:00, 241.96it/s]


[Episode 4401] total reward = 250.968903, length = 1140
[Episode 4402] total reward = 252.157700, length = 1127
[Episode 4403] total reward = -47.570183, length = 232
[Episode 4404] total reward = -48.505695, length = 236
[Episode 4405] total reward = 17.553383, length = 519
[Episode 4406] total reward = 255.081726, length = 1076
[Episode 4407] total reward = -1.770920, length = 408
[Episode 4408] total reward = -89.318306, length = 214
[Episode 4409] total reward = 41.286392, length = 573
[Episode 4410] total reward = -58.323631, length = 272
[Episode 4411] total reward = 103.912895, length = 912
[Episode 4412] total reward = -29.116005, length = 361
[Episode 4413] total reward = 259.175293, length = 1050
[Episode 4414] total reward = 104.038742, length = 806
[Episode 4415] total reward = -35.848656, length = 438
[Episode 4416] total reward = -20.129868, length = 362
[Episode 4417] total reward = -129.176743, length = 125
[Episode 4418] total reward = 260.418579, length = 1012
[Episod

100%|██████████| 402/402 [00:01<00:00, 240.96it/s]


[Episode 4601] total reward = -26.217060, length = 384
[Episode 4602] total reward = 15.746296, length = 546
[Episode 4603] total reward = 103.324387, length = 870
[Episode 4604] total reward = 261.565704, length = 1021
[Episode 4605] total reward = 92.563553, length = 772
[Episode 4606] total reward = 132.895370, length = 957
[Episode 4607] total reward = -39.401783, length = 354
[Episode 4608] total reward = 257.718353, length = 1070
[Episode 4609] total reward = -63.037132, length = 316
[Episode 4610] total reward = 10.036678, length = 509
[Episode 4611] total reward = -66.479141, length = 185
[Episode 4612] total reward = 40.986740, length = 671
[Episode 4613] total reward = 1.266712, length = 438
[Episode 4614] total reward = -31.087475, length = 319
[Episode 4615] total reward = 250.049255, length = 1152
[Episode 4616] total reward = 258.742706, length = 1051
[Episode 4617] total reward = 257.053345, length = 1056
[Episode 4618] total reward = 250.825592, length = 1136
[Episode 4

100%|██████████| 1008/1008 [00:04<00:00, 239.38it/s]


[Episode 4801] total reward = 257.988037, length = 1067
[Episode 4802] total reward = 259.735229, length = 1046
[Episode 4803] total reward = 156.606216, length = 1043
[Episode 4804] total reward = 143.503250, length = 1035
[Episode 4805] total reward = -46.008705, length = 251
[Episode 4806] total reward = 260.293640, length = 1033
[Episode 4807] total reward = 251.653137, length = 1126
[Episode 4808] total reward = -79.600502, length = 139
[Episode 4809] total reward = 22.977398, length = 509
[Episode 4810] total reward = 87.512497, length = 794
[Episode 4811] total reward = 258.719391, length = 1058
[Episode 4812] total reward = 4.547318, length = 451
[Episode 4813] total reward = -53.362183, length = 239
[Episode 4814] total reward = 256.162903, length = 1082
[Episode 4815] total reward = 252.736618, length = 1123
[Episode 4816] total reward = 20.940910, length = 561
[Episode 4817] total reward = -0.508469, length = 506
[Episode 4818] total reward = 76.925415, length = 795
[Episode

### 5.3 Load the Model and Play

In [15]:
save_dir = project_root + '/save'
model_path = os.path.join(save_dir, "model.pt")

if os.path.exists(model_path):
    print("Loading the model ... ", end="")
    checkpoint = torch.load(model_path)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")
else:
    print('ERROR: No model saved')

play(policy_net)

Loading the model ... Done.
[Evaluation] Total reward = 85.513852, length = 758


100%|█████████▉| 759/760 [00:02<00:00, 257.18it/s]
