# Proximal Policy Optimization (PPO) Tutorial

## 1. Environment Preparation




### 1.1 Download Packages for BipedalWalker-v3

In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install colabgymrender==1.0.2
!pip install box2d-py
!pip install 'gym[Box2D]'

Collecting colabgymrender==1.0.2
  Downloading https://files.pythonhosted.org/packages/94/de/8e81910e3d36183f91738ece48a68e543cb9567e07d83022419223058d8f/colabgymrender-1.0.2.tar.gz
Building wheels for collected packages: colabgymrender
  Building wheel for colabgymrender (setup.py) ... [?25l[?25hdone
  Created wheel for colabgymrender: filename=colabgymrender-1.0.2-cp37-none-any.whl size=2446 sha256=dd65d054c08d16709375913a090748a5dd0c5fae8941a6642a58637eea6e9e58
  Stored in directory: /root/.cache/pip/wheels/7e/75/39/ba42a666bfb04ef568d25f43e5e963bb30c275446271085240
Successfully built colabgymrender
Installing collected packages: colabgymrender
Successfully installed colabgymrender-1.0.2
Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 5.0MB/s 
[?25hInstalling collected packages: 

### 1.2 Mount Drive and Set Project Path

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import sys
project_root = '/content/drive/My Drive/ppo_tutorial/'
sys.path.append(project_root)

Mounted at /content/drive


### 1.3 Test the BipedalWalker-v3 Environment 

In [3]:
import os
import gym
import torch
import torch.nn as nn
import numpy as np
from colabgymrender.recorder import Recorder

env = gym.make('BipedalWalker-v3')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
print(s_dim)
print(a_dim)

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)2080768/45929032 bytes (4.5%)5349376/45929032 bytes (11.6%)8429568/45929032 bytes (18.4%)11493376/45929032 bytes (25.0%)14680064/45929032 bytes (32.0%)17645568/45929032 bytes (38.4%)20676608/45929032 bytes (45.0%)23830528/45929032 bytes (51.9%)26877952/45929032 bytes (58.5%)29843456/45929032 bytes (65.0%)32915456/45929032 bytes (71.7%)35823616/45929032 bytes (78.0%)

## 2. Policy Network & Value Network Construction

### 2.1 Diagonal Gaussian Distribution Module 

In [4]:
#AddBias module
class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))
    
    def forward(self, x):
        bias = self._bias.t().view(1, -1)
        return x + bias

#Gaussian distribution with given mean & std.
class FixedNormal(torch.distributions.Normal):
    def log_probs(self, x):
        return super().log_prob(x).sum(-1)
    
    def entropy(self):
        return super().entropy().sum(-1)

    def mode(self):
        return self.mean

#Diagonal Gaussian module
class DiagGaussian(nn.Module):
    def __init__(self, inp_dim, out_dim):
        super(DiagGaussian, self).__init__()
        self.fc_mean = nn.Linear(inp_dim, out_dim)
        self.b_logstd = AddBias(torch.zeros(out_dim))
    
    def forward(self, x):
        mean = self.fc_mean(x)
        logstd = self.b_logstd(torch.zeros_like(mean))
        return FixedNormal(mean, logstd.exp())

### 2.2 Policy Network & Value Network Module

In [5]:
#Policy Network
class PolicyNet(nn.Module):
    #Constructor
    def __init__(self, s_dim, a_dim):
        super(PolicyNet, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU()
        )
        self.dist = DiagGaussian(128, a_dim)
    
    #Forward pass
    def forward(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            action = dist.mode()
        else:
            action = dist.sample()
        
        return action, dist.log_probs(action)
    
    #Choose an action (stochastically or deterministically)
    def choose_action(self, state, deterministic=False):
        feature = self.main(state)
        dist = self.dist(feature)

        if deterministic:
            return dist.mode()

        return dist.sample()
    
    #Evaluate a state-action pair (output log-prob. & entropy)
    def evaluate(self, state, action):
        feature = self.main(state)
        dist = self.dist(feature)
        return dist.log_probs(action), dist.entropy()

#Value Network
class ValueNet(nn.Module):
    #Constructor
    def __init__(self, s_dim):
        super(ValueNet, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(s_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    
    #Forward pass
    def forward(self, state):
        return self.main(state)[:, 0]

### 2.3 Create Policy Network & Value Network

In [6]:
policy_net = PolicyNet(s_dim, a_dim)
value_net = ValueNet(s_dim)
print(policy_net)
print(value_net)

PolicyNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
  )
  (dist): DiagGaussian(
    (fc_mean): Linear(in_features=128, out_features=4, bias=True)
    (b_logstd): AddBias()
  )
)
ValueNet(
  (main): Sequential(
    (0): Linear(in_features=24, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


## 3. Environment Runner Construction

### 3.1 EnvRunner Class

In [7]:
class EnvRunner:
    #Constructor
    def __init__(self, s_dim, a_dim, gamma=0.99, lamb=0.95, max_step=2048, device='cpu'):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.gamma = gamma
        self.lamb = lamb
        self.max_step = max_step
        self.device = device

        #Storages (state, action, value, reward, a_logp)
        self.mb_states = np.zeros((self.max_step, self.s_dim), dtype=np.float32)
        self.mb_actions = np.zeros((self.max_step, self.a_dim), dtype=np.float32)
        self.mb_values = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_rewards = np.zeros((self.max_step,), dtype=np.float32)
        self.mb_a_logps = np.zeros((self.max_step,), dtype=np.float32)
    
    #Compute discounted return
    def compute_discounted_return(self, rewards, last_value):
        returns = np.zeros_like(rewards)
        n_step = len(rewards)

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                returns[t] = rewards[t] + self.gamma * last_value
            else:
                returns[t] = rewards[t] + self.gamma * returns[t+1]

        return returns
    
    #Compute generalized advantage estimation (Optional)
    def compute_gae(self, rewards, values, last_value):
        advs = np.zeros_like(rewards)
        n_step = len(rewards)
        last_gae_lam = 0.0

        for t in reversed(range(n_step)):
            if t == n_step - 1:
                next_value = last_value
            else:
                next_value = values[t+1]

            delta = rewards[t] + self.gamma*next_value - values[t]
            advs[t] = last_gae_lam = delta + self.gamma*self.lamb*last_gae_lam

        return advs + values

    #Run an episode using the policy net & value net
    def run(self, env, policy_net, value_net):
        #Run an episode
        state = env.reset()   #Initial state
        episode_len = self.max_step

        for step in range(self.max_step):
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
            action, a_logp = policy_net(state_tensor)
            value = value_net(state_tensor)

            action = action.cpu().numpy()[0]
            a_logp = a_logp.cpu().numpy()
            value  = value.cpu().numpy()

            self.mb_states[step] = state
            self.mb_actions[step] = action
            self.mb_a_logps[step] = a_logp
            self.mb_values[step] = value

            state, reward, done, info = env.step(action)
            self.mb_rewards[step] = reward

            if done:
                episode_len = step + 1
                break
        
        #Compute returns
        last_value = value_net(
            torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device=self.device)
        ).cpu().numpy()

        mb_returns = self.compute_discounted_return(self.mb_rewards[:episode_len], last_value)
        '''
        mb_returns = self.compute_gae(
            self.mb_rewards[:episode_len], 
            self.mb_values[:episode_len],
            last_value
        )
        '''
        return self.mb_states[:episode_len], \
                self.mb_actions[:episode_len], \
                self.mb_a_logps[:episode_len], \
                self.mb_values[:episode_len], \
                mb_returns, \
                self.mb_rewards[:episode_len]

### 3.2 Create EnvRunner

In [8]:
runner = EnvRunner(s_dim, a_dim)

## 4. PPO Algorithm

### 4.1 PPO Class

In [9]:
class PPO:
    #Constructor
    def __init__(self, policy_net, value_net, lr=1e-4, max_grad_norm=0.5, ent_weight=0.01, clip_val=0.2, sample_n_epoch=4, sample_mb_size=64, device='cpu'):
        self.policy_net = policy_net
        self.value_net = value_net
        self.max_grad_norm = max_grad_norm
        self.ent_weight = ent_weight
        self.clip_val = clip_val
        self.sample_n_epoch = sample_n_epoch
        self.sample_mb_size = sample_mb_size
        self.device = device
        self.opt_polcy = torch.optim.Adam(policy_net.parameters(), lr)
        self.opt_value = torch.optim.Adam(value_net.parameters(), lr)
    
    #Train the policy net & value net using PPO
    def train(self, mb_states, mb_actions, mb_old_values, mb_advs, mb_returns, mb_old_a_logps):
        #Convert numpy array to tensor
        mb_states = torch.from_numpy(mb_states).to(self.device)
        mb_actions = torch.from_numpy(mb_actions).to(self.device)
        mb_old_values = torch.from_numpy(mb_old_values).to(self.device)
        mb_advs = torch.from_numpy(mb_advs).to(self.device)
        mb_returns = torch.from_numpy(mb_returns).to(self.device)
        mb_old_a_logps = torch.from_numpy(mb_old_a_logps).to(self.device)
        episode_length = len(mb_states)
        rand_idx = np.arange(episode_length)
        sample_n_mb = episode_length // self.sample_mb_size

        if sample_n_mb <= 0:
            sample_mb_size = episode_length
            sample_n_mb = 1
        else:
            sample_mb_size = self.sample_mb_size

        for i in range(self.sample_n_epoch):
            np.random.shuffle(rand_idx)

            for j in range(sample_n_mb):
                #Randomly sample a batch for training
                sample_idx = rand_idx[j*sample_mb_size : (j+1)*sample_mb_size]
                sample_states = mb_states[sample_idx]
                sample_actions = mb_actions[sample_idx]
                sample_old_values = mb_old_values[sample_idx]
                sample_advs = mb_advs[sample_idx]
                sample_returns = mb_returns[sample_idx]
                sample_old_a_logps = mb_old_a_logps[sample_idx]

                sample_a_logps, sample_ents = self.policy_net.evaluate(sample_states, sample_actions)
                sample_values = self.value_net(sample_states)
                ent = sample_ents.mean()

                #Compute value loss
                v_pred_clip = sample_old_values + torch.clamp(sample_values - sample_old_values, -self.clip_val, self.clip_val)
                v_loss1 = (sample_returns - sample_values).pow(2)
                v_loss2 = (sample_returns - v_pred_clip).pow(2)
                v_loss = torch.max(v_loss1, v_loss2).mean()

                #Compute policy gradient loss
                ratio = (sample_a_logps - sample_old_a_logps).exp()
                pg_loss1 = -sample_advs * ratio
                pg_loss2 = -sample_advs * torch.clamp(ratio, 1.0-self.clip_val, 1.0+self.clip_val)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean() - self.ent_weight*ent

                #Train actor
                self.opt_polcy.zero_grad()
                pg_loss.backward()
                nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_grad_norm)
                self.opt_polcy.step()

                #Train critic
                self.opt_value.zero_grad()
                v_loss.backward()
                nn.utils.clip_grad_norm_(self.value_net.parameters(), self.max_grad_norm)
                self.opt_value.step()

        return pg_loss.item(), v_loss.item(), ent.item()

### 4.2 Create PPO Agent

In [10]:
agent = PPO(policy_net, value_net)

## 5. Training and Testing Process

### 5.1 Play an Episode for Evaluation

In [11]:
def play(policy_net):
    render_env = Recorder(gym.make('BipedalWalker-v3'), project_root + '/video')

    with torch.no_grad():
        state = render_env.reset()
        total_reward = 0
        length = 0

        while True:
            #render_env.render()
            state_tensor = torch.tensor(np.expand_dims(state, axis=0), dtype=torch.float32, device='cpu')
            action = policy_net.choose_action(state_tensor, deterministic=True).cpu().numpy()
            state, reward, done, info = render_env.step(action[0])
            total_reward += reward
            length += 1

            if done:
                print("[Evaluation] Total reward = {:.6f}, length = {:d}".format(total_reward, length), flush=True)
                break
    render_env.play()
    render_env.close()

In [12]:
play(policy_net)

[Evaluation] Total reward = -91.896360, length = 107


 99%|█████████▉| 108/109 [00:00<00:00, 262.70it/s]


### 5.2 Train the Networks using PPO

In [13]:
def train(env, runner, policy_net, value_net, agent, max_episode=5000):
    mean_total_reward = 0
    mean_length = 0
    save_dir = project_root + '/save'

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    for i in range(max_episode):
        #Run an episode to collect data
        with torch.no_grad():
            mb_states, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)
        
        #Train the model using the collected data
        pg_loss, v_loss, ent = agent.train(mb_states, mb_actions, mb_values, mb_advs, mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_states)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(i, mb_rewards.sum(), len(mb_states)))

        #Show the current result & save the model
        if i % 200 == 0:
            print("\n[{:5d} / {:5d}]".format(i, max_episode))
            print("----------------------------------")
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / 200))
            print("mean length = {:.2f}".format(mean_length / 200))
            print("\nSaving the model ... ", end="")
            torch.save({
                "it": i,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "model.pt"))
            print("Done.")
            print()
            play(policy_net)
            mean_total_reward = 0
            mean_length = 0

In [14]:
train(env, runner, policy_net, value_net, agent)
env.close()

[Episode    0] total reward = -100.726105, length = 58

[    0 /  5000]
----------------------------------
actor loss = -0.064683
critic loss = 5872.854004
entropy = 5.676354
mean return = -0.503631
mean length = 0.29

Saving the model ... Done.

[Evaluation] Total reward = -92.398783, length = 106


100%|██████████| 108/108 [00:00<00:00, 267.36it/s]


[Episode    1] total reward = -204.640762, length = 1031
[Episode    2] total reward = -101.502663, length = 71
[Episode    3] total reward = -120.150291, length = 102
[Episode    4] total reward = -108.430710, length = 1600
[Episode    5] total reward = -101.028984, length = 67
[Episode    6] total reward = -130.418747, length = 128
[Episode    7] total reward = -103.032234, length = 82
[Episode    8] total reward = -102.464142, length = 66
[Episode    9] total reward = -102.171600, length = 56
[Episode   10] total reward = -98.492165, length = 90
[Episode   11] total reward = -114.213806, length = 52
[Episode   12] total reward = -103.883438, length = 58
[Episode   13] total reward = -101.103027, length = 74
[Episode   14] total reward = -103.736259, length = 82
[Episode   15] total reward = -127.204727, length = 104
[Episode   16] total reward = -100.017090, length = 86
[Episode   17] total reward = -98.075729, length = 1600
[Episode   18] total reward = -116.049957, length = 63
[Ep

 99%|█████████▉| 114/115 [00:00<00:00, 257.48it/s]


[Episode  201] total reward = -118.356674, length = 64
[Episode  202] total reward = -111.698143, length = 62
[Episode  203] total reward = -103.272301, length = 56
[Episode  204] total reward = -108.632858, length = 55
[Episode  205] total reward = -99.083115, length = 110
[Episode  206] total reward = -109.133118, length = 51
[Episode  207] total reward = -112.712479, length = 85
[Episode  208] total reward = -117.089157, length = 159
[Episode  209] total reward = -142.641113, length = 1066
[Episode  210] total reward = -50.648354, length = 1600
[Episode  211] total reward = -106.942024, length = 71
[Episode  212] total reward = -64.845367, length = 1600
[Episode  213] total reward = -106.015991, length = 52
[Episode  214] total reward = -114.473694, length = 146
[Episode  215] total reward = -108.575813, length = 87
[Episode  216] total reward = -112.226074, length = 82
[Episode  217] total reward = -53.641434, length = 1600
[Episode  218] total reward = -161.816711, length = 1366
[

100%|██████████| 66/66 [00:00<00:00, 261.47it/s]


[Episode  401] total reward = -92.752304, length = 1307
[Episode  402] total reward = 29.778091, length = 1600
[Episode  403] total reward = 22.646507, length = 1600
[Episode  404] total reward = -108.195946, length = 62
[Episode  405] total reward = 48.109631, length = 1600
[Episode  406] total reward = -112.816071, length = 65
[Episode  407] total reward = -117.251099, length = 130
[Episode  408] total reward = -113.757683, length = 169
[Episode  409] total reward = 47.462372, length = 1600
[Episode  410] total reward = -111.650078, length = 72
[Episode  411] total reward = -99.704239, length = 1072
[Episode  412] total reward = 44.181351, length = 1600
[Episode  413] total reward = -100.746193, length = 259
[Episode  414] total reward = -120.212860, length = 117
[Episode  415] total reward = -109.695839, length = 253
[Episode  416] total reward = 30.500816, length = 1600
[Episode  417] total reward = -104.799248, length = 74
[Episode  418] total reward = -115.067375, length = 62
[Ep

100%|██████████| 57/57 [00:00<00:00, 255.30it/s]


[Episode  601] total reward = -106.942047, length = 62
[Episode  602] total reward = -113.596550, length = 65
[Episode  603] total reward = 48.217804, length = 1600
[Episode  604] total reward = 31.726194, length = 1600
[Episode  605] total reward = 36.221970, length = 1600
[Episode  606] total reward = -103.567345, length = 47
[Episode  607] total reward = 51.874870, length = 1600
[Episode  608] total reward = -105.215332, length = 56
[Episode  609] total reward = -104.930511, length = 98
[Episode  610] total reward = 46.044601, length = 1600
[Episode  611] total reward = -119.660637, length = 135
[Episode  612] total reward = -106.170525, length = 68
[Episode  613] total reward = -101.315300, length = 102
[Episode  614] total reward = 44.282074, length = 1600
[Episode  615] total reward = -103.173439, length = 60
[Episode  616] total reward = -101.417091, length = 57
[Episode  617] total reward = 63.056152, length = 1600
[Episode  618] total reward = -107.198380, length = 60
[Episode

100%|██████████| 1393/1393 [00:05<00:00, 277.24it/s]


[Episode  801] total reward = -38.348915, length = 934
[Episode  802] total reward = 164.327271, length = 1600
[Episode  803] total reward = -118.799713, length = 53
[Episode  804] total reward = 162.401855, length = 1600
[Episode  805] total reward = 162.635773, length = 1600
[Episode  806] total reward = 167.462616, length = 1600
[Episode  807] total reward = -113.312698, length = 59
[Episode  808] total reward = -114.363510, length = 76
[Episode  809] total reward = -66.343414, length = 486
[Episode  810] total reward = 144.157288, length = 1600
[Episode  811] total reward = 6.874947, length = 768
[Episode  812] total reward = -108.886757, length = 58
[Episode  813] total reward = 209.877441, length = 1600
[Episode  814] total reward = 169.432556, length = 1600
[Episode  815] total reward = 187.779144, length = 1600
[Episode  816] total reward = -107.334686, length = 46
[Episode  817] total reward = 180.037262, length = 1600
[Episode  818] total reward = -36.932457, length = 676
[Ep

100%|█████████▉| 1044/1045 [00:03<00:00, 272.93it/s]


[Episode 1001] total reward = -56.337631, length = 565
[Episode 1002] total reward = -82.784790, length = 266
[Episode 1003] total reward = -120.653168, length = 52
[Episode 1004] total reward = -114.613525, length = 57
[Episode 1005] total reward = -43.393814, length = 636
[Episode 1006] total reward = -119.229050, length = 63
[Episode 1007] total reward = -106.964073, length = 45
[Episode 1008] total reward = 54.788326, length = 1351
[Episode 1009] total reward = -111.276184, length = 63
[Episode 1010] total reward = 52.802986, length = 1255
[Episode 1011] total reward = 19.662521, length = 960
[Episode 1012] total reward = -105.449333, length = 104
[Episode 1013] total reward = 9.886837, length = 839
[Episode 1014] total reward = -50.860710, length = 437
[Episode 1015] total reward = 185.028580, length = 1600
[Episode 1016] total reward = -117.156342, length = 46
[Episode 1017] total reward = 215.929840, length = 1600
[Episode 1018] total reward = -116.521790, length = 45
[Episode 1

100%|██████████| 75/75 [00:00<00:00, 255.52it/s]


[Episode 1201] total reward = -112.773384, length = 55
[Episode 1202] total reward = 207.016663, length = 1600
[Episode 1203] total reward = -114.555489, length = 114
[Episode 1204] total reward = -119.287659, length = 67
[Episode 1205] total reward = -115.132843, length = 64
[Episode 1206] total reward = -116.362976, length = 65
[Episode 1207] total reward = -116.590034, length = 58
[Episode 1208] total reward = -123.345955, length = 79
[Episode 1209] total reward = 35.845921, length = 1217
[Episode 1210] total reward = -119.798195, length = 67
[Episode 1211] total reward = -118.823021, length = 65
[Episode 1212] total reward = -110.206100, length = 65
[Episode 1213] total reward = -31.404312, length = 768
[Episode 1214] total reward = -115.801880, length = 65
[Episode 1215] total reward = -118.873100, length = 60
[Episode 1216] total reward = -116.233589, length = 56
[Episode 1217] total reward = -115.892036, length = 55
[Episode 1218] total reward = -114.222946, length = 62
[Episode

100%|██████████| 73/73 [00:00<00:00, 272.56it/s]


[Episode 1401] total reward = 195.148041, length = 1600
[Episode 1402] total reward = 178.721588, length = 1600
[Episode 1403] total reward = 173.054321, length = 1600
[Episode 1404] total reward = 199.409668, length = 1600
[Episode 1405] total reward = -106.628838, length = 53
[Episode 1406] total reward = 172.093445, length = 1600
[Episode 1407] total reward = 170.799835, length = 1600
[Episode 1408] total reward = -41.889320, length = 817
[Episode 1409] total reward = -114.430504, length = 64
[Episode 1410] total reward = 213.925034, length = 1600
[Episode 1411] total reward = 190.360428, length = 1600
[Episode 1412] total reward = -117.721878, length = 67
[Episode 1413] total reward = -110.069374, length = 66
[Episode 1414] total reward = 177.393463, length = 1600
[Episode 1415] total reward = -111.354980, length = 67
[Episode 1416] total reward = 191.300140, length = 1600
[Episode 1417] total reward = 20.487759, length = 1079
[Episode 1418] total reward = 212.704041, length = 1600

100%|██████████| 1378/1378 [00:05<00:00, 271.83it/s]


[Episode 1601] total reward = 202.636642, length = 1600
[Episode 1602] total reward = 214.473160, length = 1574
[Episode 1603] total reward = 211.709076, length = 1597
[Episode 1604] total reward = 224.667664, length = 1451
[Episode 1605] total reward = -119.175972, length = 101
[Episode 1606] total reward = 45.767723, length = 1298
[Episode 1607] total reward = 217.548019, length = 1516
[Episode 1608] total reward = 209.548965, length = 1600
[Episode 1609] total reward = -51.429443, length = 483
[Episode 1610] total reward = 209.124573, length = 1600
[Episode 1611] total reward = 190.619751, length = 1600
[Episode 1612] total reward = 220.625412, length = 1514
[Episode 1613] total reward = 218.827087, length = 1511
[Episode 1614] total reward = 198.855972, length = 1600
[Episode 1615] total reward = 224.857300, length = 1452
[Episode 1616] total reward = 217.072235, length = 1537
[Episode 1617] total reward = -26.108765, length = 756
[Episode 1618] total reward = 222.733887, length = 

100%|██████████| 73/73 [00:00<00:00, 271.77it/s]


[Episode 1801] total reward = -32.242485, length = 659
[Episode 1802] total reward = 223.117859, length = 1472
[Episode 1803] total reward = 159.014648, length = 1600
[Episode 1804] total reward = 77.533867, length = 1375
[Episode 1805] total reward = 213.807983, length = 1561
[Episode 1806] total reward = -20.902672, length = 633
[Episode 1807] total reward = 214.766434, length = 1532
[Episode 1808] total reward = -115.129028, length = 61
[Episode 1809] total reward = -114.296608, length = 53
[Episode 1810] total reward = -138.586060, length = 126
[Episode 1811] total reward = -121.385681, length = 59
[Episode 1812] total reward = 218.231125, length = 1522
[Episode 1813] total reward = -116.465164, length = 61
[Episode 1814] total reward = -127.449753, length = 90
[Episode 1815] total reward = -116.917473, length = 45
[Episode 1816] total reward = 4.422207, length = 698
[Episode 1817] total reward = -111.763512, length = 42
[Episode 1818] total reward = 222.562057, length = 1472
[Epis

100%|██████████| 1207/1207 [00:04<00:00, 262.58it/s]


[Episode 2001] total reward = -126.042007, length = 97
[Episode 2002] total reward = -38.768005, length = 466
[Episode 2003] total reward = 230.956299, length = 1356
[Episode 2004] total reward = 93.081314, length = 1204
[Episode 2005] total reward = -116.925385, length = 43
[Episode 2006] total reward = -123.815117, length = 47
[Episode 2007] total reward = 232.656631, length = 1346
[Episode 2008] total reward = -111.888329, length = 58
[Episode 2009] total reward = -110.545197, length = 154
[Episode 2010] total reward = -114.613480, length = 38
[Episode 2011] total reward = 232.894623, length = 1342
[Episode 2012] total reward = -116.743240, length = 41
[Episode 2013] total reward = -107.440994, length = 189
[Episode 2014] total reward = -9.053169, length = 748
[Episode 2015] total reward = -10.228382, length = 666
[Episode 2016] total reward = 241.347748, length = 1249
[Episode 2017] total reward = -96.474251, length = 236
[Episode 2018] total reward = 63.657990, length = 1115
[Epis

100%|██████████| 1131/1131 [00:04<00:00, 261.33it/s]


[Episode 2201] total reward = 238.657837, length = 1305
[Episode 2202] total reward = -116.559464, length = 64
[Episode 2203] total reward = 4.905663, length = 804
[Episode 2204] total reward = -130.360458, length = 149
[Episode 2205] total reward = -117.458549, length = 48
[Episode 2206] total reward = -24.598701, length = 516
[Episode 2207] total reward = -113.959740, length = 87
[Episode 2208] total reward = 35.182419, length = 864
[Episode 2209] total reward = -116.706200, length = 41
[Episode 2210] total reward = -16.077728, length = 565
[Episode 2211] total reward = -125.940514, length = 137
[Episode 2212] total reward = 9.003151, length = 653
[Episode 2213] total reward = 34.874500, length = 899
[Episode 2214] total reward = -69.711960, length = 241
[Episode 2215] total reward = -77.905136, length = 270
[Episode 2216] total reward = 10.921844, length = 811
[Episode 2217] total reward = -118.933624, length = 101
[Episode 2218] total reward = -13.377232, length = 604
[Episode 2219

100%|██████████| 1275/1275 [00:05<00:00, 252.18it/s]


[Episode 2401] total reward = 232.747620, length = 1362
[Episode 2402] total reward = 219.118759, length = 1511
[Episode 2403] total reward = 223.192368, length = 1461
[Episode 2404] total reward = 228.665024, length = 1402
[Episode 2405] total reward = -118.847610, length = 66
[Episode 2406] total reward = 217.337219, length = 1522
[Episode 2407] total reward = 98.985161, length = 1458
[Episode 2408] total reward = -71.412277, length = 375
[Episode 2409] total reward = 218.964844, length = 1485
[Episode 2410] total reward = 227.164795, length = 1397
[Episode 2411] total reward = 219.868729, length = 1474
[Episode 2412] total reward = -116.331696, length = 519
[Episode 2413] total reward = -88.845963, length = 319
[Episode 2414] total reward = -22.673283, length = 876
[Episode 2415] total reward = -98.161850, length = 163
[Episode 2416] total reward = 225.253387, length = 1423
[Episode 2417] total reward = 227.605423, length = 1398
[Episode 2418] total reward = 224.959152, length = 142

100%|█████████▉| 1161/1162 [00:04<00:00, 258.56it/s]


[Episode 2601] total reward = 226.786545, length = 1351
[Episode 2602] total reward = 212.876144, length = 1508
[Episode 2603] total reward = -67.744469, length = 391
[Episode 2604] total reward = 218.811798, length = 1422
[Episode 2605] total reward = -63.263729, length = 311
[Episode 2606] total reward = -117.748856, length = 41
[Episode 2607] total reward = 220.527100, length = 1419
[Episode 2608] total reward = 191.662811, length = 1600
[Episode 2609] total reward = 14.518791, length = 1070
[Episode 2610] total reward = 213.859863, length = 1480
[Episode 2611] total reward = 225.453491, length = 1361
[Episode 2612] total reward = 215.808731, length = 1458
[Episode 2613] total reward = 203.281616, length = 1588
[Episode 2614] total reward = 225.466309, length = 1367
[Episode 2615] total reward = 216.804779, length = 1433
[Episode 2616] total reward = 78.278954, length = 1073
[Episode 2617] total reward = 232.395996, length = 1294
[Episode 2618] total reward = 221.115723, length = 14

 99%|█████████▊| 69/70 [00:00<00:00, 259.53it/s]


[Episode 2801] total reward = 251.267426, length = 1119
[Episode 2802] total reward = 248.229187, length = 1144
[Episode 2803] total reward = -116.600800, length = 65
[Episode 2804] total reward = -23.428719, length = 448
[Episode 2805] total reward = 230.208420, length = 1321
[Episode 2806] total reward = -113.949341, length = 75
[Episode 2807] total reward = -121.098694, length = 107
[Episode 2808] total reward = -114.418457, length = 39
[Episode 2809] total reward = 237.940186, length = 1259
[Episode 2810] total reward = -14.847847, length = 463
[Episode 2811] total reward = 240.498978, length = 1217
[Episode 2812] total reward = 249.222870, length = 1139
[Episode 2813] total reward = 47.277420, length = 720
[Episode 2814] total reward = -113.386391, length = 60
[Episode 2815] total reward = 241.201050, length = 1222
[Episode 2816] total reward = 243.980301, length = 1200
[Episode 2817] total reward = 242.880875, length = 1197
[Episode 2818] total reward = 235.734863, length = 1259


100%|██████████| 103/103 [00:00<00:00, 257.95it/s]


[Episode 3001] total reward = -34.061157, length = 365
[Episode 3002] total reward = -120.061325, length = 69
[Episode 3003] total reward = 249.722244, length = 1139
[Episode 3004] total reward = 251.639908, length = 1127
[Episode 3005] total reward = 245.187881, length = 1187
[Episode 3006] total reward = -78.129921, length = 202
[Episode 3007] total reward = -52.603996, length = 373
[Episode 3008] total reward = -43.290062, length = 365
[Episode 3009] total reward = -126.933159, length = 102
[Episode 3010] total reward = -119.094276, length = 64
[Episode 3011] total reward = 239.881958, length = 1246
[Episode 3012] total reward = 241.636932, length = 1242
[Episode 3013] total reward = -78.751678, length = 263
[Episode 3014] total reward = -117.668526, length = 110
[Episode 3015] total reward = -125.408333, length = 118
[Episode 3016] total reward = 248.939148, length = 1160
[Episode 3017] total reward = -126.734627, length = 87
[Episode 3018] total reward = 126.348633, length = 1128


100%|██████████| 1015/1015 [00:03<00:00, 263.67it/s]


[Episode 3201] total reward = 236.887146, length = 1272
[Episode 3202] total reward = 101.644249, length = 1063
[Episode 3203] total reward = 237.683609, length = 1269
[Episode 3204] total reward = 130.414825, length = 1173
[Episode 3205] total reward = 243.142395, length = 1221
[Episode 3206] total reward = 249.268753, length = 1151
[Episode 3207] total reward = -117.351303, length = 151
[Episode 3208] total reward = 250.039764, length = 1127
[Episode 3209] total reward = -113.690437, length = 66
[Episode 3210] total reward = 244.515594, length = 1190
[Episode 3211] total reward = 246.388947, length = 1170
[Episode 3212] total reward = 242.378662, length = 1221
[Episode 3213] total reward = 247.636398, length = 1152
[Episode 3214] total reward = 237.389496, length = 1259
[Episode 3215] total reward = -113.963501, length = 283
[Episode 3216] total reward = 137.203979, length = 1122
[Episode 3217] total reward = 243.821747, length = 1194
[Episode 3218] total reward = -54.745773, length 

100%|██████████| 997/997 [00:03<00:00, 258.61it/s]


[Episode 3401] total reward = 55.687790, length = 704
[Episode 3402] total reward = 255.685791, length = 1059
[Episode 3403] total reward = 254.036743, length = 1076
[Episode 3404] total reward = 254.456665, length = 1069
[Episode 3405] total reward = 250.805756, length = 1100
[Episode 3406] total reward = -56.869778, length = 359
[Episode 3407] total reward = -46.608727, length = 329
[Episode 3408] total reward = 256.207886, length = 1041
[Episode 3409] total reward = 20.546474, length = 598
[Episode 3410] total reward = -12.098328, length = 469
[Episode 3411] total reward = -74.904129, length = 217
[Episode 3412] total reward = 26.303978, length = 681
[Episode 3413] total reward = 252.657730, length = 1077
[Episode 3414] total reward = -11.365139, length = 597
[Episode 3415] total reward = 258.419464, length = 1022
[Episode 3416] total reward = 241.965485, length = 1197
[Episode 3417] total reward = 115.862770, length = 897
[Episode 3418] total reward = 244.517456, length = 1154
[Epi

100%|██████████| 79/79 [00:00<00:00, 266.65it/s]


[Episode 3601] total reward = -33.197662, length = 543
[Episode 3602] total reward = 232.627350, length = 1280
[Episode 3603] total reward = 236.840210, length = 1257
[Episode 3604] total reward = 22.110432, length = 684
[Episode 3605] total reward = -61.842739, length = 429
[Episode 3606] total reward = 233.907684, length = 1281
[Episode 3607] total reward = 236.047836, length = 1256
[Episode 3608] total reward = -104.822601, length = 142
[Episode 3609] total reward = 115.634659, length = 1318
[Episode 3610] total reward = -38.789761, length = 601
[Episode 3611] total reward = -123.470596, length = 84
[Episode 3612] total reward = 241.827667, length = 1191
[Episode 3613] total reward = 225.085373, length = 1364
[Episode 3614] total reward = 10.511883, length = 673
[Episode 3615] total reward = -49.701153, length = 429
[Episode 3616] total reward = 234.461548, length = 1268
[Episode 3617] total reward = 79.505142, length = 1131
[Episode 3618] total reward = 237.453171, length = 1234
[E

100%|██████████| 1005/1005 [00:03<00:00, 261.62it/s]


[Episode 3801] total reward = 230.478180, length = 1331
[Episode 3802] total reward = -123.439499, length = 99
[Episode 3803] total reward = 238.083344, length = 1234
[Episode 3804] total reward = 240.824005, length = 1205
[Episode 3805] total reward = -113.227409, length = 49
[Episode 3806] total reward = 245.202393, length = 1160
[Episode 3807] total reward = -130.268127, length = 105
[Episode 3808] total reward = 245.773697, length = 1177
[Episode 3809] total reward = 237.944611, length = 1250
[Episode 3810] total reward = -31.379698, length = 501
[Episode 3811] total reward = 243.735870, length = 1189
[Episode 3812] total reward = 239.762772, length = 1227
[Episode 3813] total reward = -134.102890, length = 111
[Episode 3814] total reward = 245.317886, length = 1164
[Episode 3815] total reward = 245.257477, length = 1149
[Episode 3816] total reward = -117.018967, length = 64
[Episode 3817] total reward = 247.053696, length = 1181
[Episode 3818] total reward = -111.055588, length = 

 99%|█████████▊| 75/76 [00:00<00:00, 261.48it/s]


[Episode 4001] total reward = -50.208549, length = 344
[Episode 4002] total reward = -118.635475, length = 72
[Episode 4003] total reward = -126.502060, length = 112
[Episode 4004] total reward = -118.285622, length = 72
[Episode 4005] total reward = 228.087265, length = 1354
[Episode 4006] total reward = -61.405533, length = 307
[Episode 4007] total reward = 231.007828, length = 1328
[Episode 4008] total reward = -120.692429, length = 72
[Episode 4009] total reward = 230.289673, length = 1323
[Episode 4010] total reward = 244.661072, length = 1191
[Episode 4011] total reward = -57.219055, length = 320
[Episode 4012] total reward = -88.882904, length = 257
[Episode 4013] total reward = -118.681473, length = 72
[Episode 4014] total reward = 234.847260, length = 1293
[Episode 4015] total reward = -22.282768, length = 567
[Episode 4016] total reward = -117.117386, length = 167
[Episode 4017] total reward = 115.669373, length = 1209
[Episode 4018] total reward = -123.199615, length = 98
[E

100%|██████████| 70/70 [00:00<00:00, 247.30it/s]


[Episode 4201] total reward = -123.115669, length = 108
[Episode 4202] total reward = -56.930031, length = 348
[Episode 4203] total reward = -115.356491, length = 104
[Episode 4204] total reward = -117.140213, length = 57
[Episode 4205] total reward = -122.957367, length = 123
[Episode 4206] total reward = -110.760223, length = 37
[Episode 4207] total reward = 239.123260, length = 1240
[Episode 4208] total reward = -120.156036, length = 127
[Episode 4209] total reward = 238.212723, length = 1216
[Episode 4210] total reward = 233.304291, length = 1290
[Episode 4211] total reward = 242.144531, length = 1200
[Episode 4212] total reward = -3.420776, length = 604
[Episode 4213] total reward = -123.020309, length = 144
[Episode 4214] total reward = 241.871811, length = 1209
[Episode 4215] total reward = -111.268867, length = 115
[Episode 4216] total reward = -117.268204, length = 52
[Episode 4217] total reward = -114.544144, length = 88
[Episode 4218] total reward = -113.974220, length = 53


100%|█████████▉| 1098/1099 [00:04<00:00, 250.97it/s]


[Episode 4401] total reward = -119.712265, length = 44
[Episode 4402] total reward = 202.810364, length = 1599
[Episode 4403] total reward = 94.124634, length = 1366
[Episode 4404] total reward = -124.372292, length = 81
[Episode 4405] total reward = 225.030975, length = 1374
[Episode 4406] total reward = 98.798874, length = 1328
[Episode 4407] total reward = 229.467102, length = 1328
[Episode 4408] total reward = -115.548027, length = 52
[Episode 4409] total reward = 230.629868, length = 1327
[Episode 4410] total reward = 221.039108, length = 1421
[Episode 4411] total reward = 234.790283, length = 1281
[Episode 4412] total reward = 235.156784, length = 1256
[Episode 4413] total reward = 236.724792, length = 1257
[Episode 4414] total reward = -122.852211, length = 72
[Episode 4415] total reward = -130.432693, length = 110
[Episode 4416] total reward = -115.369087, length = 55
[Episode 4417] total reward = -116.413803, length = 97
[Episode 4418] total reward = -115.225372, length = 54
[

100%|██████████| 1000/1000 [00:03<00:00, 253.86it/s]


[Episode 4601] total reward = -52.519630, length = 338
[Episode 4602] total reward = 238.579071, length = 1226
[Episode 4603] total reward = 241.933945, length = 1184
[Episode 4604] total reward = 248.080139, length = 1112
[Episode 4605] total reward = 39.998238, length = 814
[Episode 4606] total reward = -132.300980, length = 119
[Episode 4607] total reward = -75.890404, length = 237
[Episode 4608] total reward = 235.947006, length = 1244
[Episode 4609] total reward = -131.136017, length = 122
[Episode 4610] total reward = 36.222031, length = 662
[Episode 4611] total reward = 242.152069, length = 1188
[Episode 4612] total reward = 236.373260, length = 1255
[Episode 4613] total reward = 240.390076, length = 1210
[Episode 4614] total reward = 2.209774, length = 639
[Episode 4615] total reward = 76.010658, length = 837
[Episode 4616] total reward = 243.239944, length = 1171
[Episode 4617] total reward = -122.764099, length = 126
[Episode 4618] total reward = -113.578445, length = 269
[Ep

100%|██████████| 690/690 [00:02<00:00, 253.91it/s]


[Episode 4801] total reward = 249.623474, length = 1085
[Episode 4802] total reward = 253.588013, length = 1068
[Episode 4803] total reward = 241.963821, length = 1161
[Episode 4804] total reward = 253.905334, length = 1040
[Episode 4805] total reward = -112.867630, length = 43
[Episode 4806] total reward = -19.134758, length = 673
[Episode 4807] total reward = 246.387024, length = 1138
[Episode 4808] total reward = -109.802513, length = 57
[Episode 4809] total reward = 244.366425, length = 1158
[Episode 4810] total reward = 243.761200, length = 1132
[Episode 4811] total reward = 249.348709, length = 1092
[Episode 4812] total reward = 247.852509, length = 1111
[Episode 4813] total reward = 246.663574, length = 1121
[Episode 4814] total reward = 250.244934, length = 1102
[Episode 4815] total reward = 240.546204, length = 1193
[Episode 4816] total reward = 248.437668, length = 1115
[Episode 4817] total reward = 246.647110, length = 1141
[Episode 4818] total reward = 255.978745, length = 

### 5.3 Load the Model and Play

In [15]:
save_dir = project_root + '/save'
model_path = os.path.join(save_dir, "model.pt")

if os.path.exists(model_path):
    print("Loading the model ... ", end="")
    checkpoint = torch.load(model_path)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")
else:
    print('ERROR: No model saved')

play(policy_net)

Loading the model ... Done.
[Evaluation] Total reward = 263.157728, length = 952


100%|██████████| 954/954 [00:03<00:00, 265.67it/s]
