In [1]:
!pip install gym
!pip install stable_baselines3



In [2]:
from gym import Env
from gym.spaces import Discrete, Box
import random
import numpy as np
import torch
from stable_baselines3 import SAC, PPO
import gym

In [115]:
### Define a two line manifold environment ###
class simEnv(Env):
    def __init__(self):
        # Action agent can take
        self.action_space = Discrete(2)
        # Work space
        self.observation_space = Box(np.array([-10, 10]), np.array([-10, 10]))
        # Initial starting point
        self.state = np.array([0, 0])
        # Goal state
        self.goal = np.array([10, 10])
        # Intersection point
        self.intersection = np.array([10, 0])
        # Indicator whether next manifold is reached or not
        self.next_manifold = False
    
    def step(self, action):
        if self.next_manifold:
            if action == 0:
                self.state[1] += 1
            else: 
                self.state[1] -= 1
        else:
            if action == 0:
                self.state[0] += 1
            else: 
                self.state[0] -= 1
        if self.state[0] == 10:
            self.next_manifold = True
        done = False
        # Design rewards
        if self.state[1] != 0 and self.state[0] != 10: # Fall off the manifold
            reward = -100
            done = True
        elif np.array_equal(self.state, self.intersection): # Get to the intersection point
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else:
                reward = 10
        elif np.array_equal(self.state, self.goal): # Reach the goal 
            reward = 100
            done = True
        else: # Direct the agent to intersection point / goal
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else: 
                reward = 1 / np.linalg.norm(self.state - self.intersection)
        # Set placeholder for info 
        info = {}
        return self.state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state = np.array([0,0])
        self.next_manifold = False
        return self.state

In [53]:
### Define a two line manifold environment ###
class simEnv1(Env):
    def __init__(self):
        # Action agent can take
        self.action_space = Discrete(4)
        # Work space
        self.observation_space = Box(np.array([-10, 10]), np.array([-10, 10]))
        # Initial starting point
        self.state = np.array([0, 0])
        # Goal state
        self.goal = np.array([10, 10])
        # Intersection point
        self.intersection = np.array([10, 0])
        # Indicator whether next manifold is reached or not
        self.next_manifold = False
    
    def step(self, action):
        # action == 0 => moving up
        if action == 0:
            self.state[1] += 1
        # action == 1 => moving down 
        elif action == 1:
            self.state[1] -= 1
        # action == 2 => moving right
        elif action == 2:
            self.state[0] += 1
        # action == 3 => moving left
        elif action == 3:
            self.state[0] -= 1
        # If action is not valid, error out
        else:
            return
        if self.state[0] == 10:
            self.next_manifold = True
        done = False
        # Design rewards
        if self.state[1] != 0 and self.state[0] != 10: # Fall off the manifold
            reward = -100
            done = True
        elif np.array_equal(self.state, self.intersection): # Get to the intersection point
            reward = 10
        elif np.array_equal(self.state, self.goal): # Reach the goal 
            reward = 100
            done = True
        else: # Direct the agent to intersection point / goal
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else: 
                reward = 1 / np.linalg.norm(self.state - self.intersection)
        # Set placeholder for info 
        info = {}
        return self.state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state = np.array([0,0])
        self.next_manifold = False
        return self.state

In [78]:
### Define a two line manifold environment ###
class simEnv2(Env):
    def __init__(self):
        # Action agent can take
        self.action_space = Discrete(4)
        # Work space
        self.observation_space = Box(np.array([-10, 10]), np.array([-10, 10]))
        # Initial starting point
        self.state = np.array([0, 0])
        # Goal state
        self.goal = np.array([10, 10])
        # Intersection point
        self.intersection = np.array([10, 0])
        # Indicator whether next manifold is reached or not
        self.next_manifold = False
    
    def step(self, action):
        # action == 0 => moving up
        if action == 0:
            self.state[1] += 1
        # action == 1 => moving down 
        elif action == 1:
            self.state[1] -= 1
        # action == 2 => moving right
        elif action == 2:
            self.state[0] += 1
        # action == 3 => moving left
        elif action == 3:
            self.state[0] -= 1
        # If action is not valid, error out
        else:
            return
        if self.state[0] == 10:
            self.next_manifold = True
        done = False
        # Design rewards
        if self.next_manifold and self.state[0] != 10: # After reaching the intersection,/
                                                       # discourge of going back to previous manifold
            reward = -100
            done = True 
        elif self.state[1] != 0 and self.state[0] != 10: # Fall off the manifold
            reward = -100
            done = True
        elif np.array_equal(self.state, self.intersection): # Get to the intersection point
            reward = 10
        elif np.array_equal(self.state, self.goal): # Reach the goal 
            reward = 100
            done = True
        else: # Direct the agent to intersection point / goal
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else: 
                reward = 1 / np.linalg.norm(self.state - self.intersection)
        # Set placeholder for info 
        info = {}
        return self.state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state = np.array([0,0])
        self.next_manifold = False
        return self.state

In [None]:
### Define a two line manifold environment ### ### With simplified Action Space ###
class simEnv3(Env):
    def __init__(self):
        # Action agent can take
        self.action_space = Discrete(2)
        # Work space
        self.observation_space = Box(np.array([-10, 10]), np.array([-10, 10]))
        # Initial starting point
        self.state = np.array([0, 0])
        # Goal state
        self.goal = np.array([10, 10])
        # Intersection point
        self.intersection = np.array([10, 0])
        # Indicator whether next manifold is reached or not
        self.next_manifold = False
    
    def step(self, action):
        if self.next_manifold:
            if action == 0:
                self.state[1] += 1
            else: 
                self.state[1] -= 1
        else:
            if action == 0:
                self.state[0] += 1
            else: 
                self.state[0] -= 1
        if self.state[0] == 10:
            self.next_manifold = True
        done = False
        # Design rewards
        if self.state[1] != 0 and self.state[0] != 10: # Fall off the manifold
            reward = -100
            done = True
        elif np.array_equal(self.state, self.intersection): # Get to the intersection point
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else:
                reward = 10
        elif np.array_equal(self.state, self.goal): # Reach the goal 
            reward = 100
            done = True
        else: # Direct the agent to intersection point / goal
            if self.next_manifold:
                reward = 1 / np.linalg.norm(self.state - self.goal)
            else: 
                reward = 1 / np.linalg.norm(self.state - self.intersection)
        # Set placeholder for info 
        info = {}
        return self.state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state = np.array([0,0])
        self.next_manifold = False
        return self.state

In [116]:
env = simEnv()

In [117]:
device = torch.device(
        "cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = PPO('MlpPolicy', env, verbose=1, device=device)
model = model.learn(total_timesteps=500000, eval_freq=1000)

Using cuda:0 device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1411 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 662         |
|    ep_rew_mean          | 145         |
| time/                   |             |
|    fps                  | 935         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016478904 |
|    clip_fraction        | 0.275       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.677      |
|    explained_variance   | 0.732       |
|    learning_rate        | 0.0003      |
|    loss            

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 662         |
|    ep_rew_mean          | 145         |
| time/                   |             |
|    fps                  | 777         |
|    iterations           | 11          |
|    time_elapsed         | 28          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.010329092 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.593      |
|    explained_variance   | 0.853       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.527       |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0113     |
|    value_loss           | 1.21        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.69e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7        |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 771         |
|    iterations           | 21          |
|    time_elapsed         | 55          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011078905 |
|    clip_fraction        | 0.0442      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.647      |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0216      |
|    n_updates            | 200         |
|    policy_gradient_loss | 0.000211    |
|    value_loss           | 0.231       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113          |
| time/                   |              |
|    fps                  | 770          |
|    iterations           | 31           |
|    time_elapsed         | 82           |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0010569212 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.688       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00208     |
|    n_updates            | 300          |
|    policy_gradient_loss | 0.000417     |
|    value_loss           | 0.0045       |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7        |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 760         |
|    iterations           | 41          |
|    time_elapsed         | 110         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006879686 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.691      |
|    explained_variance   | 1.19e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0279      |
|    n_updates            | 400         |
|    policy_gradient_loss | 0.000579    |
|    value_loss           | 0.000347    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113          |
| time/                   |              |
|    fps                  | 758          |
|    iterations           | 51           |
|    time_elapsed         | 137          |
|    total_timesteps      | 104448       |
| train/                  |              |
|    approx_kl            | 0.0010993681 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.692       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00382      |
|    n_updates            | 500          |
|    policy_gradient_loss | 0.000635     |
|    value_loss           | 1.39e-05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7        |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 753         |
|    iterations           | 61          |
|    time_elapsed         | 165         |
|    total_timesteps      | 124928      |
| train/                  |             |
|    approx_kl            | 0.004270927 |
|    clip_fraction        | 0.0146      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.494      |
|    explained_variance   | 1.19e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00953     |
|    n_updates            | 600         |
|    policy_gradient_loss | -1.73e-05   |
|    value_loss           | 0.000611    |
-----------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| time/                   |               |
|    fps                  | 747           |
|    iterations           | 71            |
|    time_elapsed         | 194           |
|    total_timesteps      | 145408        |
| train/                  |               |
|    approx_kl            | 0.00032339388 |
|    clip_fraction        | 0.0125        |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.274        |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | -0.000676     |
|    n_updates            | 700           |
|    policy_gradient_loss | -0.00167      |
|    value_loss           | 0.000952      |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113          |
| time/                   |              |
|    fps                  | 749          |
|    iterations           | 81           |
|    time_elapsed         | 221          |
|    total_timesteps      | 165888       |
| train/                  |              |
|    approx_kl            | 0.0008771547 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.419       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00262      |
|    n_updates            | 800          |
|    policy_gradient_loss | -0.000586    |
|    value_loss           | 0.000925     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| time/                   |               |
|    fps                  | 744           |
|    iterations           | 91            |
|    time_elapsed         | 250           |
|    total_timesteps      | 186368        |
| train/                  |               |
|    approx_kl            | 0.00080738356 |
|    clip_fraction        | 0.00601       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.351        |
|    explained_variance   | 1.19e-07      |
|    learning_rate        | 0.0003        |
|    loss                 | 0.00173       |
|    n_updates            | 900           |
|    policy_gradient_loss | -0.000434     |
|    value_loss           | 0.000654      |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113          |
| time/                   |              |
|    fps                  | 744          |
|    iterations           | 101          |
|    time_elapsed         | 277          |
|    total_timesteps      | 206848       |
| train/                  |              |
|    approx_kl            | 0.0007796581 |
|    clip_fraction        | 0.0284       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.216       |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | -0.000391    |
|    n_updates            | 1000         |
|    policy_gradient_loss | -0.000919    |
|    value_loss           | 1.25e-05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| time/                   |               |
|    fps                  | 746           |
|    iterations           | 111           |
|    time_elapsed         | 304           |
|    total_timesteps      | 227328        |
| train/                  |               |
|    approx_kl            | 8.1196544e-05 |
|    clip_fraction        | 0.00273       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.399        |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 0.000214      |
|    n_updates            | 1100          |
|    policy_gradient_loss | 0.000737      |
|    value_loss           | 1.46e-09      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7        |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 748         |
|    iterations           | 121         |
|    time_elapsed         | 331         |
|    total_timesteps      | 247808      |
| train/                  |             |
|    approx_kl            | 0.002488588 |
|    clip_fraction        | 0.0511      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.276      |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00335    |
|    n_updates            | 1200        |
|    policy_gradient_loss | -0.00146    |
|    value_loss           | 2.73e-05    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| time/                   |               |
|    fps                  | 749           |
|    iterations           | 131           |
|    time_elapsed         | 357           |
|    total_timesteps      | 268288        |
| train/                  |               |
|    approx_kl            | 0.00065243593 |
|    clip_fraction        | 0.00127       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.165        |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 0.000419      |
|    n_updates            | 1300          |
|    policy_gradient_loss | 0.000137      |
|    value_loss           | 3.58e-05      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113          |
| time/                   |              |
|    fps                  | 745          |
|    iterations           | 140          |
|    time_elapsed         | 384          |
|    total_timesteps      | 286720       |
| train/                  |              |
|    approx_kl            | 0.0002260659 |
|    clip_fraction        | 0.00298      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.0521      |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | -1.48e-06    |
|    n_updates            | 1390         |
|    policy_gradient_loss | -0.000112    |
|    value_loss           | 2.33e-05     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52.7        |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 743         |
|    iterations           | 149         |
|    time_elapsed         | 410         |
|    total_timesteps      | 305152      |
| train/                  |             |
|    approx_kl            | 7.69668e-05 |
|    clip_fraction        | 0.00215     |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0132     |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 1.05e-06    |
|    n_updates            | 1480        |
|    policy_gradient_loss | -0.000142   |
|    value_loss           | 1.38e-05    |
-----------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52

--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 52.7     |
|    ep_rew_mean          | 113      |
| time/                   |          |
|    fps                  | 744      |
|    iterations           | 159      |
|    time_elapsed         | 437      |
|    total_timesteps      | 325632   |
| train/                  |          |
|    approx_kl            | 0.0      |
|    clip_fraction        | 0        |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.00505 |
|    explained_variance   | 0        |
|    learning_rate        | 0.0003   |
|    loss                 | 1.05e-05 |
|    n_updates            | 1580     |
|    policy_gradient_loss | 4.16e-09 |
|    value_loss           | 2.26e-05 |
--------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| tim

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 744       |
|    iterations           | 169       |
|    time_elapsed         | 464       |
|    total_timesteps      | 346112    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.00378  |
|    explained_variance   | -1.19e-07 |
|    learning_rate        | 0.0003    |
|    loss                 | 7.27e-07  |
|    n_updates            | 1680      |
|    policy_gradient_loss | -6.81e-08 |
|    value_loss           | 1.54e-05  |
---------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113

--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 52.7     |
|    ep_rew_mean          | 113      |
| time/                   |          |
|    fps                  | 745      |
|    iterations           | 179      |
|    time_elapsed         | 492      |
|    total_timesteps      | 366592   |
| train/                  |          |
|    approx_kl            | 0.0      |
|    clip_fraction        | 0        |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.0021  |
|    explained_variance   | 0        |
|    learning_rate        | 0.0003   |
|    loss                 | 4.84e-07 |
|    n_updates            | 1780     |
|    policy_gradient_loss | 6.48e-07 |
|    value_loss           | 1.34e-05 |
--------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.7          |
|    ep_rew_mean          | 113           |
| tim

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 746       |
|    iterations           | 189       |
|    time_elapsed         | 518       |
|    total_timesteps      | 387072    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.00087  |
|    explained_variance   | -1.19e-07 |
|    learning_rate        | 0.0003    |
|    loss                 | 2.22e-05  |
|    n_updates            | 1880      |
|    policy_gradient_loss | 4.85e-07  |
|    value_loss           | 1.18e-05  |
---------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 52.7         |
|    ep_rew_mean          | 113

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 745       |
|    iterations           | 199       |
|    time_elapsed         | 546       |
|    total_timesteps      | 407552    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000523 |
|    explained_variance   | 0         |
|    learning_rate        | 0.0003    |
|    loss                 | -1.06e-06 |
|    n_updates            | 1980      |
|    policy_gradient_loss | 4.14e-08  |
|    value_loss           | 1.23e-05  |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 747       |
|    iterations           | 209       |
|    time_elapsed         | 572       |
|    total_timesteps      | 428032    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000523 |
|    explained_variance   | -1.19e-07 |
|    learning_rate        | 0.0003    |
|    loss                 | -2.29e-07 |
|    n_updates            | 2080      |
|    policy_gradient_loss | 8.12e-07  |
|    value_loss           | 6.71e-06  |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 743       |
|    iterations           | 219       |
|    time_elapsed         | 603       |
|    total_timesteps      | 448512    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000141 |
|    explained_variance   | 0         |
|    learning_rate        | 0.0003    |
|    loss                 | -4.99e-08 |
|    n_updates            | 2180      |
|    policy_gradient_loss | 1.4e-09   |
|    value_loss           | 8.12e-06  |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 743       |
|    iterations           | 229       |
|    time_elapsed         | 630       |
|    total_timesteps      | 468992    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000273 |
|    explained_variance   | 1.19e-07  |
|    learning_rate        | 0.0003    |
|    loss                 | -2.25e-07 |
|    n_updates            | 2280      |
|    policy_gradient_loss | -2.44e-06 |
|    value_loss           | 1.17e-05  |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 745       |
|    iterations           | 239       |
|    time_elapsed         | 656       |
|    total_timesteps      | 489472    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000273 |
|    explained_variance   | 0         |
|    learning_rate        | 0.0003    |
|    loss                 | 1.06e-06  |
|    n_updates            | 2380      |
|    policy_gradient_loss | -5.54e-07 |
|    value_loss           | 4.82e-06  |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 52.7      |
|    ep_rew_mean          | 113       |


In [118]:
obs = env.reset()
print(env)

for _ in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(obs)
    print(rewards)

<simEnv instance>
[1 0]
0.1111111111111111
[2 0]
0.125
[3 0]
0.14285714285714285
[4 0]
0.16666666666666666
[5 0]
0.2
[6 0]
0.25
[7 0]
0.3333333333333333
[8 0]
0.5
[9 0]
1.0
[10  0]
0.1
[10  1]
0.1111111111111111
[10  2]
0.125
[10  3]
0.14285714285714285
[10  4]
0.16666666666666666
[10  5]
0.2
[10  6]
0.25
[10  7]
0.3333333333333333
[10  8]
0.5
[10  9]
1.0
[10 10]
100
[10 11]
1.0
[10 12]
0.5
[10 13]
0.3333333333333333
[10 14]
0.25
[10 15]
0.2
[10 16]
0.16666666666666666
[10 17]
0.14285714285714285
[10 18]
0.125
[10 19]
0.1111111111111111
[10 20]
0.1
[10 21]
0.09090909090909091
[10 22]
0.08333333333333333
[10 23]
0.07692307692307693
[10 24]
0.07142857142857142
[10 25]
0.06666666666666667
[10 26]
0.0625
[10 27]
0.058823529411764705
[10 28]
0.05555555555555555
[10 29]
0.05263157894736842
[10 30]
0.05
[10 31]
0.047619047619047616
[10 32]
0.045454545454545456
[10 33]
0.043478260869565216
[10 34]
0.041666666666666664
[10 35]
0.04
[10 36]
0.038461538461538464
[10 37]
0.037037037037037035
[10 3