In [1]:
import gym
import numpy as np
from boat_env import BoatEnv

from stable_baselines.common.env_checker import check_env
from stable_baselines import DQN

from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
# Instantiate the boat environment (type='discrete', mode='simulation') and
# run the stable-baselines environment checker on it before any training.
boatenv = BoatEnv(
    type='discrete',
    mode='simulation',
)
check_env(boatenv)

In [22]:
# The commented-out kwargs below deactivate all the DQN extensions, which
# gives the original version of the algorithm.
# In practice, it is recommended to keep them activated.
# kwargs = {'double_q': False, 'prioritized_replay': False, 'policy_kwargs': dict(dueling=False)}

# Note that the MlpPolicy of DQN is different from the one of PPO,
# but stable-baselines handles that automatically when the policy is
# passed as a string.
dqn_model = DQN(
    'MlpPolicy',
    boatenv,
    verbose=1,
    tensorboard_log="./log/",
)  # , **kwargs)

In [8]:
# Train the DQN agent for a fixed budget of environment steps.
# With log_interval=10 a progress table is printed every 10 episodes.
dqn_model.learn(
    total_timesteps=50000,
    log_interval=10,
)


--------------------------------------
| % time spent exploring  | 50       |
| episodes                | 10       |
| mean 100 episode reward | 0.6      |
| steps                   | 2528     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 20       |
| mean 100 episode reward | 1.7      |
| steps                   | 6411     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 30       |
| mean 100 episode reward | 3.1      |
| steps                   | 10559    |
--------------------------------------


<stable_baselines.deepq.dqn.DQN at 0x17579c9f988>

In [21]:
# Roll out the trained DQN agent for 1000 rendered steps.
# NOTE: `boatenv` is rebound to the continuous environment in the
# "Continuous" section below. If cells were run out of order, re-create the
# discrete environment (and the DQN model on it) before running this cell.
try:
    obs = boatenv.reset()

    for i in range(1000):
        action, _states = dqn_model.predict(obs)
        obs, rewards, dones, info = boatenv.step(action)
        boatenv.render()
        # Start a new episode as soon as the current one has finished.
        if dones:
            obs = boatenv.reset()
finally:
    # Close the environment even if predicting, stepping or rendering raises
    # (or the cell is interrupted), so it is never left open.
    boatenv.close()

## Continuous

In [2]:
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

In [3]:
# Rebind `boatenv` to a boat environment created with type='continuous'
# (mode='simulation') and run the stable-baselines environment checker on it.
boatenv = BoatEnv(
    type='continuous',
    mode='simulation',
)
check_env(boatenv)

In [6]:
# Build a SAC agent with the SAC MlpPolicy on `boatenv`, then train it.
# With log_interval=1 a statistics table is printed after every episode.
sac_model = SAC(
    MlpPolicy,
    boatenv,
    verbose=1,
    tensorboard_log="./log/",
)
sac_model.learn(
    total_timesteps=50000,
    log_interval=1,
)



-------------------------------------------
| current_lr              | 0.0003        |
| ent_coef                | 0.98896474    |
| ent_coef_loss           | -0.018586244  |
| entropy                 | 1.2762728     |
| episodes                | 2             |
| fps                     | 214           |
| mean 100 episode reward | -1            |
| n_updates               | 38            |
| policy_loss             | -0.63328016   |
| qf1_loss                | 0.00013312467 |
| qf2_loss                | 1.813969e-05  |
| time_elapsed            | 0             |
| total timesteps         | 137           |
| value_loss              | 0.08337617    |
-------------------------------------------
-------------------------------------------
| current_lr              | 0.0003        |
| ent_coef                | 0.8524365     |
| ent_coef_loss           | -0.2687397    |
| entropy                 | 1.289706      |
| episodes                | 3             |
| fps                     | 14

------------------------------------------
| current_lr              | 0.0003       |
| ent_coef                | 0.22932354   |
| ent_coef_loss           | -2.4820294   |
| entropy                 | 1.3046303    |
| episodes                | 14           |
| fps                     | 142          |
| mean 100 episode reward | 1.8          |
| n_updates               | 4910         |
| policy_loss             | -6.571297    |
| qf1_loss                | 0.00822128   |
| qf2_loss                | 0.0089412425 |
| time_elapsed            | 35           |
| total timesteps         | 5009         |
| value_loss              | 0.0028117849 |
------------------------------------------
------------------------------------------
| current_lr              | 0.0003       |
| ent_coef                | 0.14809883   |
| ent_coef_loss           | -3.1595483   |
| entropy                 | 1.3285587    |
| episodes                | 15           |
| fps                     | 147          |
| mean 100 

-------------------------------------------
| current_lr              | 0.0003        |
| ent_coef                | 0.0019941614  |
| ent_coef_loss           | -0.83402056   |
| entropy                 | 0.5808845     |
| episodes                | 26            |
| fps                     | 148           |
| mean 100 episode reward | 7.5           |
| n_updates               | 23428         |
| policy_loss             | -3.4575891    |
| qf1_loss                | 0.00027636273 |
| qf2_loss                | 0.0003316409  |
| time_elapsed            | 158           |
| total timesteps         | 23527         |
| value_loss              | 0.0014239855  |
-------------------------------------------
-------------------------------------------
| current_lr              | 0.0003        |
| ent_coef                | 0.0017879144  |
| ent_coef_loss           | -1.5397105    |
| entropy                 | 0.52851164    |
| episodes                | 27            |
| fps                     | 147 

<stable_baselines.sac.sac.SAC at 0x2143a375188>

In [7]:
# Roll out the trained SAC agent: 10 rounds of 300 rendered steps each.
# Every round starts from a fresh reset, whether or not the previous
# episode had finished.
try:
    for n in range(10):
        obs = boatenv.reset()
        for i in range(300):
            action, _states = sac_model.predict(obs)
            obs, rewards, dones, info = boatenv.step(action)
            boatenv.render()
            # Start a new episode as soon as the current one has finished.
            if dones:
                obs = boatenv.reset()
finally:
    # Close the environment even if predicting, stepping or rendering raises
    # (or the cell is interrupted), so it is never left open.
    boatenv.close()