In [1]:
!pip install gym==0.21 > /dev/null
!pip install stable-baselines3[extra] box2d > /dev/null
!pip install sb3-contrib > /dev/null

In [2]:
!pip install pyvirtualdisplay > /dev/null
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null

In [3]:
import gym
from distutils.dir_util import copy_tree
from sb3_contrib import TQC

In [4]:
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
import glob
import io
import base64
from gym.wrappers import Monitor
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start a virtual display via pyvirtualdisplay (visible=0, 1400x900) so there
# is a display to render to — presumably required for video capture on a
# headless machine; confirm against the runtime in use.
# NOTE(review): the name `display` is easy to confuse with IPython's display
# module, which this cell imports as `ipythondisplay`.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f230fe6cdd0>

In [5]:
# Create the BipedalWalker-v3 environment that the TQC model is built on.
env = gym.make("BipedalWalker-v3")

In [6]:
# Build a TQC agent (sb3_contrib) with an MLP policy on `env`.
# NOTE(review): no `seed` argument is passed, so results are presumably not
# reproducible from run to run — confirm whether that is intended.
model = TQC(
    policy="MlpPolicy", 
    env=env,
    buffer_size=300000, 
    ent_coef="auto",
    gamma=0.98,
    gradient_steps=64,  # same value as train_freq below
    learning_rate=0.00073,
    learning_starts=0,
    # Policy network settings: two hidden layers (400, 300), 2 critics with
    # 25 quantiles each; log_std_init presumably only matters with use_sde.
    policy_kwargs=dict(log_std_init=-3, net_arch=[400, 300],n_critics=2, n_quantiles=25),
    tau=0.02,
    train_freq=64,
    use_sde=True,
    verbose=1
)

# Train for 100,000 timesteps (the captured log reports every 4 episodes,
# matching log_interval=4), then save the model as "bipedal-walker-tqc".
model.learn(total_timesteps=100000, log_interval=4)
model.save("bipedal-walker-tqc")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 116      |
|    ep_rew_mean     | -118     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 60       |
|    time_elapsed    | 7        |
|    total_timesteps | 463      |
| train/             |          |
|    actor_loss      | -12.6    |
|    critic_loss     | 0.495    |
|    ent_coef        | 0.718    |
|    ent_coef_loss   | -2.17    |
|    learning_rate   | 0.00073  |
|    n_updates       | 448      |
|    std             | 0.0499   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 105      |
|    ep_rew_mean     | -112     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 61       |
|    time_elapsed    | 13       |
|    total_timesteps 

In [7]:
def show_video(folder):
  """Embed an .mp4 file from `folder` in the notebook output.

  The video bytes are base64-encoded and inlined into an HTML5 <video> tag,
  which is then rendered through IPython's display machinery.

  Args:
    folder: Directory to search (non-recursively) for `*.mp4` files.

  Returns:
    None. If `folder` contains no .mp4 file, prints "Could not find video".
  """
  # glob returns matches in arbitrary order; sort so that the same file is
  # picked every time when the folder holds more than one video.
  mp4list = sorted(glob.glob(f'{folder}/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is sufficient (no write permission needed), and the
  # context manager guarantees the file handle is closed.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env, folder):
  """Return `env` wrapped in a gym `Monitor` that writes into `folder`.

  The wrapper is created with `force=True` — presumably so existing monitor
  files in `folder` are overwritten instead of raising; confirm against the
  gym version in use.
  """
  return Monitor(env, folder, force=True)

In [8]:
def simulate(env, model, verbose=False):
  """Run one episode of `env` using actions predicted by `model`.

  Args:
    env: Environment exposing `reset()` and a `step(action)` that returns
      `(observation, reward, done, info)`.
    model: Object whose `predict(observation)` returns a sequence with the
      action as its first element.
    verbose: When true, print the step results every 500 steps and on the
      final step of the episode.

  Returns:
    A tuple `(steps, reward)`: the number of steps taken and the reward of
    the LAST step only (not the episode total).
  """
  observation = env.reset()
  done = False
  i = 0
  while not done:
    i += 1
    action = model.predict(observation)[0]
    observation, reward, done, info = env.step(action)
    # `i % 500` alone is truthy on every step that is NOT a multiple of 500,
    # which printed almost every step; compare against zero instead.
    if verbose and (done or i % 500 == 0):
      display_results(observation, reward, done, info)
  return i, reward
      

def display_results(observation, reward, done, info):
  """Print the four values of one environment step, followed by a separator line."""
  labelled_values = (
      ("Observation", observation),
      ("Reward", reward),
      ("Done", done),
      ("Info", info),
  )
  for label, value in labelled_values:
    print(f"{label}: {value}")
  print("==================================================")



---



In [9]:
# Roll out 10 recorded episodes and keep a copy of the video folder whenever
# an episode beats the best reward seen so far.
# NOTE: `simulate` returns the reward of the episode's final step, so "best"
# here ranks episodes by that last-step reward, not by the episode total.
best_reward = -1e8
for i in tqdm(range(10)):
  # Wrap a fresh environment each episode. Re-assigning the global `env` to
  # `wrap_env(env, ...)` stacked a new Monitor on top of the previous, already
  # closed one on every iteration and mutated the env used for training.
  eval_env = wrap_env(gym.make("BipedalWalker-v3"), './video')
  steps, reward = simulate(eval_env, model)
  eval_env.close()
  if reward > best_reward:
    best_reward = reward
    copy_tree('./video', f'./video_{best_reward}') 

100%|██████████| 10/10 [02:42<00:00, 16.29s/it]




---



In [13]:
# Show the recording of the best episode. The folder name is derived from
# `best_reward` (the same expression the evaluation loop uses when copying the
# video) instead of a hard-coded absolute path that is only valid for one run.
show_video(f'./video_{best_reward}')