In [7]:
from time import sleep

import numpy as np
import gym
from gym.envs.registration import register
from stable_baselines3 import *
from stable_baselines3.common.env_util import make_vec_env
import wandb
from wandb.integration.sb3 import WandbCallback

from gsnake.env import GoogleSnakeEnv
from gsnake.configs import GoogleSnakeConfig

In [3]:
register(
    id='GoogleSnake-v1',
    entry_point=GoogleSnakeEnv,
    max_episode_steps=500,
)

In [4]:
config = GoogleSnakeConfig(
    # reward_mode='basic',
    reward_mode='time_constrained',
    reward_scale=1,
    n_foods=3
)

In [12]:
name = 'PPO_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
policy_kwargs = {'normalize_images':False}
model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=0, tensorboard_log=f'runs/{run.id}')
# model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=10_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166691142834073, max=1.0))…



Output()

KeyboardInterrupt: 

In [8]:
name = 'A2C_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = A2C("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666855178333814, max=1.0)…



Output()

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▄█▄▆▆▂▄▇▆▇▅▄▃▄▃▁▁▂▂▇▇▁▄▂▄▆▆▆▅▂█▃▇▄▅▃▅█▂▃
time/fps,▅▃▂▂▂▁▃▂▁▂▁▁▃▄▄▆▆▆▆▆▆▆▆▇▇███████▇▇▇▇▇▇▇▆
train/entropy_loss,▁▇██████████████████████████████████████
train/explained_variance,▂▂▂▂▂▂▂▂▂▂▂▂▅▃▂▂▄▂▂▃▂▂▁▂▂▂█▂▂▂▅▂▁▃▂▂▂▅▂▂
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/policy_loss,█▁▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/value_loss,▃▅▁▃▁▁▃▇█▁▆▂▁▄▄▅▁▂▄▇▃▁▁▁▂▆▁▄▃▅▄▃▃▅▂▁▃▁▅▁

0,1
global_step,1000000.0
rollout/ep_len_mean,12.0
rollout/ep_rew_mean,-3.82
time/fps,4615.0
train/entropy_loss,-8e-05
train/explained_variance,0.0008
train/learning_rate,0.0007
train/policy_loss,-1e-05
train/value_loss,24.70985


In [9]:
name = 'DDPG_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = DDPG("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666861878332687, max=1.0)…

AssertionError: The algorithm only supports <class 'gym.spaces.box.Box'> as action spaces but Discrete(3) was provided

In [10]:
name = 'DQN_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = DQN("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.040293…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668424066665466, max=1.0…



Output()

0,1
global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
rollout/ep_len_mean,▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁▁▃▂▅▁▁▅▁▄▄█▂▁▂▂▁▅█▁▂▅▂▁▁
rollout/ep_rew_mean,▇█▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▆▇▃██▄▇▄▄▂▆▇▇▆▇▅▁▇█▅▆█▇
rollout/exploration_rate,█▇▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,▅▅▄█▇▆▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▁▁▄▂▁▅▄▁▂▂▂▂▂▂▁▁█▂▁▂█▂▁▄

0,1
global_step,998910.0
rollout/ep_len_mean,35.75
rollout/ep_rew_mean,-15.19
rollout/exploration_rate,0.05
time/fps,2251.0
train/learning_rate,0.0001
train/loss,4.84073


In [None]:
name = 'SAC_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = SAC("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

In [None]:
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
name = 'TD3_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# The noise objects for TD3
env = GoogleSnakeEnv(config)
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

In [15]:
####################################################################
# Human evaluation
####################################################################
#model = PPO.load("ppo_mlp_time")

config = GoogleSnakeConfig(reward_mode='time_constrained', n_foods=3)
env = GoogleSnakeEnv(config, 42, 'gui')
obs = env.reset()
try:
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(0.5)
except KeyboardInterrupt:
    print('Terminated')
finally:
    env.close()

ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
