In [1]:
from time import sleep

import numpy as np
import torch
from torch import nn
import gym
from gym.envs.registration import register
from stable_baselines3 import *
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import wandb
from wandb.integration.sb3 import WandbCallback

from gsnake.env import GoogleSnakeEnv
from gsnake.configs import GoogleSnakeConfig

register(
    id='GoogleSnake-v1',
    entry_point=GoogleSnakeEnv,
    max_episode_steps=500,
)


class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        with torch.no_grad():
            n_flatten = self.cnn(
                torch.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, 64), nn.ReLU())
        self.linear2 = nn.Sequential(nn.Linear(64, features_dim))

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.linear(self.cnn(observations)))

cnnpolicy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=128),
    normalize_images=False
)

pygame 2.1.0 (SDL 2.0.16, Python 3.9.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
name = 'PPO_MLP_time_nch_10M'

config = GoogleSnakeConfig(
    # reward_mode='basic',
    multi_channel=True,
    reward_mode='time_constrained',
    reward_scale=1,
    n_foods=3
)
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
policy_kwargs = {'normalize_images':False}
model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=0, tensorboard_log=f'runs/{run.id}')
# model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=10_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdev-jahn[0m. Use [1m`wandb login --relogin`[0m to force relogin


Output()

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▂▃▂▃▄▅▄▅▆▆▆▇▆█▇▄▅▆▅▅▄▄▄▄▃▃▄▄▃▄▄▅▄▄▄▃▄▄▄
rollout/ep_rew_mean,▁▁▁▂▃▂▂▃▄▃▃▄▃▄▃▄▅▅▄▅▄▆▅▆▆▇▆▇▇▇▇▇▆▇▇▇████
time/fps,█▃▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▃
train/approx_kl,▂▄▃▃▂▃▄▆▃█▃▄▄▆▇▅▃▃▅▃▅▂▁▆▂▂▃▄▁▃▃▂▃▂▂▁▂▂▂▁
train/clip_fraction,▆█▇▆▄▄▃▄▂▄▂▃▂▂▁▃▁▁▂▂▂▁▁▂▂▂▂▂▁▂▂▂▁▁▁▁▂▂▂▁
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▄▅▆▇▇▇████████████████████████████████
train/explained_variance,▁▂▂▃▄▅▆▇▇▇▆▇▇▆▇██▇██▇▇▇█▇▇▇▇███▇███▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,10014720.0
rollout/ep_len_mean,67.96
rollout/ep_rew_mean,186.21001
time/fps,1277.0
train/approx_kl,0.0079
train/clip_fraction,0.03755
train/clip_range,0.2
train/entropy_loss,-0.08471
train/explained_variance,0.74159
train/learning_rate,0.0003


In [None]:
# resume
model = PPO.load(f'{name}.pt', env=env, device='cuda')
run = wandb.init(
    resume='must',
    id='1c9io7z8',
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)

model.learn(total_timesteps=40_000_000, callback=WandbCallback(verbose=2), progress_bar=True, reset_num_timesteps=False)
model.save(f'{name}.pt')

Output()

In [14]:
config = GoogleSnakeConfig(
    multi_channel=True,
    # reward_mode='basic',
    reward_mode='time_constrained',
    reward_scale=1,
    n_foods=3
)
name = 'PPO_CNN_time_nch'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = PPO(
    "CnnPolicy",
    env,
    policy_kwargs=cnnpolicy_kwargs,
    verbose=0, tensorboard_log=f'runs/{run.id}')
# model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading



Output()

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▆▅▆██▄▆█▃▄▄▃▄▂▃▃▄▂▁▂▄▁▄▁▅▅▇▃▅█▄▅▄▂▄▅▃▅▅▃
rollout/ep_rew_mean,▃▄▆▃▇▄▄▅▆▂▅▃█▄▇▄▁▅▂▄▅▅▃▃▅▆▅▆▄▅▇▁▃▄▇█▆▇▄▂
time/fps,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▂▁▁▂▂▂▃▄▄▅▄▄▄▄▄▅▆▆▅▇▅▆█▆▆█▆▇▆▆▆▆▆▆▇▆▆▇▇▆
train/clip_fraction,▂▂▁▃▂▂▃▄▅▅▄▄▄▄▅▅▆▆▅▇▅▇▇▆▇█▇▇▆▇▆▆▇▇▇▇▆▇▇▇
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▁▂▂▂▂▃▃▄▄▅▅▆▆▇▇█▇▆▆▆▅▆▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇
train/explained_variance,█▇▆██▆▄▇▅▅▄▃▃▃▁▂▃▃▄▁▃▄▂▄▄▃▃▃▇▅▇▄▃▄▆▄▆▄▇▃
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,1003520.0
rollout/ep_len_mean,19.23
rollout/ep_rew_mean,-0.79
time/fps,1078.0
train/approx_kl,0.01234
train/clip_fraction,0.14059
train/clip_range,0.2
train/entropy_loss,-0.96023
train/explained_variance,-0.17357
train/learning_rate,0.0003


In [None]:
name = 'A2C_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = A2C("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666855178333814, max=1.0)…



Output()

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▄█▄▆▆▂▄▇▆▇▅▄▃▄▃▁▁▂▂▇▇▁▄▂▄▆▆▆▅▂█▃▇▄▅▃▅█▂▃
time/fps,▅▃▂▂▂▁▃▂▁▂▁▁▃▄▄▆▆▆▆▆▆▆▆▇▇███████▇▇▇▇▇▇▇▆
train/entropy_loss,▁▇██████████████████████████████████████
train/explained_variance,▂▂▂▂▂▂▂▂▂▂▂▂▅▃▂▂▄▂▂▃▂▂▁▂▂▂█▂▂▂▅▂▁▃▂▂▂▅▂▂
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/policy_loss,█▁▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
train/value_loss,▃▅▁▃▁▁▃▇█▁▆▂▁▄▄▅▁▂▄▇▃▁▁▁▂▆▁▄▃▅▄▃▃▅▂▁▃▁▅▁

0,1
global_step,1000000.0
rollout/ep_len_mean,12.0
rollout/ep_rew_mean,-3.82
time/fps,4615.0
train/entropy_loss,-8e-05
train/explained_variance,0.0008
train/learning_rate,0.0007
train/policy_loss,-1e-05
train/value_loss,24.70985


In [None]:
name = 'DDPG_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = DDPG("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666861878332687, max=1.0)…

AssertionError: The algorithm only supports <class 'gym.spaces.box.Box'> as action spaces but Discrete(3) was provided

In [10]:
name = 'DQN_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = DQN("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.040293…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668424066665466, max=1.0…



Output()

0,1
global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
rollout/ep_len_mean,▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁▁▃▂▅▁▁▅▁▄▄█▂▁▂▂▁▅█▁▂▅▂▁▁
rollout/ep_rew_mean,▇█▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▆▇▃██▄▇▄▄▂▆▇▇▆▇▅▁▇█▅▆█▇
rollout/exploration_rate,█▇▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time/fps,▅▅▄█▇▆▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▁▁▄▂▁▅▄▁▂▂▂▂▂▂▁▁█▂▁▂█▂▁▄

0,1
global_step,998910.0
rollout/ep_len_mean,35.75
rollout/ep_rew_mean,-15.19
rollout/exploration_rate,0.05
time/fps,2251.0
train/learning_rate,0.0001
train/loss,4.84073


In [None]:
name = 'SAC_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = SAC("MlpPolicy", env, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

In [None]:
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
name = 'TD3_MLP_time'
run = wandb.init(
    job_type='train', config=config.__dict__,
    project='RL2',
    tags=[name.split('_')[0], 'gsnake'],
    name=name,
    sync_tensorboard=True,
    monitor_gym='False'
)
# The noise objects for TD3
env = GoogleSnakeEnv(config)
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Parallel environments
env = make_vec_env("GoogleSnake-v1", n_envs=10, env_kwargs={'config':config})
model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0, tensorboard_log=f'runs/{run.id}')

model.learn(total_timesteps=1_000_000, callback=WandbCallback(verbose=2), progress_bar=True)
run.finish()

model.save(f'{name}.pt')
del model # remove to demonstrate saving and loading

In [6]:
####################################################################
# Human evaluation
####################################################################
model = PPO.load("PPO_MLP_time.pt")
config = GoogleSnakeConfig(
    # reward_mode='basic',
    multi_channel=True,
    reward_mode='time_constrained',
    reward_scale=1,
    n_foods=3
)
env = GoogleSnakeEnv(config, 42, 'gui')
obs = env.reset()
try:
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(0.5)
except KeyboardInterrupt:
    print('Terminated')
finally:
    env.close()

Terminated
