# Test area

In [7]:
# from gym_vectorvelocity.utils import test_with_random_moves

# test_with_random_moves()
# test_with_random_moves(episodes=10)

Episode 1 ended with reward: -74.9488902707035


: 

In [None]:
# from gym_vectorvelocity.utils import play_as_human

# play_as_human(sound_volume=0.0, save_volume=True)

# Check observation and action space

### Custom reward

In [1]:
from gym_vectorvelocity import VectorVelocityEnv
from stable_baselines3.common.env_checker import check_env


# Reward-shaping overrides. create_env() copies each value onto a freshly
# constructed VectorVelocityEnv; change them here to tune the reward signal.
GAMEOVER_PENALTY = 75  # written to env.game_over_penalty
MISSED_COIN_PENALTY = 3  # written to env.coin_missed_penalty

DODGED_OBSTACLE_REWARD = 1  # written to env.dodged_obstacle_reward
COLLECTED_COIN_REWARD = 12  # written to env.coin_reward

def create_env():
    """Build a ``VectorVelocityEnv`` carrying the notebook's reward settings.

    A fresh environment is constructed and the four module-level reward
    constants are written onto it as attributes, in the same order as before.

    Returns:
        The configured ``VectorVelocityEnv`` instance.
    """
    custom_env = VectorVelocityEnv()
    # (attribute name, value) pairs; the order mirrors the original assignments.
    reward_settings = (
        ("coin_missed_penalty", MISSED_COIN_PENALTY),
        ("game_over_penalty", GAMEOVER_PENALTY),
        ("dodged_obstacle_reward", DODGED_OBSTACLE_REWARD),
        ("coin_reward", COLLECTED_COIN_REWARD),
    )
    for attribute, value in reward_settings:
        setattr(custom_env, attribute, value)
    return custom_env

# Build one environment with the custom reward settings and run it through
# Stable-Baselines3's environment checker before any training is attempted.
env = create_env()
check_env(env)

pygame 2.6.0 (SDL 2.28.4, Python 3.10.11)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Initialize Env and set up GPU training

In [2]:
import torch

# Prefer the GPU whenever PyTorch reports a usable CUDA device; otherwise
# fall back to the CPU. The chosen name is reused by the model cells below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
from gymnasium import make

# Instantiate the environment through Gymnasium's registry by ID
# (presumably registered when gym_vectorvelocity is imported -- confirm).
# NOTE: this rebinds `env`; the attribute overrides that create_env() performs
# are not applied to this instance.
env = make('VectorVelocity-v0')

In [4]:
# Print the action space (the set of moves the agent can choose from)
print("Action Space:", env.action_space)

# Print the observation space. The output shows it is a Dict space, which is
# why the models below are created with "MultiInputPolicy".
print("Observation Space:", env.observation_space)

Action Space: Discrete(3)
Observation Space: Dict('coin_dists': Box(-1.0, 1.0, (40,), float32), 'coins': Box(-1.0, 1.0, (40,), float32), 'collected_coins': Discrete(20001), 'lane_coins': MultiDiscrete([4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]), 'lane_obstacles': MultiDiscrete([4 4 4 4 4 4 4 4 4]), 'obstacle_dists': Box(-1.0, 1.0, (18,), float32), 'obstacles': Box(-1.0, 1.0, (18,), float32), 'player_pos': Box(0.0, 1.0, (1,), float32), 'score': Discrete(120001), 'speed': Discrete(20))


# Test first model

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

def create_monitored_env():
    """Return a custom-reward environment wrapped in SB3's ``Monitor``.

    The environment comes from ``create_env()``; wrapping it in ``Monitor``
    lets Stable-Baselines3 record per-episode statistics for it.
    """
    return Monitor(create_env())

# Vectorized wrapper holding a single monitored, custom-reward environment.
# NOTE: this rebinds `env` once more.
env = DummyVecEnv([create_monitored_env])

# Initialize the PPO agent ("MultiInputPolicy" is the SB3 policy for Dict observations)
model = PPO("MultiInputPolicy", env, verbose=1, device=device)

# Train the model. This is only a smoke test: PPO always completes a full
# rollout (n_steps, 2048 by default) before it checks the budget, which is why
# the log below reports 2048 total timesteps rather than 100.
model.learn(total_timesteps=100)

# Evaluate the trained model for 10 episodes on the same vectorized env it was trained on
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 373      |
|    ep_rew_mean     | -58.6    |
| time/              |          |
|    fps             | 60       |
|    iterations      | 1        |
|    time_elapsed    | 34       |
|    total_timesteps | 2048     |
---------------------------------
Mean reward: -15.040545600000002 +/- 93.87096577533228


# Using Stable-Baselines3 vectorized environments - Model PPO

### Adapted from the Stable-Baselines3 examples: https://stable-baselines3.readthedocs.io/en/master/guide/examples.html

In [7]:
import gymnasium as gym

import os

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

#---------------------------------------------------------------------------
env_id = "VectorVelocity-v0"
#---------------------------------------------------------------------------
seed = 42
# learn() takes an integer step budget, so use an int literal rather than the float 1e5.
total_timesteps = 100_000
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single environment instead of passing None as n_envs.
n_train_env = os.cpu_count() or 1
#---------------------------------------------------------------------------
model_save_dir = "../models/ppo"
os.makedirs(model_save_dir, exist_ok=True)
tensorboard_log_dir = "../tensorboard_logs/ppo"
os.makedirs(tensorboard_log_dir, exist_ok=True)
#---------------------------------------------------------------------------

# One environment copy per logical CPU. DummyVecEnv steps them one after
# another inside this process (no subprocesses are started).
# NOTE(review): the env is built from its registered ID, so the reward
# overrides from create_env() are not applied here -- confirm this is intended.
train_env = make_vec_env(env_id, n_envs=n_train_env, seed=seed, vec_env_cls=DummyVecEnv)

# Seed the algorithm as well as the environments; without it the policy
# initialisation and action sampling differ on every run.
model = PPO(
    "MultiInputPolicy",
    train_env,
    verbose=1,
    device=device,
    tensorboard_log=tensorboard_log_dir,
    seed=seed,
)
model.learn(total_timesteps=total_timesteps)

# Evaluation runs on the training environments (model.get_env()) for 10 episodes.
mean_reward_ppo, std_reward_ppo = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward_ppo} +/- {std_reward_ppo}")

model.save(f"{model_save_dir}/ppo_vector_velocity")

Using cuda device
Logging to ./tensorboard_logs/ppo\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 371      |
|    ep_rew_mean     | -59.5    |
| time/              |          |
|    fps             | 715      |
|    iterations      | 1        |
|    time_elapsed    | 34       |
|    total_timesteps | 24576    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 420         |
|    ep_rew_mean          | -40.2       |
| time/                   |             |
|    fps                  | 422         |
|    iterations           | 2           |
|    time_elapsed         | 116         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.010871346 |
|    clip_fraction        | 0.0353      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained

![PPO Training History](tensorboard_logs/screenshots/ppo_trainings_history.png)

# Using Stable-Baselines3 vectorized environments - Model DQN

In [8]:
import gymnasium as gym

import os

from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

#---------------------------------------------------------------------------
env_id = "VectorVelocity-v0"
#---------------------------------------------------------------------------
seed = 42
# learn() takes an integer step budget, so use an int literal rather than the float 1e5.
total_timesteps = 100_000
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single environment instead of passing None as n_envs.
n_train_env = os.cpu_count() or 1
#---------------------------------------------------------------------------
model_save_dir = "../models/dqn"
os.makedirs(model_save_dir, exist_ok=True)
tensorboard_log_dir = "../tensorboard_logs/dqn"
os.makedirs(tensorboard_log_dir, exist_ok=True)
#---------------------------------------------------------------------------

# One environment copy per logical CPU. DummyVecEnv steps them one after
# another inside this process (no subprocesses are started).
# NOTE(review): the env is built from its registered ID, so the reward
# overrides from create_env() are not applied here -- confirm this is intended.
train_env = make_vec_env(env_id, n_envs=n_train_env, seed=seed, vec_env_cls=DummyVecEnv)

# Seed the algorithm as well as the environments; without it the network
# initialisation, exploration and replay sampling differ on every run.
model = DQN(
    "MultiInputPolicy",
    train_env,
    verbose=1,
    device=device,
    tensorboard_log=tensorboard_log_dir,
    seed=seed,
)
model.learn(total_timesteps=total_timesteps)

# Evaluation runs on the training environments (model.get_env()) for 10 episodes.
mean_reward_dqn, std_reward_dqn = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward_dqn} +/- {std_reward_dqn}")

model.save(f"{model_save_dir}/dqn_vector_velocity")

Using cuda device
Logging to ./tensorboard_logs/dqn\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 242      |
|    ep_rew_mean      | -68.9    |
|    exploration_rate | 0.71     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 679      |
|    time_elapsed     | 4        |
|    total_timesteps  | 3048     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 6.93e-05 |
|    n_updates        | 61       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 301      |
|    ep_rew_mean      | -65.1    |
|    exploration_rate | 0.543    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 688      |
|    time_elapsed     | 6        |
|    total_timesteps  | 4812     |
| train/              |          |
|    learning_rate    | 0.0001  

![DQN Training History](tensorboard_logs/screenshots/dqn_trainings_history.png)

# Because the DQN training curve is cleaner, we chose to tune the hyperparameters of the DQN. The tuning can be found in `dqn_hyperparameter_tuning.ipynb`.