# Training PPO with Vector Envs and Domain Randomization 3D setting


## Notice

If you encounter an RuntimeError like the following comment raised on multiprocessing/spawn.py, wrap up the code from ``gym.vector.make=`` or ``gym.vector.AsyncVectorEnv`` to the end of the code by ``if__name__ == '__main__'``.

``An attempt has been made to start a new process before the current process has finished its bootstrapping phase.``




------------------------------


## Importing required libraries

In [56]:

from __future__ import annotations

import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from tqdm import tqdm

import gymnasium as gym
# Ensure gym is installed
%pip install gym

import gymnasium_env

from gymnasium.envs.registration import register
from gymnasium.utils import seeding
from gymnasium.wrappers import RecordEpisodeStatistics, FlattenObservation,
from stable_baselines3 import DDPG, PPO, A2C, SAC, TD3, DQN
from gymnasium.vector import SyncVectorEnv, DummyVecEnv
gym.pprint_registry()


Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
     -------------------------------------- 721.7/721.7 kB 9.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting gym_notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml): started
  Building wheel for gym (pyproject.toml): finished with status 'done'
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827740 sha256=e51ffec933a8008479bfeaeedce7e738ef7e2d5e7de1648c9a

# Proximal Policy Optimization using Vectorize Environments

## Environment Creation Randozimation

----------------------

Randomly generating the parameters for 3 parallel 'LunarLander-v3' envs, using `np.clip` to stay in the recommended parameter space:




In [2]:
env = gym.make("GridWorld3D-v0")
print(env)

<OrderEnforcing<PassiveEnvChecker<GridWorld3DEnv<GridWorld3D-v0>>>>


### Synchronous Vectorization

In [None]:

# Define the function that will create each individual environment
def make_env():
    envs =  gym.make("GridWorld3D-v0", max_episode_steps=800)
    return FlattenObservation(envs)
# Create a SyncVectorEnv with 6 parallel environments
envs = EnvCompatibility([lambda: make_env() for _ in range(6)])

# Verify the environment
print(envs)

ImportError: cannot import name 'EnvCompatibility' from 'gymnasium.wrappers' (c:\Users\Emanuele_Benati\AppData\Local\Programs\Python\Python39\lib\site-packages\gymnasium\wrappers\__init__.py)

## Setup

In [58]:
#environment hyperparameters
n_envs = 6
n_updates = 1000
n_steps_per_update = 64


In [59]:
#agent hyperparams
gamma = 0.99
learning_rate = 0.0003
entropy_coef = 0.01
n_features = 4


In [60]:
# set the device
use_cuda = False
if use_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")

### Print Environment Spaces

In [62]:
# Access the observation space of the wrapped environment
obs_space = envs.observation_space
print("Observation space:", obs_space)

# Access the action space of the wrapped environment
action_space = envs.action_space
print("Action space:", action_space)


Observation space: Box(0.0, 50.0, (6,), float32)
Action space: Box(-1.0, 1.0, (3,), float32)


## Initialized The Agent

In [63]:

# Initialize the DDPG model with specified parameters
model = PPO(
    policy="MlpPolicy",  # Define the policy type (MLP for most continuous tasks)
    env=envs,  # Pass the vectorized environment (SyncVectorEnv)
    gamma=gamma,  # Discount factor (typically set to 0.99)
    learning_rate=learning_rate,  # Use a single learning rate value
    n_steps=2048,  # Number of steps to run for each environment per update
    batch_size=64,  # Batch size for optimization
    n_epochs=10,  # Number of epochs to optimize the policy
    ent_coef= entropy_coef,  # Entropy coefficient (controls exploration)
    policy_kwargs={
        "net_arch": [64, 64],  # Network architecture for the policy (2 hidden layers)
    },
    verbose=1  # Verbosity level for logging
)
print(model)

Using cpu device
<stable_baselines3.ppo.ppo.PPO object at 0x000002878CA5E6D0>


# Training the agent

In [64]:
from stable_baselines3.common.callbacks import BaseCallback

# Custom Callback to log rewards and number of steps per episode
class RewardAndStepCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(RewardAndStepCallback, self).__init__(verbose)
        self.episode_rewards = []
        self.episode_steps = []
        
    def _on_step(self):
        # Check if the episode has finished (terminated) and print the stats
        if 'episode' in self.locals and 'reward' in self.locals:
            episode = self.locals['episode']
            reward = self.locals['reward']
            steps = self.locals['n_steps']  # Number of steps in the current batch
            
            # Print rewards and steps for each episode
            if episode % 1 == 0:  # Print every episode (you can adjust this)
                print(f"Episode {episode}: Total Reward: {reward}, Steps: {steps}")
        
        return True  # Always return True to continue training

In [65]:
# Train the model with callback
n_timesteps = 100000  # Number of timesteps to train for
model.learn(total_timesteps=n_timesteps, progress_bar=True)

TypeError: cannot unpack non-iterable NoneType object