In [None]:
!pip install -U setuptools==65.5.0 pip==21
!pip install gymnasium
!pip install skrl
!pip install stable_baselines3
!pip install imitation
!pip install highway-env

# Import library

In [38]:
import gymnasium as gym
import highway_env
import torch
import numpy as np
# import the skrl components to build the RL system
from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.trainers.torch import SequentialTrainer
from skrl.utils import set_seed
from skrl.utils.model_instantiators.torch import Shape, deterministic_model
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
import gymnasium as gym
import highway_env
from imitation.algorithms import bc
from imitation.algorithms.dagger import SimpleDAggerTrainer
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env


The environment is used is highway-v0, which I showed on the slide

In [39]:
# seed for reproducibility
set_seed(42)  # e.g. `set_seed(42)` for fixed seed


# load and wrap the gymnasium environment.
# note: the environment version may change depending on the gymnasium version
env = gym.make("highway-v0", render_mode='rgb_array')
env = wrap_env(env)

device = env.device

[38;20m[skrl:INFO] Seed: 42[0m
  logger.warn(
[38;20m[skrl:INFO] Environment class: gymnasium.core.Wrapper, gymnasium.utils.record_constructor.RecordConstructorArgs[0m
[38;20m[skrl:INFO] Environment wrapper: Gymnasium[0m


In [40]:
# instantiate a memory as experience replay
memory = RandomMemory(memory_size=50000, num_envs=env.num_envs, device=device, replacement=False)

In [41]:

# instantiate the agent's models (function approximators) using the model instantiator utility.
# DQN requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/api/agents/dqn.html#models
models = {}
models["q_network"] = deterministic_model(observation_space=env.observation_space,
                                          action_space=env.action_space,
                                          device=device,
                                          clip_actions=False,
                                          input_shape=Shape.OBSERVATIONS,
                                          hiddens=[64, 64],
                                          hidden_activation=["relu", "relu"],
                                          output_shape=Shape.ACTIONS,
                                          output_activation=None,
                                          output_scale=1.0)
models["target_q_network"] = deterministic_model(observation_space=env.observation_space,
                                                 action_space=env.action_space,
                                                 device=device,
                                                 clip_actions=False,
                                                 input_shape=Shape.OBSERVATIONS,
                                                 hiddens=[64, 64],
                                                 hidden_activation=["relu", "relu"],
                                                 output_shape=Shape.ACTIONS,
                                                 output_activation=None,
                                                 output_scale=1.0)

In [42]:
# initialize models' parameters (weights and biases)
for model in models.values():
    model.init_parameters(method_name="normal_", mean=0.0, std=0.1)

In [43]:
# configure and instantiate the agent (visit its documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/dqn.html
#configuration-and-hyperparameters
cfg = DQN_DEFAULT_CONFIG.copy()
cfg["learning_starts"] = 100
cfg["exploration"]["final_epsilon"] = 0.04
cfg["exploration"]["timesteps"] = 1500
# logging to TensorBoard and write checkpoints (in timesteps)
cfg["experiment"]["write_interval"] = 1000
cfg["experiment"]["checkpoint_interval"] = 5000
cfg["experiment"]["directory"] = "runs/torch/CartPole"

agent = DQN(models=models,
            memory=memory,
            cfg=cfg,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)



# Train example expert

In [44]:
# configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=[agent])

# start training
trainer.train()

  0%|          | 9/50000 [00:03<5:12:51,  2.66it/s]


KeyboardInterrupt: 

# Sample transition

In [45]:
# Define the number of episodes to collect transitions
num_episodes = 3

# Initialize lists to store transitions
obs_list, acts_list, infos_list, next_obs_list, dones_list = [], [], [], [], []

# Main loop to collect transitions from multiple episodes
for _ in range(num_episodes):
    # Reset the environment to the initial state
    observation, info = env.reset()
    done = False

    # Episode-specific loop
    while not done:
        # Take a random action (replace this with your own agent's action)
        action = env.action_space.sample()

        # Step through the environment with the chosen action
        next_observation, reward, done, truncated, info = env.step(action)

        # Append transitions to the lists
        obs_list.append(observation)
        acts_list.append(action)
        infos_list.append(info)
        next_obs_list.append(next_observation)
        dones_list.append(done)

        # Update the current observation
        observation = next_observation

# Close the environment when finished collecting transitions
env.close()

# Create a dictionary to store the transitions
my_transitions = {
    "obs": obs_list,
    "acts": acts_list,
    "infos": infos_list,
    "next_obs": next_obs_list,
    "dones": dones_list
}


In [46]:
from imitation.data import types

def load_custom_transitions(my_transitions):
    transitions = types.Transitions(
        obs=np.array(my_transitions["obs"]),
        acts=np.array(my_transitions["acts"], dtype=np.int32),
        infos=my_transitions["infos"],
        next_obs=np.array(my_transitions["next_obs"]),
        dones=np.array(my_transitions["dones"], dtype=np.bool),
    )
    return transitions

# Load your custom transitions
custom_transitions = load_custom_transitions(my_transitions)

  obs=np.array(my_transitions["obs"]),
  obs=np.array(my_transitions["obs"]),
  next_obs=np.array(my_transitions["next_obs"]),
  next_obs=np.array(my_transitions["next_obs"]),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dones=np.array(my_transitions["dones"], dtype=np.bool),
  dones=np.array(my_transitions["dones"], dtype=np.bool),
  dones=np.array(my_transitions["dones"], dtype=np.bool),


Make the new environment since the old environment is wrapped, thus cannot be use in this case

In [31]:
env1 = gym.make('highway-v0')

  logger.warn(


In [49]:
rng = np.random.default_rng(0)
bc_trainer = bc.BC(
    observation_space=env1.observation_space,
    action_space=env1.action_space,
    demonstrations=custom_transitions,
    rng=rng,
)

In [50]:
reward, _ = evaluate_policy(
    bc_trainer.policy,  # type: ignore[arg-type]
    env1,
    n_eval_episodes=3,
    # render=True,  # comment out to speed up
)
print(f"Reward before training: {reward}")



Reward before training: 23.385851586858433


In [51]:
print("Training a policy using Behavior Cloning")
bc_trainer.train(n_epochs=100)

reward, _ = evaluate_policy(
    bc_trainer.policy,  # type: ignore[arg-type]
    env1,
    n_eval_episodes=3,
)
print(f"Reward after training: {reward}")

Training a policy using Behavior Cloning


0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00161 |
|    entropy        | 1.61     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 115      |
|    loss           | 1.61     |
|    neglogp        | 1.61     |
|    prob_true_act  | 0.2      |
|    samples_so_far | 32       |
--------------------------------


100batch [00:00, 114.04batch/s]


Reward after training: 22.374947667121887
