> ERC Starting Grant on COeXISTENCE between humans and machines in urban mobility.


<img src="../images/img_mileston1.png" alt="Milestone 1 Image" width="500" height="400">

# Title: Machine training using DQN algorithm
## Name: Anastasia
### Date: July 11, 2024
---

### Description

> In this notebook, we implement the training of independent machine agents using the DQN algorithm.
---

## Objective

> The purpose of this notebook is to understand whether DQN algorithm can train effectively our RL agents.
---

## Experiment Summary

### Network Architecture
- Csomor network


### Agents
| **Type**          |           |
|-------------------|---------------------|
| **Number**        | 5 machines |
| **Total demand** | random |


### Origin and Destination Details
| **Origin Count**      | 2                            |
|-----------------------|------------------------------|
| **Destination Count** | 2                            |
| **Origin Pairing**    | 279952229#0, 115604053       |
| **Destination Pairing**| -115602933#2, -441496282#1     |

    

### Hardware Utilized for Experiment Execution
| **Type of Machine** | Personal computer (or server) |
|----------------------|-------------------------------|
| **CPU**              | 12th Gen Intel(R) Core(TM) i7-1255U |
|                      | Cores: 10                   |
|                      | Sockets: 1                  |
|                      | Base Speed: 1.70 GHz        |
| **Memory**           | 16GB                          |
| **Disc (SSD)**       | 477 GB                        |
| **Operating System** | Windows 11                    |


### Imported libraries 

In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from tensordict.nn import TensorDictModule, TensorDictSequential
import torch
from torchrl.collectors import SyncDataCollector
from torch.distributions import Categorical
from torchrl.envs.libs.pettingzoo import PettingZooWrapper
from torchrl.envs.transforms import TransformedEnv, RewardSum
from torchrl.envs.utils import check_env_specs
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.modules import MultiAgentMLP, ProbabilisticActor
from torchrl.objectives.value import GAE
from torchrl.objectives import ClipPPOLoss, ValueEstimators
from torchrl.modules import MLP, QValueActor
from torchrl.data import CompositeSpec
from torchrl.modules import EGreedyModule
from torchrl.objectives import DQNLoss, HardUpdate
from torchrl.record.loggers import generate_exp_name, get_logger
from torchrl.envs.transforms import RenameTransform
from torchrl.modules.tensordict_module import QValueModule
from tqdm import tqdm
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from environment import TrafficEnvironment
from keychain import Keychain as kc
from services.plotter import Plotter
from utilities import get_params

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### Hyperparameters specification

In [2]:
# Devices
device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)
vmas_device = device  # The device where the simulator is run

# Sampling
frames_per_batch = 18  # Number of team frames collected per training iteration
n_iters = 100  # Number of sampling and training iterations
total_frames = frames_per_batch * n_iters

# Training
num_epochs = 100  # Number of optimization steps per training iteration
minibatch_size = 2  # Size of the mini-batches in each optimization step
lr = 3e-4  # Learning rate
max_grad_norm = 1.0  # Maximum norm for the gradients

# DQN
gamma = 0.99  # discount factor
hard_update_freq = 10

### Environment Creation

In [3]:
params = get_params(kc.PARAMS_PATH)

In [4]:
env = TrafficEnvironment(params[kc.RUNNER], params[kc.ENVIRONMENT], params[kc.SIMULATOR], params[kc.AGENT_GEN], params[kc.AGENTS], params[kc.PHASE])

[CONFIRMED] Environment variable exists: SUMO_HOME
[SUCCESS] Added module directory: C:\Program Files (x86)\Eclipse\Sumo\tools


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
env.start()

In [6]:
env = PettingZooWrapper(
    env=env,
    use_mask=True,
    group_map=None,
    categorical_actions=True,
    done_on_any = False
)

In [7]:
out_keys = []

for group, agents in env.group_map.items():
    out_keys.append((group, "episode_reward"))

print(out_keys)

[('2', 'episode_reward'), ('0', 'episode_reward'), ('3', 'episode_reward'), ('1', 'episode_reward'), ('5', 'episode_reward')]


In [8]:
env = TransformedEnv(
    env,
    RewardSum(
        in_keys=env.reward_keys,
        reset_keys=["_reset"] * len(env.group_map.keys()),
        out_keys = out_keys
    ),
)

In [9]:
env.reward_keys

[('0', 'reward'),
 ('1', 'reward'),
 ('2', 'reward'),
 ('3', 'reward'),
 ('5', 'reward')]

In [10]:
env.group_map.keys()

dict_keys(['2', '0', '3', '1', '5'])

In [11]:
check_env_specs(env)

key is:  ('2', 'mask') 



key is:  ('0', 'mask') 



key is:  ('3', 'mask') 



key is:  ('1', 'mask') 



key is:  ('5', 'mask') 



key is:  ('2', 'observation') 



key is:  ('2', 'info') 



key is:  ('0', 'observation') 



key is:  ('0', 'info') 



key is:  ('3', 'observation') 



key is:  ('3', 'info') 



key is:  ('1', 'observation') 



key is:  ('1', 'info') 



key is:  ('5', 'observation') 



key is:  ('5', 'info') 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('2',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('0',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('3',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('1',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('5',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is: 

2024-07-15 12:20:44,312 [torchrl][INFO] check_env_specs succeeded!


key is:  ('2', 'mask') 



key is:  ('0', 'mask') 



key is:  ('3', 'mask') 



key is:  ('1', 'mask') 



key is:  ('5', 'mask') 



key is:  ('2', 'observation') 



key is:  ('2', 'reward') 



key is:  ('2', 'done') 



key is:  ('2', 'terminated') 



key is:  ('2', 'truncated') 



key is:  ('2', 'info') 



key is:  ('0', 'observation') 



key is:  ('0', 'reward') 



key is:  ('0', 'done') 



key is:  ('0', 'terminated') 



key is:  ('0', 'truncated') 



key is:  ('0', 'info') 



key is:  ('3', 'observation') 



key is:  ('3', 'reward') 



key is:  ('3', 'done') 



key is:  ('3', 'terminated') 



key is:  ('3', 'truncated') 



key is:  ('3', 'info') 



key is:  ('1', 'observation') 



key is:  ('1', 'reward') 



key is:  ('1', 'done') 



key is:  ('1', 'terminated') 



key is:  ('1', 'truncated') 



key is:  ('1', 'info') 



key is:  ('5', 'observation') 



key is:  ('5', 'reward') 



key is:  ('5', 'done') 



key is:  ('5', 'terminated') 



key is:  ('5',

In [12]:
reset_td = env.reset()

key is:  ('2', 'mask') 



key is:  ('0', 'mask') 



key is:  ('3', 'mask') 



key is:  ('1', 'mask') 



key is:  ('5', 'mask') 



key is:  ('2', 'observation') 



key is:  ('2', 'info') 



key is:  ('0', 'observation') 



key is:  ('0', 'info') 



key is:  ('3', 'observation') 



key is:  ('3', 'info') 



key is:  ('1', 'observation') 



key is:  ('1', 'info') 



key is:  ('5', 'observation') 



key is:  ('5', 'info') 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('2',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('0',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('3',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('1',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('5',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is: 

### Policy network

In [13]:
modules = {}
for group, agents in env.group_map.items():
    share_parameters_policy = False  # Can change this based on the group

    mlp = MultiAgentMLP(
        n_agent_inputs=env.observation_spec[group, "observation"].shape[
            -1
        ],  # n_obs_per_agent
        n_agent_outputs= env.full_action_spec[group, "action"].space.n,  # n_actions_per_agents
        n_agents=len(agents),  # Number of agents in the group
        centralised=False,  # the policies are decentralised (i.e., each agent will act from its local observation)
        share_params=share_parameters_policy,
        device=device,
        depth=4,
        num_cells=64,
        activation_class=torch.nn.ReLU,
    )

    # Wrap the neural network in a :class:`~tensordict.nn.TensorDictModule`.
    # This is simply a module that will read the ``in_keys`` from a tensordict, feed them to the
    # neural networks, and write the
    # outputs in-place at the ``out_keys``.
    module = TensorDictModule(mlp, 
                              in_keys=[(group, "observation")],
                              out_keys=[(group,"action_value")],
    )

    modules[group] = module

    print("module is: ", module, "\n\n\n")

module is:  TensorDictModule(
    module=MultiAgentMLP(
      (agent_networks): ModuleList(
        (0): MLP(
          (0): Linear(in_features=3, out_features=64, bias=True)
          (1): ReLU()
          (2): Linear(in_features=64, out_features=64, bias=True)
          (3): ReLU()
          (4): Linear(in_features=64, out_features=64, bias=True)
          (5): ReLU()
          (6): Linear(in_features=64, out_features=64, bias=True)
          (7): ReLU()
          (8): Linear(in_features=64, out_features=3, bias=True)
        )
      )
    ),
    device=cpu,
    in_keys=[('2', 'observation')],
    out_keys=[('2', 'action_value')]) 



module is:  TensorDictModule(
    module=MultiAgentMLP(
      (agent_networks): ModuleList(
        (0): MLP(
          (0): Linear(in_features=3, out_features=64, bias=True)
          (1): ReLU()
          (2): Linear(in_features=64, out_features=64, bias=True)
          (3): ReLU()
          (4): Linear(in_features=64, out_features=64, bias=True)
    

In [14]:
q_value_modules = {}

for group, agents in env.group_map.items():

    q_value_actor = QValueActor(
            modules[group],
            spec=env.action_spec[group],
            in_keys=[(group, "observation")],
            action_value_key=(group, "action_value"), #if not specified it will expect ("actio_value") and not [("group", "action_value")]
        )  

    q_value_actor.out_keys = [(group, "action"), (group, "action_value"), (group, "chosen_action_value")]
    print("q_value actor is: ", q_value_actor, "\n\n\n")
    q_value_modules[group] = q_value_actor

q_value actor is:  QValueActor(
    module=ModuleList(
      (0): TensorDictModule(
          module=MultiAgentMLP(
            (agent_networks): ModuleList(
              (0): MLP(
                (0): Linear(in_features=3, out_features=64, bias=True)
                (1): ReLU()
                (2): Linear(in_features=64, out_features=64, bias=True)
                (3): ReLU()
                (4): Linear(in_features=64, out_features=64, bias=True)
                (5): ReLU()
                (6): Linear(in_features=64, out_features=64, bias=True)
                (7): ReLU()
                (8): Linear(in_features=64, out_features=3, bias=True)
              )
            )
          ),
          device=cpu,
          in_keys=[('2', 'observation')],
          out_keys=[('2', 'action_value')])
      (1): QValueModule()
    ),
    device=cpu,
    in_keys=[('2', 'observation')],
    out_keys=[('2', 'action'), ('2', 'action_value'), ('2', 'chosen_action_value')]) 



q_value actor is:  QVal

In [15]:
len(agents)

1

In [16]:
env.action_spec[group]

CompositeSpec(
    action: DiscreteTensorSpec(
        shape=torch.Size([1]),
        space=DiscreteBox(n=3),
        device=cpu,
        dtype=torch.int64,
        domain=discrete), device=cpu, shape=torch.Size([1]))

In [17]:
for group, agents in env.group_map.items():

    tensordict = env.fake_tensordict()
    q_value_modules[group](tensordict)

key is:  ('2', 'observation') 



key is:  ('2', 'action_value') 



key is:  ('0', 'observation') 



key is:  ('0', 'action_value') 



key is:  ('3', 'observation') 



key is:  ('3', 'action_value') 



key is:  ('1', 'observation') 



key is:  ('1', 'action_value') 



key is:  ('5', 'observation') 



key is:  ('5', 'action_value') 





### Greedy module

In [18]:
greedy_module = {}
annealing_frames = 10
eps_start=10.0
eps_end=5.0

for group, agents in env.group_map.items():

    greedy_module[group] = EGreedyModule(
        annealing_num_steps=annealing_frames,
        eps_init=eps_start,
        eps_end=eps_end,
        spec=env.action_spec[group],
    )

In [19]:
policy = TensorDictSequential(*q_value_modules.values(), *greedy_module.values()).to(device)

### Collector

In [20]:
collector = SyncDataCollector(
    env,
    policy,
    device=device,
    storing_device=device,
    frames_per_batch=frames_per_batch,
    reset_at_each_iter=False,
    total_frames=total_frames,
)

key is:  ('2', 'mask') 



key is:  ('0', 'mask') 



key is:  ('3', 'mask') 



key is:  ('1', 'mask') 



key is:  ('5', 'mask') 



key is:  ('2', 'observation') 



key is:  ('2', 'info') 



key is:  ('0', 'observation') 



key is:  ('0', 'info') 



key is:  ('3', 'observation') 



key is:  ('3', 'info') 



key is:  ('1', 'observation') 



key is:  ('1', 'info') 



key is:  ('5', 'observation') 



key is:  ('5', 'info') 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('2',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('0',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('3',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('1',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is:  ('truncated',) 



key is:  ('5',) 



key is:  ('done',) 



key is:  ('terminated',) 



key is: 

### Replay Buffer

In [21]:
replay_buffers = {}
for group, _agents in env.group_map.items():
    replay_buffers[group] = ReplayBuffer(
        storage=LazyTensorStorage(
            frames_per_batch, device=device
        ),  # We store the frames_per_batch collected at each iteration
        batch_size=minibatch_size,  # We will sample minibatches of this size
    )

### DQN loss function

In [22]:
losses = {}
optimizers = {}
target_net_updaters = {}


for group, _agents in env.group_map.items():
    print("group is: ", group, "\n\n\n")
    loss_module = DQNLoss(
        value_network=q_value_modules[group],
        loss_function="l2",
        delay_value=True,
        action_space = "categorical"
    )

    loss_module.set_keys(  # We have to tell the loss where to find the keys
        reward=(group, "reward"),  
        action_value=(group, "action_value"),
        action=(group, "action"), 
        done=(group, "done"),
        terminated=(group, "terminated"),
        value=(group, "chosen_action_value"),
    )

    loss_module.make_value_estimator(gamma=gamma)
    loss_module = loss_module.to(device)

    target_net_updaters[group] = HardUpdate(
        loss_module, value_network_update_interval=hard_update_freq
    )    

    losses[group] = loss_module

    optimizer = torch.optim.Adam(loss_module.parameters(), lr)
    
    optimizers[group] = optimizer


# Access loss module for the first group for example
group = next(iter(env.group_map))

group is:  2 



group is:  0 



group is:  3 



group is:  1 



group is:  5 





### Create the logger

In [23]:
logger = None

exp_name = generate_exp_name("DQN", f"TrafficEnv")
logger = get_logger(
    "csv",
    logger_name="dqn",
    experiment_name=exp_name,
    wandb_kwargs={
        "project": "lalala",
        "group": "la",
    },
)

In [24]:
import time
import tqdm

collected_frames = 0
start_time = time.time()
num_updates = 5
batch_size = 10
test_interval = 5
num_test_episodes = 5
frames_per_batch = frames_per_batch
pbar = tqdm.tqdm(total=total_frames)
init_random_frames = 5
sampling_start = time.time()
q_losses = torch.zeros(num_updates, device=device)

  0%|          | 0/1800 [00:00<?, ?it/s]

### Training loop

In [25]:
for i, tensordict_data in enumerate(collector):

    for group, _agents in env.group_map.items():
        tensordict_data.set(
            ("next", group, "done"),
            tensordict_data.get(("next", "done"))
            .unsqueeze(-1)
            .expand(tensordict_data.get_item_shape(("next", group, "reward"))),  # Adjust index to start from 0
        )
        tensordict_data.set(
            ("next", group, "terminated"),
            tensordict_data.get(("next", "terminated"))
            .unsqueeze(-1)
            .expand(tensordict_data.get_item_shape(("next", group, "reward"))),  # Adjust index to start from 0
        )
    print("Inside\n")

    log_info = {}
    sampling_time = time.time() - sampling_start
    pbar.update(data.numel())

    data = data.reshape(-1)
    current_frames = data.numel()

    for group, _agents in env.group_map.items():
        replay_buffers[group].extend(data)
        collected_frames += current_frames

        greedy_module[group].step(current_frames)

key is:  ('2', 'observation') 



key is:  ('2', 'action_value') 



key is:  ('0', 'observation') 



key is:  ('0', 'action_value') 



key is:  ('3', 'observation') 



key is:  ('3', 'action_value') 



key is:  ('1', 'observation') 



key is:  ('1', 'action_value') 



key is:  ('5', 'observation') 



key is:  ('5', 'action_value') 



key is:  ('action',) 



key is:  ('action',) 



key is:  ('action',) 



key is:  ('action',) 



key is:  ('action',) 



key is:  ('next',) 



key is:  ('next',) 



agent_index is:  0
key is:  ('2', 'action') 





KeyError: 'key "action" not found in TensorDict with keys [\'action_value\', \'done\', \'episode_reward\', \'mask\', \'observation\', \'terminated\', \'truncated\']'