> ERC Starting Grant on COeXISTENCE between humans and machines in urban mobility.


<img src="../images/img_mileston1.png" alt="Milestone 1 Image" width="500" height="400">

# Title: Machine training using SAC algorithm
## Name: Anastasia
### Date: June 27, 2024
---

### Description

> In this notebook, we implement the training of independent machine agents using the SAC algorithm.
---

## Objective

> The purpose of this notebook is to understand whether SAC algorithm can train effectively our RL agents.
---

## Experiment Summary

### Network Architecture
- Csomor network


### Agents
| **Type**          |           |
|-------------------|---------------------|
| **Number**        | 3 machines |
| **Total demand** | random |


### Origin and Destination Details
| **Origin Count**      | 2                            |
|-----------------------|------------------------------|
| **Destination Count** | 2                            |
| **Origin Pairing**    | 279952229#0, 115604053       |
| **Destination Pairing**| -115602933#2, -441496282#1     |

    

### Hardware Utilized for Experiment Execution
| **Type of Machine** | Personal computer (or server) |
|----------------------|-------------------------------|
| **CPU**              | 12th Gen Intel(R) Core(TM) i7-1255U |
|                      | Cores: 10                   |
|                      | Sockets: 1                  |
|                      | Base Speed: 1.70 GHz        |
| **Memory**           | 16GB                          |
| **Disc (SSD)**       | 477 GB                        |
| **Operating System** | Windows 11                    |


### Imported libraries 

In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from tensordict.nn import TensorDictModule, TensorDictSequential
import torch
from torchrl.collectors import SyncDataCollector
from torch.distributions import Categorical, OneHotCategorical
from torchrl.envs.libs.pettingzoo import PettingZooWrapper
from torchrl.envs.transforms import TransformedEnv, RewardSum
from torchrl.envs.utils import check_env_specs
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data import TensorDictReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.modules import MultiAgentMLP, ProbabilisticActor, ValueOperator
from torchrl.objectives.value import GAE
from torchrl.objectives import ClipPPOLoss, ValueEstimators
from torchrl.objectives import DiscreteSACLoss, SACLoss, SoftUpdate, ValueEstimators
from torchrl._utils import logger as torchrl_logger
from tqdm import tqdm
import time
import sys


from environment import TrafficEnvironment
from keychain import Keychain as kc
from services.plotter import Plotter
from utilities import get_params

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### Hyperparameters specification

In [2]:
# Devices
device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)
vmas_device = device  # The device where the simulator is run

# Sampling
frames_per_batch = 10  # Number of team frames collected per training iteration
n_iters = 40  # Number of sampling and training iterations
total_frames = frames_per_batch * n_iters

# Training
num_epochs = 100  # Number of optimization steps per training iteration
minibatch_size = 2  # Size of the mini-batches in each optimization step
lr = 3e-4  # Learning rate
max_grad_norm = 1.0  # Maximum norm for the gradients

# SAC
gamma = 0.99  # discount factor
tau = 0.02

### Environment Creation

In [3]:
params = get_params(kc.PARAMS_PATH)

In [4]:
env = TrafficEnvironment(params[kc.RUNNER], params[kc.ENVIRONMENT], params[kc.SIMULATOR], params[kc.AGENT_GEN], params[kc.AGENTS], params[kc.PHASE])

[CONFIRMED] Environment variable exists: SUMO_HOME
[SUCCESS] Added module directory: C:\Program Files (x86)\Eclipse\Sumo\tools


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
env.start()

In [6]:
env = PettingZooWrapper(
    env=env,
    use_mask=True,
    group_map=None,
    categorical_actions=True,
    done_on_any = False
)

In [7]:
env = TransformedEnv(
    env,
    RewardSum(
        in_keys=env.reward_keys,
        reset_keys=["_reset"] * len(env.group_map.keys()),
    ),
)

In [8]:
check_env_specs(env)

2024-07-12 15:45:26,143 [torchrl][INFO] check_env_specs succeeded!


In [9]:
reset_td = env.reset()

### Policy network

In [10]:
policy_modules = {}
for group, agents in env.group_map.items():
    share_parameters_policy = False  # Can change this based on the group

    policy_net = MultiAgentMLP(
        n_agent_inputs=env.observation_spec[group, "observation"].shape[
            -1
        ],  # n_obs_per_agent
        n_agent_outputs= env.full_action_spec[group, "action"].space.n,  # n_actions_per_agents
        n_agents=len(agents),  # Number of agents in the group
        centralised=False,  # the policies are decentralised (i.e., each agent will act from its local observation)
        share_params=share_parameters_policy,
        device=device,
        depth=4,
        num_cells=64,
        activation_class=torch.nn.Tanh,
    )

    # Wrap the neural network in a :class:`~tensordict.nn.TensorDictModule`.
    # This is simply a module that will read the ``in_keys`` from a tensordict, feed them to the
    # neural networks, and write the
    # outputs in-place at the ``out_keys``.

    policy_module = TensorDictModule(
        policy_net,
        in_keys=[(group, "observation")],
        out_keys=[(group, "logits")],
    )  # We just name the input and output that the network will read and write to the input tensordict
    policy_modules[group] = policy_module

In [11]:
len(agents)

1

In [12]:
policies = {}

for group, _agents in env.group_map.items():

    policy = ProbabilisticActor(
        module=policy_modules[group],
        spec=env.full_action_spec[group, "action"],
        in_keys=[(group, "logits")],
        out_keys=[(group, "action")],
        distribution_class=Categorical,
        return_log_prob=True,
        log_prob_key=(group, "sample_log_prob"),
    )
    
    policies[group] = policy

In [13]:
for group, agents in env.group_map.items():

    print(env.observation_spec[group, "observation"].shape[-1], "\n\n")

3 


3 


3 


3 


3 


3 




### Critic network

In [14]:
critic_modules = {}
for group, agents in env.group_map.items():
    share_parameters_critic = False
    mappo = False  # IPPO if False

    critic_net = MultiAgentMLP(
        n_agent_inputs=env.observation_spec[group, "observation"].shape[-1],
        n_agent_outputs=1,  # 1 value per agent
        n_agents=len(agents),
        centralised=mappo,
        share_params=share_parameters_critic,
        device=device,
        depth=4,
        num_cells=128,
        activation_class=torch.nn.Tanh,
    )

    value_module = ValueOperator(
        module=critic_net,
        in_keys=[(group, "observation")],
        out_keys=[(group, "action_value")],
    )
    critic_modules[group] = value_module

In [15]:
critic_modules

{'18': ValueOperator(
     module=MultiAgentMLP(
       (agent_networks): ModuleList(
         (0): MLP(
           (0): Linear(in_features=3, out_features=128, bias=True)
           (1): Tanh()
           (2): Linear(in_features=128, out_features=128, bias=True)
           (3): Tanh()
           (4): Linear(in_features=128, out_features=128, bias=True)
           (5): Tanh()
           (6): Linear(in_features=128, out_features=128, bias=True)
           (7): Tanh()
           (8): Linear(in_features=128, out_features=1, bias=True)
         )
       )
     ),
     device=cpu,
     in_keys=[('18', 'observation')],
     out_keys=[('18', 'action_value')]),
 '10': ValueOperator(
     module=MultiAgentMLP(
       (agent_networks): ModuleList(
         (0): MLP(
           (0): Linear(in_features=3, out_features=128, bias=True)
           (1): Tanh()
           (2): Linear(in_features=128, out_features=128, bias=True)
           (3): Tanh()
           (4): Linear(in_features=128, out_feature

In [16]:
reset_td = env.reset()
for group, _agents in env.group_map.items():
    print(
        f"Running value and policy for group '{group}':",
        critic_modules[group](policies[group](reset_td)),
    )

Running value and policy for group '18': TensorDict(
    fields={
        10: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                episode_reward: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                mask: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([1, 3]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([1]),
            device=cpu,
            is_shared=False),
        12: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False

In [17]:
policy = TensorDictSequential(*policies.values())

In [18]:
policy

TensorDictSequential(
    module=ModuleList(
      (0): ProbabilisticActor(
          module=ModuleList(
            (0): TensorDictModule(
                module=MultiAgentMLP(
                  (agent_networks): ModuleList(
                    (0): MLP(
                      (0): Linear(in_features=3, out_features=64, bias=True)
                      (1): Tanh()
                      (2): Linear(in_features=64, out_features=64, bias=True)
                      (3): Tanh()
                      (4): Linear(in_features=64, out_features=64, bias=True)
                      (5): Tanh()
                      (6): Linear(in_features=64, out_features=64, bias=True)
                      (7): Tanh()
                      (8): Linear(in_features=64, out_features=3, bias=True)
                    )
                  )
                ),
                device=cpu,
                in_keys=[('18', 'observation')],
                out_keys=[('18', 'logits')])
            (1): SafeProbabilisticMod

### Collector

In [19]:
collector = SyncDataCollector(
    env,
    policy,
    device=device,
    storing_device=device,
    frames_per_batch=frames_per_batch,
    reset_at_each_iter=True,
    total_frames=total_frames,
)

### Replay Buffer

In [20]:
replay_buffers = {}
for group, _agents in env.group_map.items():
    replay_buffers[group] = TensorDictReplayBuffer(
        storage=LazyTensorStorage(
            frames_per_batch, device=device
        ),  # We store the frames_per_batch collected at each iteration
        sampler=SamplerWithoutReplacement(),
        batch_size=minibatch_size,  # We will sample minibatches of this size
    )

### SAC loss function

In [21]:
losses = {}
optimizers = {}
target_net_updaters = {}

for group, _agents in env.group_map.items():
    
    loss_module = DiscreteSACLoss(
        actor_network=policies[group],
        qvalue_network=critic_modules[group],
        delay_qvalue=True,
        num_actions=env.action_spec[group]['action'].space.n,
        action_space=env.action_spec[group]['action'] ### changed this - don't know if it's correct
    )
    loss_module.set_keys(  # We have to tell the loss where to find the keys
        reward=(group, "reward"),  
        action_value=(group, "action_value"),
        action=(group, "action"), 
        done=(group, "done"),
        terminated=(group, "terminated"),
    )

    loss_module.make_value_estimator(ValueEstimators.TD0, gamma=gamma)

    target_net_updaters[group] = SoftUpdate(loss_module, eps=1 - tau)


    losses[group] = loss_module

    optimizers[group] = torch.optim.Adam(loss_module.parameters(), lr)

In [22]:
env.action_spec[group]['action'].space.n

3

In [23]:
env.action_spec[group]['action']

DiscreteTensorSpec(
    shape=torch.Size([1]),
    space=DiscreteBox(n=3),
    device=cpu,
    dtype=torch.int64,
    domain=discrete)

In [24]:
for group, _agents in env.group_map.items():
    print(losses[group], "\n\n")

DiscreteSACLoss(
  (actor_network_params): TensorDictParams(params=TensorDict(
      fields={
          module: TensorDict(
              fields={
                  0: TensorDict(
                      fields={
                          module: TensorDict(
                              fields={
                                  agent_networks: TensorDict(
                                      fields={
                                          0: TensorDict(
                                              fields={
                                                  0: TensorDict(
                                                      fields={
                                                          bias: Parameter(shape=torch.Size([64]), device=cpu, dtype=torch.float32, is_shared=False),
                                                          weight: Parameter(shape=torch.Size([64, 3]), device=cpu, dtype=torch.float32, is_shared=False)},
                                                   

### Training loop

In [25]:
env.action_spec

CompositeSpec(
    18: CompositeSpec(
        action: DiscreteTensorSpec(
            shape=torch.Size([1]),
            space=DiscreteBox(n=3),
            device=cpu,
            dtype=torch.int64,
            domain=discrete), device=cpu, shape=torch.Size([1])),
    10: CompositeSpec(
        action: DiscreteTensorSpec(
            shape=torch.Size([1]),
            space=DiscreteBox(n=3),
            device=cpu,
            dtype=torch.int64,
            domain=discrete), device=cpu, shape=torch.Size([1])),
    23: CompositeSpec(
        action: DiscreteTensorSpec(
            shape=torch.Size([1]),
            space=DiscreteBox(n=3),
            device=cpu,
            dtype=torch.int64,
            domain=discrete), device=cpu, shape=torch.Size([1])),
    12: CompositeSpec(
        action: DiscreteTensorSpec(
            shape=torch.Size([1]),
            space=DiscreteBox(n=3),
            device=cpu,
            dtype=torch.int64,
            domain=discrete), device=cpu, shape

In [26]:
sampling_start = time.time()
total_time = 0

for i, tensordict_data in enumerate(collector):
    torchrl_logger.info(f"\nIteration {i}")

    sampling_time = time.time() - sampling_start

    current_frames = tensordict_data.numel()
    total_frames += current_frames

    for group, _agents in env.group_map.items():
        data_view = tensordict_data.reshape(-1)  # Flatten the batch size to shuffle data
        replay_buffers[group].extend(data_view)

    training_tds = []
    training_start = time.time()
    for _ in range(num_epochs):
        for group, _agents in env.group_map.items():
            for _ in range(frames_per_batch // minibatch_size):
                print("group is: ", group, "\n\n")

                subdata = replay_buffers[group].sample()

                loss_vals = losses[group](subdata)
                training_tds.append(loss_vals.detach())

                loss_value = (
                    loss_vals["loss_actor"]
                    + loss_vals["loss_alpha"]
                    + loss_vals["loss_qvalue"]
                )

                loss_value.backward()

                total_norm = torch.nn.utils.clip_grad_norm_(
                    loss_module.parameters(), max_grad_norm
                )
                training_tds[-1].set("grad_norm", total_norm.mean())

                optimizers[group].step()
                optimizers[group].zero_grad()
                
                target_net_updaters[group].step()

    collector.update_policy_weights_()

    training_time = time.time() - training_start

    iteration_time = sampling_time + training_time
    total_time += iteration_time
    training_tds = torch.stack(training_tds)

Before truncations:  1 2 1 





2024-07-12 15:45:32,066 [torchrl][INFO] 
Iteration 0


group is:  18 


action.shape is:  torch.Size([2, 2, 1]) 


action is:  tensor([[[[1]],

         [[1]]],


        [[[1]],

         [[1]]]]) 





RuntimeError: index 1 is out of bounds for dimension 3 with size 1