In [1]:
import Device
!pip install vmas
!pip install Pillow
!pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
!pip install ipython
!pip install autoreload
!pip install torch-geometric
!pip install wandb

Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
Collecting wandb
  Obtaining dependency information for wandb from https://files.pythonhosted.org/packages/ed/d7/8927aef63869d5d379adb63dc97f9cbc53830fdf85457b84a156fabcb231/wandb-0.15.8-py3-none-any.whl.metadata
  Downloading wandb-0.15.8-py3-none-any.whl.metadata (8.3 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Obtaining dependency information for GitPython!=3.1.29,>=1.0.0 from https://files.pythonhosted.org/packages/67/50/742c2fb60989b76ccf7302c7b1d9e26505d7054c24f08cc7ec187faaaea7/GitPython-3.1.32-py3-none-any.whl.metadata
  Downloading GitPython-3.1.32-py3-none-any.whl.metadata (10.0 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Obtaining dependency information for sentry-sdk>=1.0.0 from https://files.pythonhosted.org/packages/86/bb/ecb87fd214d5bbade07edf2ecdd829cf346e5b552689d6228112c6517286/sentry_sdk-1.29.2-py2.py3-none-any.whl.metadata
  Downloading sentry_sdk-1.29.2-py2.py3-none-any.whl.me

In [None]:

import math
import time

import torch
from PIL import Image
from torch import tensor, Tensor
from vmas import make_env

import wandb
from Cleaning import Scenario as CleaningScenario
from DeepQLearner import DeepQLearner
from LearningConfiguration import LearningConfiguration, NNFactory
from ReplayBuffer import ReplayBufferFactory
import Device

# The scenario instance handed to `make_env` (despite the name, this is an object, not a string).
scenario_name = CleaningScenario()

# --- Environment / run parameters -------------------------------------------
# Number of agents in the scenario.
n_agents = 1
# Number of vectorized environments.
num_envs = 1
continuous_actions = True
# Torch device chosen by the project's Device helper (could be cpu, cuda, ...).
device = Device.get()
# Number of steps before returning done.
n_steps = 800
n_epochs = 100
# Whether to return obs, rewards, and infos as dictionaries keyed by agent name
# (by default they are lists whose length is the number of agents).
dict_spaces = True

# --- Experiment tracking ----------------------------------------------------
# An "epochs" entry is currently left out of the logged config.
run = wandb.init(
    project="vmas",
    reinit=True,
    config=dict(learning_rate=0.0005, architecture="MLP"),
)

# Size passed to the replay buffer factory.
dataset_size = 10_000

# --- Bookkeeping ------------------------------------------------------------
# Rendered frames are collected here to build a gif.
frame_list = list()
init_time = time.time()
step = 0

# Action set: eight 2-component vectors named after compass directions, all
# scaled by `speed`. A `stop` action (tensor([0, 0])) is currently disabled.
speed = 0.75
north = tensor([0, -speed])
south = tensor([0, speed])
east = tensor([speed, 0])
west = tensor([-speed, 0])
ne = tensor([speed, -speed])
nw = tensor([-speed, -speed])
se = tensor([speed, speed])
sw = tensor([-speed, speed])

# Sizes of the pieces that are concatenated into a single state vector.
lidar_measure_shape = 2 * 50
pos_shape = 2
vel_shape = 2
tot_shape = pos_shape + vel_shape + lidar_measure_shape

actions = [north, south, east, west, ne, nw, se, sw]

# The network factory receives the state size, 64 and the number of actions.
# `update_each` is fixed at 300; math.floor(n_steps/3) was a previously tried alternative.
learning_configuration = LearningConfiguration(
    update_each=300,
    dqn_factory=NNFactory(tot_shape, 64, len(actions)),
)

dql = DeepQLearner(
    memory=ReplayBufferFactory(dataset_size),
    action_space=actions,
    learning_configuration=learning_configuration,
)



def isOneEnvDone(info_array, agent_name="agent_0"):
    """Tell whether at least one environment has no active targets left.

    Args:
        info_array: Mapping of agent name to that agent's info dictionary;
            the entry for ``agent_name`` must hold an ``"active_targets"``
            value with one count per environment.
        agent_name: Agent whose info entry is inspected. Defaults to
            ``"agent_0"``, the entry the original implementation read.

    Returns:
        ``True`` if any ``active_targets`` entry equals 0, ``False``
        otherwise (including when the value is empty).

    Raises:
        KeyError: If ``agent_name`` or ``"active_targets"`` is missing.
    """
    # Check every entry at once instead of indexing the first `num_envs` rows:
    # this removes the dependency on the module-level `num_envs` and avoids
    # naming a local `tensor`, which shadowed the imported `torch.tensor`.
    active_targets = torch.as_tensor(info_array[agent_name]["active_targets"])
    return bool((active_targets == 0).any().item())

# Training loop: one freshly built environment per epoch, at most `n_steps`
# steps each. Every step the learner picks an action per agent and per
# environment, the environment is advanced, the resulting transitions are
# stored in the learner's memory, metrics are logged to wandb and a frame is
# rendered. At the end of the epoch the frames are written out as a GIF.
for e in range(n_epochs):
    env = make_env(
        scenario=scenario_name,
        num_envs=num_envs,
        device=device,
        continuous_actions=continuous_actions,
        dict_spaces=dict_spaces,
        wrapper=None,
        seed=None,
        n_targets=8,
        n_agents=n_agents,
        wandb=wandb
    )
    # Per-agent data kept from the previous step; only "lidar_measure" is read
    # back when building the next acting state.
    previous_states = {}
    for step in range(n_steps):
        print(f"Step {step}")
        # Use dedicated names for the per-step dictionaries so the module-level
        # `actions` list (the learner's action space) is not overwritten.
        step_actions = {}
        acting_states = {}
        logs = {}
        for agent in env.agents:
            # No lidar reading is available before the first env.step, so the
            # first acting state uses zeros for the lidar part.
            lidar_measure = previous_states[agent.name]["lidar_measure"] if step > 0 else torch.zeros(num_envs, lidar_measure_shape).to(Device.get())
            positions = agent.state.pos
            velocities = agent.state.vel
            agent_states_list = []
            agent_actions_list = []
            for j in range(num_envs):
                # torch.cat builds a new tensor, so this is a snapshot of the
                # state the action was chosen from.
                state = torch.cat((positions[j], velocities[j], lidar_measure[j]), dim=-1).to(Device.get())
                action = dql.behavioural(state)
                agent_states_list.append(state)
                agent_actions_list.append(action)
            acting_states[agent.name] = agent_states_list
            step_actions[agent.name] = torch.stack(agent_actions_list)
            if step > dql.batch_size/num_envs:
                dql.improve()  # Improve the model
                # TODO Should I do the improve once for each env or once for each agent?
        obs, rewards, dones, info = env.step(step_actions)
        mean_reward = 0
        for agent in env.agents:
            positions = agent.state.pos
            velocities = agent.state.vel
            # The lidar part is taken as the trailing `lidar_measure_shape` columns of the observation.
            lidar_measure = obs[agent.name][:, (tot_shape - lidar_measure_shape):]
            previous_states[agent.name] = {"lidar_measure": lidar_measure, "pos": positions, "vel": velocities}
            for j in range(num_envs):
                reward = rewards[agent.name][j]
                mean_reward += reward
                logs[f"reward_{agent.name}_env_{j}"] = reward
                # Record (state acted on, action, reward, next state). The
                # previous code rebuilt the "previous" state from
                # `previous_states` after it had already been overwritten with
                # post-step data, so both states of the stored transition
                # described the situation after the action.
                prev_state = acting_states[agent.name][j]
                next_state = obs[agent.name][j]
                action = step_actions[agent.name][j]
                dql.record(prev_state, action, reward, next_state)
        mean_reward /= (num_envs*n_agents)
        logs["epsilon"] = dql.epsilon.value()
        logs["loss"] = dql.last_loss
        logs["mean_reward"] = mean_reward
        logs[f"mean_reward_epoch_{e}"] = mean_reward

        wandb.log(logs)
        dql.epsilon.update()  # Update epsilon
        frame_list.append(
            Image.fromarray(env.render(mode="rgb_array", agent_index_focus=None))
        )  # Can give the camera an agent index to focus on

        print(info)
        if isOneEnvDone(info):
            print("Env done")
            break

    gif_name = scenario_name.__class__.__name__ + "-epoch-" + str(e) + ".gif"

    # Produce a gif; skipped when no frame was rendered (e.g. n_steps == 0),
    # where indexing frame_list[0] would raise IndexError.
    if frame_list:
        frame_list[0].save(
            gif_name,
            save_all=True,
            append_images=frame_list[1:],
            duration=2,
            loop=0,
        )

    # Start the next epoch's animation from an empty frame buffer.
    frame_list.clear()

    # Elapsed time is measured from `init_time`, i.e. cumulative across epochs.
    total_time = time.time() - init_time
    print(
        f"It took: {total_time}s for {n_steps} steps of {num_envs} parallel environments on device {device} "
        f"for {scenario_name} scenario."
    )
    


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

Step 0




{'agent_0': {'active_targets': tensor([[8]])}}
Step 1
{'agent_0': {'active_targets': tensor([[8]])}}
Step 2
{'agent_0': {'active_targets': tensor([[8]])}}
Step 3
{'agent_0': {'active_targets': tensor([[8]])}}
Step 4
{'agent_0': {'active_targets': tensor([[8]])}}
Step 5
{'agent_0': {'active_targets': tensor([[8]])}}
Step 6
{'agent_0': {'active_targets': tensor([[8]])}}
Step 7
{'agent_0': {'active_targets': tensor([[8]])}}
Step 8
{'agent_0': {'active_targets': tensor([[8]])}}
Step 9
{'agent_0': {'active_targets': tensor([[8]])}}
Step 10
{'agent_0': {'active_targets': tensor([[8]])}}
Step 11
{'agent_0': {'active_targets': tensor([[8]])}}
Step 12
{'agent_0': {'active_targets': tensor([[8]])}}
Step 13
{'agent_0': {'active_targets': tensor([[8]])}}
Step 14
{'agent_0': {'active_targets': tensor([[8]])}}
Step 15
{'agent_0': {'active_targets': tensor([[8]])}}
Step 16
{'agent_0': {'active_targets': tensor([[8]])}}
Step 17
{'agent_0': {'active_targets': tensor([[8]])}}
Step 18
{'agent_0': {'activ