# 🚀 Install, Import, and Log In

In [28]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.environment import ActionTuple, BaseEnv
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
# import matplotlib.pyplot as plt
# %matplotlib inline
import random
from functools import wraps
from time import time


import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from ipywidgets import IntProgress

from typing import Tuple, Dict
from math import floor

def stable_seed(phrase: str) -> int:
    """Map a phrase to a 32-bit seed that is the same in every Python process.

    The built-in ``hash()`` of a ``str`` is salted per interpreter process
    (unless ``PYTHONHASHSEED`` is pinned), so seeds derived from it change on
    every kernel restart. CRC-32 is unsalted and always lies in
    ``[0, 2**32 - 1]``, which is the range ``np.random.seed`` accepts.

    :param phrase: Any text; only its UTF-8 bytes matter.
    :returns: A non-negative integer below ``2**32``.
    """
    import zlib

    return zlib.crc32(phrase.encode("utf-8")) & 0xFFFFFFFF


# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(stable_seed("setting random seeds"))
np.random.seed(stable_seed("improves reproducibility"))
torch.manual_seed(stable_seed("by removing stochasticity"))
torch.cuda.manual_seed_all(stable_seed("so runs are repeatable"))

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [29]:
def measure(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    Time is taken from the wall clock, rounded to whole milliseconds. The
    report is printed whether the wrapped call returns or raises; the return
    value or exception of ``func`` is passed through unchanged.
    """

    def _now_ms():
        # Current wall-clock time as an integer number of milliseconds.
        return int(round(time() * 1000))

    @wraps(func)
    def _time_it(*args, **kwargs):
        started_ms = _now_ms()
        try:
            return func(*args, **kwargs)
        finally:
            elapsed_ms = _now_ms() - started_ms
            # Sub-millisecond (or negative, after a clock step) durations are reported as 0.
            minutes = elapsed_ms / (1000 * 60) if elapsed_ms > 0 else 0
            print(f"Total execution time: {minutes} minutes")

    return _time_it

### 0️⃣ Step 0: Install W&B

In [30]:
%%capture
!pip install wandb

### 1️⃣ Step 1: Import W&B and Login

In [31]:
import wandb

# Log this session in to Weights & Biases before any run is started.
wandb.login()

True

# 👩‍🔬 Define the Experiment and Pipeline

### 2️⃣ Step 2: Track metadata and hyperparameters with `wandb.init`

In [32]:
# Hyperparameters and run settings. `model_pipeline` hands this mapping to
# `wandb.init(config=...)`, so every entry is recorded with the run.
config = {
    # The next four entries are not read by any code visible in this notebook.
    "epochs": 5,
    "classes": 10,
    "kernels": [16, 32],
    "batch_size": 128,
    # Learning rate of the Adam optimizer built in `make`.
    "learning_rate": 0.001,
    # Descriptive labels only; not read by any code visible in this notebook.
    "dataset": "MNIST",
    "architecture": "CNN",
    # Number of collect/update/evaluate iterations in `train`.
    "NUM_TRAINING_STEPS": 100000,  # 10000000
    # Number of evaluation iterations in `test`.
    "NUM_TEST_STEPS": 10,
    # Minimum number of new experiences gathered per training iteration.
    "NUM_NEW_EXP": 1000,
    # Size the replay buffer is trimmed to before new experiences are added.
    "BUFFER_SIZE": 10000,
    # Settings forwarded to the Unity environment in `make_env`.
    "worker_id": 0,
    "time_scale": 20,
    "no_graphics": True,
}
# Module-level handle to the Unity environment; assigned by `model_pipeline`.
env = None

In [33]:
def model_pipeline(hyperparameters):
    """Run one full experiment: build, train, test, and clean up.

    A W&B run is opened for project ``pytorch-DQN`` with ``hyperparameters``
    as its config. The model, Unity environment, loss and optimizer are built
    by ``make``, then passed to ``train`` and ``test``.

    The Unity environment is stored in the module-level ``env`` while it is
    open. It is always closed before this function exits, including when
    training is interrupted, and ``env`` is then reset to ``None``.

    :param hyperparameters: Mapping forwarded to ``wandb.init(config=...)``.
    :returns: The trained model (a single object, not a tuple).
    """
    global env

    # An earlier run may have left an environment open; release it first so
    # the new one can be created with the same worker id.
    stale_env = globals().get("env")
    if stale_env is not None:
        try:
            stale_env.close()
        except Exception as exc:  # best effort: an already-closed env is fine
            print(f"Could not close the previous environment: {exc}")
        env = None

    # tell wandb to get started
    with wandb.init(project="pytorch-DQN", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, env, criterion, optimizer = make(config)
        try:
            print(model)

            # and use them to train the model
            train(model, env, criterion, optimizer, config)

            # and test its final performance
            test(model, env, config)
        finally:
            # Runs on success, on errors and on KeyboardInterrupt alike.
            env.close()
            env = None

    return model

In [34]:
def make(config):
    """Create the environment, Q-network, loss and optimizer for one run.

    :param config: Object exposing ``learning_rate`` plus the attributes
        ``make_env`` reads.
    :returns: ``(model, env, criterion, optimizer)``.
    """
    # The behavior spec returned alongside the environment is not used here.
    unity_env, _behavior_spec = make_env(config)

    # QNetwork reads only input_shape[0] (44) as its input width; 126 is the
    # hidden width and 3 the number of outputs per head.
    # model = Agent(config.kernels, config.classes).to(device)
    q_network = QNetwork((44, 1), 126, 3)

    # Mean-squared error loss and Adam over all network parameters.
    loss_fn = torch.nn.MSELoss()  # None # nn.CrossEntropyLoss()
    adam = torch.optim.Adam(q_network.parameters(), lr=config.learning_rate)

    return q_network, unity_env, loss_fn, adam

# 📡 Define the Env Loading and Model

In [35]:
def make_env(config):
    """Start the ``run32_training`` Unity build and describe its first behavior.

    :param config: Object exposing ``worker_id``, ``no_graphics`` and
        ``time_scale``.
    :returns: ``(env, spec)`` where ``spec`` is the behavior spec of the first
        behavior name the environment reports after a reset.
    """
    channel = EngineConfigurationChannel()
    launch_options = dict(
        seed=1,
        worker_id=config.worker_id,
        no_graphics=config.no_graphics,
        side_channels=[channel],
    )
    env = UE("run32_training", **launch_options)
    # The time scale is sent through the side channel attached above.
    channel.set_configuration_parameters(time_scale=config.time_scale)
    print("Environment created.")

    # Behavior specs are read only after the environment has been reset.
    env.reset()
    behavior_names = list(env.behavior_specs)
    behavior_name = behavior_names[0]
    print(f"Name of the behavior : {behavior_name}")

    spec = env.behavior_specs[behavior_name]
    print(f"Type of the spec : {spec}")

    return env, spec

In [36]:
class QNetwork(torch.nn.Module):
    """Two-layer fully connected trunk feeding three independent linear heads."""

    def __init__(
        self,
        input_shape: Tuple[int],
        encoding_size: int,
        output_size: int
    ):
        """
        Builds the layers.

        :param input_shape: Only ``input_shape[0]`` is read; it is the number
            of input features per sample.
        :param encoding_size: Width of both hidden layers.
        :param output_size: Number of outputs of each of the three heads.
        """
        super().__init__()
        in_features = input_shape[0]

        # Shared trunk.
        self.dense1 = torch.nn.Linear(in_features, encoding_size)
        self.dense2 = torch.nn.Linear(encoding_size, encoding_size)

        # One output head per action branch.
        self.dense2_x1 = torch.nn.Linear(encoding_size, output_size)
        self.dense2_x2 = torch.nn.Linear(encoding_size, output_size)
        self.dense2_x3 = torch.nn.Linear(encoding_size, output_size)

        self.act = torch.nn.ReLU()
        # Registered but never applied in forward().
        self.act_out = torch.nn.Sigmoid()  # ReLU

    def forward(self, visual_obs: torch.tensor):
        """
        Returns a 3-tuple with the raw (un-activated) output of each head.

        ``visual_obs`` is fed straight into the first linear layer, so its
        last dimension must equal the input width given at construction.
        """
        hidden = self.act(self.dense1(visual_obs))
        hidden = self.act(self.dense2(hidden))

        heads = (self.dense2_x1, self.dense2_x2, self.dense2_x3)
        # No activation on the heads: the outputs are returned as-is.
        return tuple(head(hidden) for head in heads)

In [37]:
import numpy as np
from typing import NamedTuple, List


class Experience(NamedTuple):
    """
    One transition of a single Agent, as stored in the replay buffer.

    Fields, in tuple order:
    - obs: observation the Agent acted on
    - action: action taken for that observation
    - reward: reward reported with the following step
    - done: whether the following step ended the episode
    - next_obs: observation of the following step
    """

    # Observation before the action was taken.
    obs: np.ndarray
    # Action chosen for `obs`.
    action: np.ndarray
    # Reward attached to the step that followed the action.
    reward: float
    # True when the following step was terminal and not an interruption.
    done: bool
    # Observation received after the action was taken.
    next_obs: np.ndarray

# A Trajectory is an ordered sequence of Experiences
Trajectory = List[Experience]

# A Buffer is an unordered list of Experiences from multiple Trajectories
Buffer = List[Experience]

In [38]:
from mlagents_envs.environment import ActionTuple, BaseEnv
from typing import Dict
import random


class Trainer:
    """
    Static helpers for Q-learning on a Unity environment whose behavior has
    three discrete action branches: experience collection
    (``generate_trajectories``) and network fitting (``update_q_net``).

    Signature annotations that name other notebook cells are written as
    strings so this class can be defined regardless of cell execution order.
    """

    # Upper bound on the number of experiences per optimisation step.
    BATCH_SIZE = 1000
    # Number of passes over the batches in one `update_q_net` call.
    NUM_EPOCH = 3
    # Discount factor used in the Bellman target.
    GAMMA = 0.9
    # `generate_trajectories` sends head index 2 to Unity as -1;
    # `update_q_net` maps negative stored actions back to this index.
    NEGATIVE_ACTION_INDEX = 2

    @staticmethod
    def generate_trajectories(
        env: "BaseEnv", q_net: "QNetwork", buffer_size: int, epsilon: float
    ):
        """
        Runs the environment with the policy derived from the Q-Network and
        collects the resulting Experiences.

        The environment is reset first. Experiences of an Agent are added to
        the buffer only once that Agent reaches a terminal step, so the loop
        ends at the first terminal step that brings the buffer to at least
        ``buffer_size`` entries.

        :param env: The UnityEnvironment used. Only its first behavior is used.
        :param q_net: The Q-Network used to collect the data. It must return
          one value tensor per action branch (three in total).
        :param buffer_size: The minimum size of the buffer this method will
          return.
        :param epsilon: Standard deviation of the normal noise added to the
          value heads of the Q-Network to encourage exploration.
        :returns: a Tuple containing the created buffer and the average
          cumulative reward of the episodes that finished during collection
          (NaN when none finished).
        """
        buffer: Buffer = []

        env.reset()
        behavior_name = list(env.behavior_specs)[0]

        # Per-Agent bookkeeping, keyed by AgentId.
        trajectory_of: Dict[int, Trajectory] = {}
        last_obs_of: Dict[int, np.ndarray] = {}
        last_action_of: Dict[int, np.ndarray] = {}
        running_reward_of: Dict[int, float] = {}  # only for reporting
        # Cumulative reward of every finished episode.
        cumulative_rewards: List[float] = []

        while len(buffer) < buffer_size:
            decision_steps, terminal_steps = env.get_steps(behavior_name)

            # Agents whose episode just ended: close their trajectory.
            for agent_id in terminal_steps:
                final_step = terminal_steps[agent_id]
                last_experience = Experience(
                    obs=last_obs_of[agent_id].copy(),
                    reward=final_step.reward,
                    # An interrupted episode is not treated as a true terminal state.
                    done=not final_step.interrupted,
                    action=last_action_of[agent_id].copy(),
                    next_obs=final_step.obs[0],
                )
                # The trajectory is over: forget the pending observation/action.
                last_obs_of.pop(agent_id)
                last_action_of.pop(agent_id)
                cumulative_rewards.append(
                    running_reward_of.pop(agent_id) + final_step.reward
                )
                buffer.extend(trajectory_of.pop(agent_id))
                buffer.append(last_experience)

            # Agents waiting for an action.
            for agent_id in decision_steps:
                step = decision_steps[agent_id]
                if agent_id not in trajectory_of:
                    trajectory_of[agent_id] = []
                    running_reward_of[agent_id] = 0

                # A pending observation means the previous action has now
                # produced a reward and a next observation.
                if agent_id in last_obs_of:
                    trajectory_of[agent_id].append(
                        Experience(
                            obs=last_obs_of[agent_id].copy(),
                            reward=step.reward,
                            done=False,
                            action=last_action_of[agent_id].copy(),
                            next_obs=step.obs[0],
                        )
                    )
                    running_reward_of[agent_id] += step.reward
                last_obs_of[agent_id] = step.obs[0]

            # Nothing to decide this step: just advance the simulation.
            if len(decision_steps) == 0:
                env.step()
                continue

            # Action selection needs no gradients.
            with torch.no_grad():
                branch_values = q_net(torch.from_numpy(decision_steps.obs[0]))

            branch_choices = []
            for head in branch_values:
                values = head.numpy()
                # Gaussian exploration noise, then greedy choice per Agent.
                noise = np.random.randn(values.shape[0], values.shape[1]).astype(np.float32)
                choice = np.argmax(values + epsilon * noise, axis=1)
                # Head index 2 is sent to the environment as -1. The original
                # comment says this means "move backwards, left, and rotate
                # left"; that meaning is defined in the Unity project.
                choice[choice > 1] = -1
                branch_choices.append(choice)

            # One row per Agent, one column per action branch.
            actions = np.stack(branch_choices, axis=1)

            # Remember the chosen action; it joins the trajectory on the next step.
            for agent_index, agent_id in enumerate(decision_steps.agent_id):
                last_action_of[agent_id] = actions[agent_index]

            # Unity Environments expect ActionTuple instances.
            action_tuple = ActionTuple()
            action_tuple.add_discrete(actions)
            env.set_actions(behavior_name, action_tuple)
            env.step()

        mean_reward = np.mean(cumulative_rewards) if cumulative_rewards else float("nan")
        return buffer, mean_reward

    @staticmethod
    def _branch_loss(criterion, next_values, values, reward, done, action, action_size, gamma):
        """Bellman loss of one action branch for one batch."""
        # Target: r + (1 - done) * gamma * max_a' Q(s', a'), with no gradient
        # flowing through the next-state values.
        target = (
            reward
            + (1.0 - done)
            * gamma
            * torch.max(next_values.detach(), dim=1, keepdim=True).values
        ).double()

        # Stored actions use -1 for head index 2; undo that on a copy so the
        # caller's tensor is left untouched.
        index = action.clone().long()
        index[index < 0] = Trainer.NEGATIVE_ACTION_INDEX

        # Keep only the value of the action that was actually taken.
        mask = torch.nn.functional.one_hot(index, num_classes=action_size).double()
        prediction = torch.sum(values.double() * mask, dim=1, keepdim=True)

        return criterion(prediction, target)

    @staticmethod
    def update_q_net(
        q_net: "QNetwork",
        criterion: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        buffer: "Buffer",
        action_size: int
    ):
        """
        Performs an update of the Q-Network using the provided optimizer and
        buffer.

        The buffer is shuffled in place and split into equally sized batches
        (a trailing partial batch is dropped). For every batch the Bellman
        losses of the three action branches are summed and one optimizer step
        is taken. An empty buffer is a no-op.

        :param q_net: Network returning one value tensor per action branch.
        :param criterion: Loss applied to (prediction, target); when ``None``
          a mean-squared error loss is used.
        :param optimizer: Optimizer over the parameters of ``q_net``.
        :param buffer: Experiences to learn from; its order is modified.
        :param action_size: Number of choices per action branch.
        """
        if not buffer:
            return

        loss_fn = criterion if criterion is not None else torch.nn.MSELoss()

        batch_size = min(len(buffer), Trainer.BATCH_SIZE)
        random.shuffle(buffer)
        # Split the buffer into batches
        batches = [
            buffer[batch_size * start : batch_size * (start + 1)]
            for start in range(len(buffer) // batch_size)
        ]
        for _ in range(Trainer.NUM_EPOCH):
            for batch in batches:
                # Create the Tensors that will be fed in the network
                obs = torch.from_numpy(np.stack([ex.obs for ex in batch]))
                reward = torch.from_numpy(
                    np.array([ex.reward for ex in batch], dtype=np.float32).reshape(-1, 1)
                )
                done = torch.from_numpy(
                    np.array([ex.done for ex in batch], dtype=np.float32).reshape(-1, 1)
                )
                action = torch.from_numpy(np.stack([ex.action for ex in batch]))
                next_obs = torch.from_numpy(np.stack([ex.next_obs for ex in batch]))

                # Next-state values only feed the target, so no graph is needed.
                with torch.no_grad():
                    next_heads = q_net(next_obs)
                heads = q_net(obs)

                # Sum of the per-branch Bellman losses.
                loss = sum(
                    Trainer._branch_loss(
                        loss_fn, next_values, values, reward, done,
                        action[:, branch], action_size, Trainer.GAMMA,
                    )
                    for branch, (next_values, values) in enumerate(zip(next_heads, heads))
                )

                # Perform the backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()


# 👟 Define Training Logic

### 3️⃣ Step 3. Track gradients with `wandb.watch` and everything else with `wandb.log`

In [39]:
@measure
def train(model, env, criterion, optimizer, config):
    """Alternate experience collection, Q-network updates and evaluation.

    Each of the ``config.NUM_TRAINING_STEPS`` iterations:
    1. collects at least ``config.NUM_NEW_EXP`` experiences with exploration
       noise (epsilon 0.1),
    2. trims the shuffled replay buffer to ``config.BUFFER_SIZE`` and appends
       the new experiences,
    3. updates the network on the buffer,
    4. runs a noise-free rollout of at least 100 experiences and logs its
       mean cumulative reward through ``train_log``.

    :returns: The list of evaluation rewards, one per iteration.
    """
    wandb.watch(model, criterion, log="all", log_freq=10)

    experiences: Buffer = []
    cumulative_rewards: List[float] = []
    num_new_exp = config.NUM_NEW_EXP
    buffer_limit = config.BUFFER_SIZE

    for episode in range(config.NUM_TRAINING_STEPS):
        new_exp, _ = Trainer.generate_trajectories(env, model, num_new_exp, epsilon=0.1)

        # Shuffling before truncation drops a random subset of old experiences.
        random.shuffle(experiences)
        if len(experiences) > buffer_limit:
            experiences = experiences[:buffer_limit]
        experiences.extend(new_exp)

        # 3 = number of choices per action branch.
        Trainer.update_q_net(model, criterion, optimizer, experiences, 3)

        # Evaluate the greedy policy (no exploration noise).
        _, rewards = Trainer.generate_trajectories(env, model, 100, epsilon=0)
        cumulative_rewards.append(rewards)
        train_log(rewards, episode)

    return cumulative_rewards

In [40]:
# @measure
# def train(model, env, criterion, optimizer, config):
# #     wandb.watch(model, criterion, log="all", log_freq=10)

#     behavior_name = list(env.behavior_specs)[0]
        
#     for episode in range(config.NUM_TRAINING_STEPS):
#         env.reset()
#         decision_steps, terminal_steps = env.get_steps(behavior_name)
#         tracked_agent = -1 # -1 indicates not yet tracking
#         done = False # For the tracked_agent
#         episode_rewards = 0 # For the tracked_agent
        
#         while not done:
#             # Track the first agent we see if not tracking 
#             # Note : len(decision_steps) = [numberb of agents that requested a decision]
#             if tracked_agent == -1 and len(decision_steps) >= 1:
#                 tracked_agent = decision_steps.agent_id[0] 

#             # Generate an action for all agents
#             # these are the observations
# #             print(decision_steps[0])
#             action = model.get_action(decision_steps)
#     #         print(action.discrete)
#             # Set the actions
#             env.set_actions(behavior_name, action)

#             # Move the simulation forward
#             env.step()

#             # Get the new simulation results
#             decision_steps, terminal_steps = env.get_steps(behavior_name)
#             if tracked_agent in decision_steps: # The agent requested a decision
#                 episode_rewards += decision_steps[tracked_agent].reward
#             if tracked_agent in terminal_steps: # The agent terminated its episode
#                 print("reward on terminal step:", terminal_steps[tracked_agent].reward)
#                 episode_rewards += terminal_steps[tracked_agent].reward
#                 done = True
#         # print("Training step ", episode + 1, "\treward ", episode_rewards)
#         train_log(episode_rewards, episode)

In [41]:
def train_log(reward, episode):
    """Send one training iteration's reward to W&B and echo it on stdout.

    :param reward: Reward to report; converted with ``float``.
    :param episode: Zero-based iteration index; reported one-based.
    """
    reward_value = float(reward)
    episode_number = episode + 1

    # where the magic happens
    wandb.log({"episode": episode_number, "reward": reward_value})
    print(f"Reward after {str(episode_number).zfill(5)} episodes: {reward_value:.3f}")

# 🧪 Define Testing Logic

#### 4️⃣ Optional Step 4: Call `wandb.save`

In [42]:
def test(model, env, config):
    """Evaluate the trained model, log the result, and export it to ONNX.

    Runs ``config.NUM_TEST_STEPS`` noise-free rollouts (each collecting at
    least 100 experiences), logs the mean of their average cumulative rewards
    to W&B as ``test_average_reward``, then writes ``model.onnx`` and uploads
    it with ``wandb.save``.
    """
    model.eval()

    # Run the model on some test rollouts
    with torch.no_grad():
        cumulative_rewards: List[float] = []

        for _ in range(config.NUM_TEST_STEPS):
            # epsilon=0: evaluate the greedy policy without exploration noise.
            _, rewards = Trainer.generate_trajectories(env, model, 100, epsilon=0)
            cumulative_rewards.append(rewards)

        average_reward = float(np.mean(cumulative_rewards))
        print(f"Average reward of the model after {config.NUM_TEST_STEPS} " +
              f"test episodes: {average_reward}")

        wandb.log({"test_average_reward": average_reward})

    # Save the model in the exchangeable ONNX format. The exporter traces the
    # model on an example input, so it needs a tensor of the right width.
    # NOTE(review): assumes `model` exposes its first layer as `dense1`, as
    # QNetwork does.
    example_obs = torch.zeros(1, model.dense1.in_features)
    torch.onnx.export(model, example_obs, "model.onnx")
    wandb.save("model.onnx")

# 🏃‍♀️ Run training and watch your metrics live on [wandb.ai](https://wandb.ai)!

In [43]:
# Build, train and analyze the model with the pipeline.
# `model_pipeline` returns the trained model only; the Unity environment is
# managed through the module-level `env`.
model = model_pipeline(config)

Environment created.
Name of the behavior : Hummingbird?team=0
Type of the spec : BehaviorSpec(observation_specs=[ObservationSpec(shape=(44,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='RayPerceptionSensor'), ObservationSpec(shape=(3,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='StackingSensor_size3_VectorSensor_size1')], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3)))
QNetwork(
  (dense1): Linear(in_features=44, out_features=126, bias=True)
  (dense2): Linear(in_features=126, out_features=126, bias=True)
  (dense2_x1): Linear(in_features=126, out_features=3, bias=True)
  (dense2_x2): Linear(in_features=126, out_features=3, bias=True)
  (dense2_x3): Linear(in_features=126, out_features=3, bias=True)
  (act): ReLU()
  (act_out): Sigmoid()
)
Reward after 00001 episodes: 6.029
Reward after 00002 episodes: 0.042
Reward after 00003 episodes: 1.612

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,2689.0
reward,0.0
_runtime,56428.0
_timestamp,1622897623.0
_step,2689.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
reward,█▁▁▁▁▇█▇████▇█████▄█▁██▁█▁▁▁▁▂▁▁████▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇█
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


KeyboardInterrupt: 

# 🧹 Test Hyperparameters with Sweeps


## [Check out Hyperparameter Optimization in PyTorch using W&B Sweep $\rightarrow$](https://colab.research.google.com/drive/1QTIK23LBuAkdejbrvdP5hwBGyYlyEJpT?usp=sharing)

Running a hyperparameter sweep with Weights & Biases is very easy. There are just 3 simple steps:

1. **Define the sweep:** We do this by creating a dictionary or a [YAML file](https://docs.wandb.com/library/sweeps/configuration) that specifies the parameters to search through, the search strategy, the optimization metric, etc.

2. **Initialize the sweep:** 
`sweep_id = wandb.sweep(sweep_config)`

3. **Run the sweep agent:** 
`wandb.agent(sweep_id, function=train)`

And voila! That's all there is to running a hyperparameter sweep!
<img src="https://imgur.com/UiQKg0L.png" alt="Weights & Biases" />

# 🤓 Advanced Setup
1. [Environment variables](https://docs.wandb.com/library/environment-variables): Set API keys in environment variables so you can run training on a managed cluster.
2. [Offline mode](https://docs.wandb.com/library/technical-faq#can-i-run-wandb-offline): Use `dryrun` mode to train offline and sync results later.
3. [On-prem](https://docs.wandb.com/self-hosted): Install W&B in a private cloud or air-gapped servers in your own infrastructure. We have local installations for everyone from academics to enterprise teams.
4. [Sweeps](https://docs.wandb.com/sweeps): Set up hyperparameter search quickly with our lightweight tool for tuning.

In [None]:
# for n in range(NUM_TRAINING_STEPS):
#   new_exp,_ = Trainer.generate_trajectories(env, qnet, NUM_NEW_EXP, epsilon=0.1)
#   random.shuffle(experiences)
#   if len(experiences) > BUFFER_SIZE:
#     experiences = experiences[:BUFFER_SIZE]
#   experiences.extend(new_exp)
#   Trainer.update_q_net(qnet, optim, experiences, 3)
#   _, rewards = Trainer.generate_trajectories(env, qnet, 100, epsilon=0)
#   cumulative_rewards.append(rewards)
#   print("Training step ", n+1, "\treward ", rewards)
#   print()


# env.close()

# # Show the training graph
# plt.plot(range(NUM_TRAINING_STEPS), cumulative_rewards)

In [None]:
#  class UnityEnv(gym.Env):
#     """
#     Provides Gym wrapper for Unity Learning Environments.
#     Multi-agent environments use lists for object types, as done here:
#     https://github.com/openai/multiagent-particle-envs
#     """
 
#     def __init__(
#         self,
#         environment_filename: str,
#         dimensions: int = [],   #Added
#         timescale: int = 1,     #Added
#         worker_id: int = 0,
#         use_visual: bool = False,
#         uint8_visual: bool = False,
#         multiagent: bool = False,
#         flatten_branched: bool = False,
#         no_graphics: bool = False,
#         allow_multiple_visual_obs: bool = False,
#         set_config: bool = True,    #Added
#     ):
#         """
#         Environment initialization
#         :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
#         :param worker_id: Worker number for environment.
#         :param use_visual: Whether to use visual observation or vector observation.
#         :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
#         :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
#         :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than
#             MultiDiscrete.
#         :param no_graphics: Whether to run the Unity simulator in no-graphics mode
#         :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
#         """
#         base_port = 5005
#         if environment_filename is None:
#             base_port = UnityEnvironment.DEFAULT_EDITOR_PORT
 
#         channel = EngineConfigurationChannel()        # Added
 
 
#         #Added
#         if set_config == True:
#             channel.set_configuration_parameters(time_scale=timescale, width=dimensions[0], height=dimensions[1])
#         #Added
 
#         self._env = UnityEnvironment(
#             environment_filename,
#             worker_id,
#             base_port=base_port,
#             no_graphics=no_graphics,
#             side_channels=[channel],        # Added
#         )