# 🚀 Install, Import, and Log In

In [143]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
# import matplotlib.pyplot as plt
# %matplotlib inline
import random
from functools import wraps
from time import time

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from ipywidgets import IntProgress

# Ensure deterministic behavior
import zlib


def seed_from_text(text):
    """Return a seed in ``[0, 2**32 - 1]`` derived from ``text``.

    The built-in ``hash()`` of a ``str`` is salted per interpreter process
    (unless ``PYTHONHASHSEED`` is fixed), so seeds built from it change on
    every kernel restart. CRC32 depends only on the bytes of ``text`` and
    always fits the 32-bit range accepted by ``np.random.seed``.
    """
    return zlib.crc32(text.encode("utf-8"))


torch.backends.cudnn.deterministic = True
random.seed(seed_from_text("setting random seeds"))
np.random.seed(seed_from_text("improves reproducibility"))
torch.manual_seed(seed_from_text("by removing stochasticity"))
torch.cuda.manual_seed_all(seed_from_text("so runs are repeatable"))

# Device configuration: first CUDA device when available, CPU otherwise
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [153]:
def measure(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    The duration is printed whether ``func`` returns normally or raises;
    the wrapped function's return value or exception is passed through.
    """

    def _now_ms():
        # Wall-clock time rounded to whole milliseconds
        return int(round(time() * 1000))

    @wraps(func)
    def _time_it(*args, **kwargs):
        started_ms = _now_ms()
        try:
            return func(*args, **kwargs)
        finally:
            elapsed_ms = _now_ms() - started_ms
            # Non-positive durations are reported as a plain 0
            minutes = elapsed_ms / (1000 * 60) if elapsed_ms > 0 else 0
            print(f"Total execution time: {minutes} minutes")

    return _time_it

### 0️⃣ Step 0: Install W&B

In [22]:
%%capture
!pip install wandb

### 1️⃣ Step 1: Import W&B and Login

In [23]:
import wandb

# Log in to Weights & Biases so that the runs started later in this notebook
# can be recorded.
wandb.login()

True

# 👩‍🔬 Define the Experiment and Pipeline

### 2️⃣ Step 2: Track metadata and hyperparameters with `wandb.init`

In [173]:
# Hyperparameters and run settings handed to wandb.init via model_pipeline
config = {
    "epochs": 5,
    "classes": 10,
    "kernels": [16, 32],
    "batch_size": 128,
    "learning_rate": 0.005,
    "dataset": "MNIST",
    "architecture": "CNN",
    "NUM_TRAINING_STEPS": 100000,  # 10000000
    "NUM_TEST_STEPS": 10,
    "NUM_NEW_EXP": 1000,
    "BUFFER_SIZE": 10000,
    "worker_id": 3,
    "time_scale": 1,
}

# Module-level handle to the Unity environment; None until a pipeline run creates one
env = None

In [174]:
def model_pipeline(hyperparameters):
    """Run one tracked experiment: build, train and test the model.

    Args:
        hyperparameters: Mapping passed to ``wandb.init`` as the run config.

    Returns:
        The model produced by ``make``.

    Side effects:
        Rebinds the module-level ``env``. Any environment still referenced
        there from an earlier run is closed first, and the environment
        created for this run is closed (and ``env`` reset to ``None``) when
        the run ends, including when training is interrupted.
    """
    global env

    # Best-effort cleanup of an environment left over from a previous run.
    # Only ordinary errors are ignored so that KeyboardInterrupt/SystemExit
    # still propagate.
    if env is not None:
        try:
            env.close()
        except Exception as exc:
            print(f"Ignoring error while closing the previous environment: {exc}")
        env = None

    # tell wandb to get started
    with wandb.init(project="pytorch-demo1", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, env, criterion, optimizer = make(config)
        try:
            print(model)

            # and use them to train the model
            train(model, env, criterion, optimizer, config)

            # and test its final performance
            test(model, env, config)
        finally:
            # Always release the Unity process, even on errors or interrupts
            env.close()
            env = None

    return model

In [175]:
def make(config):
    """Assemble the objects needed for one run.

    Returns:
        A ``(model, env, criterion, optimizer)`` tuple. ``criterion`` and
        ``optimizer`` are ``None`` placeholders: no loss function or
        optimizer is created here.
    """
    # The environment provides the behavior spec the agent is built from
    env, spec = make_env(config)
    agent = Agent(spec)

    # Loss and optimizer slots are kept in the return value but left empty
    loss_fn, optim = None, None

    return agent, env, loss_fn, optim

# 📡 Define the Env Loading and Model

In [176]:
def make_env(config):
    """Launch the Unity environment and return it with its first behavior spec.

    Args:
        config: Object exposing ``worker_id`` and ``time_scale`` attributes.

    Returns:
        ``(env, spec)`` where ``spec`` is the behavior spec of the first
        behavior listed by the environment after a reset.

    Raises:
        RuntimeError: If the environment lists no behaviors after reset.
        Any error raised during setup is re-raised after the environment
        has been closed, so a failed setup does not leave it open.
    """
    channel = EngineConfigurationChannel()
    env = UE("run32_training", seed=1, worker_id=config.worker_id, side_channels=[channel])
    try:
        channel.set_configuration_parameters(time_scale=config.time_scale)
        print("Environment created.")

        env.reset()
        behavior_names = list(env.behavior_specs)
        if not behavior_names:
            raise RuntimeError("The Unity environment exposes no behaviors after reset.")
        behavior_name = behavior_names[0]
        print(f"Name of the behavior : {behavior_name}")

        spec = env.behavior_specs[behavior_name]
        print(f"Type of the spec : {spec}")
    except BaseException:
        # Close the environment that was just launched before propagating
        env.close()
        raise

    return env, spec

In [177]:
# Baseline agent: picks random actions from the behavior's action spec
class Agent:
    """Agent that ignores observations and acts randomly."""

    def __init__(self, spec):
        # Behavior spec whose action_spec is used to generate actions
        self.spec = spec

    def get_action(self, decision_steps):
        """Return one random action per entry in ``decision_steps``."""
        agent_count = len(decision_steps)
        action_spec = self.spec.action_spec
        return action_spec.random_action(agent_count)

In [178]:
import torch
from typing import Tuple
from math import floor


class VisualQNetwork(torch.nn.Module):
    """Fully connected network with a shared trunk and three output heads."""

    def __init__(
        self,
        input_shape: Tuple[int],
        encoding_size: int,
        output_size: int
    ):
        """
        Build the layers.

        :param input_shape: Only ``input_shape[0]`` is used, as the number of
            input features of the first linear layer.
        :param encoding_size: Width of both hidden layers.
        :param output_size: Number of outputs produced by each of the three heads.
        """
        super().__init__()

        # Shared trunk: two linear layers, each followed by the activation
        self.dense1 = torch.nn.Linear(input_shape[0], encoding_size)
        self.dense2 = torch.nn.Linear(encoding_size, encoding_size)

        # Three parallel heads reading the same hidden encoding
        self.dense2_x1 = torch.nn.Linear(encoding_size, output_size)
        self.dense2_x2 = torch.nn.Linear(encoding_size, output_size)
        self.dense2_x3 = torch.nn.Linear(encoding_size, output_size)

        self.act = torch.nn.Sigmoid()

    def forward(self, visual_obs: torch.tensor):
        """Return a 3-tuple with the raw (non-activated) output of each head."""
        encoding = visual_obs
        for layer in (self.dense1, self.dense2):
            encoding = self.act(layer(encoding))

        heads = (self.dense2_x1, self.dense2_x2, self.dense2_x3)
        return tuple(head(encoding) for head in heads)

# 👟 Define Training Logic

### 3️⃣ Step 3. Track gradients with `wandb.watch` and everything else with `wandb.log`

In [179]:
@measure
def train(model, env, criterion, optimizer, config):
    """Run ``config.NUM_TRAINING_STEPS`` episodes and log each episode's reward.

    In every episode the first agent that requests a decision is tracked;
    its rewards are summed until it appears in the terminal steps, and the
    total is passed to ``train_log``. ``criterion`` and ``optimizer`` are
    accepted but not used here.
    """
    behavior_name = list(env.behavior_specs)[0]

    for episode in range(config.NUM_TRAINING_STEPS):
        env.reset()
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        tracked_agent = -1  # -1 means no agent is being tracked yet
        episode_rewards = 0  # reward accumulated by the tracked agent

        while True:
            # Start tracking the first agent that requested a decision
            # (len(decision_steps) is the number of agents requesting one)
            if tracked_agent == -1 and len(decision_steps) >= 1:
                tracked_agent = decision_steps.agent_id[0]

            # Choose actions for all agents, apply them, advance the simulation
            action = model.get_action(decision_steps)
            env.set_actions(behavior_name, action)
            env.step()

            # Collect the results of the step
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            if tracked_agent in decision_steps:
                # The tracked agent requested another decision
                episode_rewards += decision_steps[tracked_agent].reward
            if tracked_agent in terminal_steps:
                # The tracked agent finished its episode
                final_reward = terminal_steps[tracked_agent].reward
                print("reward on terminal step:", final_reward)
                episode_rewards += final_reward
                break

        train_log(episode_rewards, episode)

In [180]:
def train_log(reward, episode):
    """Send one episode's reward to W&B and echo it to stdout."""
    reward = float(reward)

    # Record the metrics for this episode
    metrics = {"episode": episode, "reward": reward}
    wandb.log(metrics)

    # Episode index is left-padded with zeros to five characters
    padded_episode = str(episode).zfill(5)
    print(f"Reward after {padded_episode} episodes: {reward:.3f}")

# 🧪 Define Testing Logic

#### 4️⃣ Optional Step 4: Call `wandb.save`

In [181]:
def test(model, env, config):
    """Evaluate ``model`` for ``config.NUM_TEST_STEPS`` episodes and log the result.

    In every episode the first agent that requests a decision is tracked and
    its rewards are summed until it appears in the terminal steps. The
    average over all episodes is printed and logged to W&B as
    ``test_average_reward``.

    The model is exported to ``model.onnx`` and saved to W&B only when it is
    a ``torch.nn.Module``; other agents are skipped, because
    ``torch.onnx.export`` cannot handle them.
    """
    behavior_name = list(env.behavior_specs)[0]
    cumulative_rewards = []

    # Run the model on some test episodes without tracking gradients
    with torch.no_grad():
        for _ in range(config.NUM_TEST_STEPS):
            env.reset()
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            tracked_agent = -1  # -1 indicates not yet tracking
            done = False  # For the tracked_agent
            episode_rewards = 0  # For the tracked_agent

            while not done:
                # Track the first agent we see if not tracking
                # Note : len(decision_steps) = [number of agents that requested a decision]
                if tracked_agent == -1 and len(decision_steps) >= 1:
                    tracked_agent = decision_steps.agent_id[0]

                # Generate an action for all agents
                action = model.get_action(decision_steps)

                # Set the actions
                env.set_actions(behavior_name, action)

                # Move the simulation forward
                env.step()

                # Get the new simulation results
                decision_steps, terminal_steps = env.get_steps(behavior_name)
                if tracked_agent in decision_steps:  # The agent requested a decision
                    episode_rewards += decision_steps[tracked_agent].reward
                if tracked_agent in terminal_steps:  # The agent terminated its episode
                    print("reward on terminal step:", terminal_steps[tracked_agent].reward)
                    episode_rewards += terminal_steps[tracked_agent].reward
                    done = True
            cumulative_rewards.append(episode_rewards)

    # numpy is imported as `np` in this notebook
    average_reward = np.average(cumulative_rewards)
    print(f"Average reward of the model after {config.NUM_TEST_STEPS} " +
          f"test episodes: {average_reward}")

    wandb.log({"test_average_reward": average_reward})

    # Save the model in the exchangeable ONNX format
    if isinstance(model, torch.nn.Module):
        # NOTE(review): the empty example-input list is kept from the original
        # call; a real network will most likely need a sample input - confirm.
        torch.onnx.export(model, [], "model.onnx")
        wandb.save("model.onnx")
    else:
        print("Skipping ONNX export: the model is not a torch.nn.Module.")

# 🏃‍♀️ Run training and watch your metrics live on [wandb.ai](https://wandb.ai)!

In [182]:
# Build, train and analyze the model with the pipeline.
# model_pipeline returns only the model; the environment is managed through
# the module-level `env` that model_pipeline rebinds itself.
model = model_pipeline(config)

Environment created.
Name of the behavior : Hummingbird?team=0
Type of the spec : BehaviorSpec(observation_specs=[ObservationSpec(shape=(44,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='RayPerceptionSensor'), ObservationSpec(shape=(3,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='StackingSensor_size3_VectorSensor_size1')], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3)))
<__main__.Agent object at 0x7ff034696670>
reward on terminal step: 0.0
Reward after 00000 episodes: 0.000
reward on terminal step: 0.0
Reward after 00001 episodes: 0.000
reward on terminal step: 0.0
Reward after 00002 episodes: 185.935
reward on terminal step: -1.0
Reward after 00003 episodes: 100.940
reward on terminal step: 0.0
Reward after 00004 episodes: 130.720
reward on terminal step: -1.0
Reward after 00005 episodes: -1.000
reward on terminal step: 0.0
Reward after 00006

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,47.0
reward,36.06682
_runtime,8576.0
_timestamp,1622832572.0
_step,47.0


0,1
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
reward,▁▁▅▃▄▁▂▁▁▁▂▁▂▅▁▁▁▁▁▇▁▁▂█▁▅▂▁▁▁▄▁▅▅▁▁▁▁▁▂
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


KeyboardInterrupt: 

In [183]:
# Release the Unity environment if one is still referenced (for example after
# an interrupted run). Closing is best-effort: an environment that was already
# shut down is reported instead of aborting the cell.
if env is not None:
    try:
        env.close()
    except Exception as exc:
        print(f"Environment was not closed cleanly: {exc}")
    env = None

# 🧹 Test Hyperparameters with Sweeps


## [Check out Hyperparameter Optimization in PyTorch using W&B Sweep $\rightarrow$](https://colab.research.google.com/drive/1QTIK23LBuAkdejbrvdP5hwBGyYlyEJpT?usp=sharing)

Running a hyperparameter sweep with Weights & Biases is very easy. There are just 3 simple steps:

1. **Define the sweep:** We do this by creating a dictionary or a [YAML file](https://docs.wandb.com/library/sweeps/configuration) that specifies the parameters to search through, the search strategy, the optimization metric, and so on.

2. **Initialize the sweep:** 
`sweep_id = wandb.sweep(sweep_config)`

3. **Run the sweep agent:** 
`wandb.agent(sweep_id, function=train)`

And voila! That's all there is to running a hyperparameter sweep!
<img src="https://imgur.com/UiQKg0L.png" alt="Weights & Biases" />

# 🤓 Advanced Setup
1. [Environment variables](https://docs.wandb.com/library/environment-variables): Set API keys in environment variables so you can run training on a managed cluster.
2. [Offline mode](https://docs.wandb.com/library/technical-faq#can-i-run-wandb-offline): Use `dryrun` mode to train offline and sync results later.
3. [On-prem](https://docs.wandb.com/self-hosted): Install W&B in a private cloud or on air-gapped servers in your own infrastructure. We have local installations for everyone from academics to enterprise teams.
4. [Sweeps](https://docs.wandb.com/sweeps): Set up hyperparameter search quickly with our lightweight tool for tuning.

In [None]:
# for n in range(NUM_TRAINING_STEPS):
#   new_exp,_ = Trainer.generate_trajectories(env, qnet, NUM_NEW_EXP, epsilon=0.1)
#   random.shuffle(experiences)
#   if len(experiences) > BUFFER_SIZE:
#     experiences = experiences[:BUFFER_SIZE]
#   experiences.extend(new_exp)
#   Trainer.update_q_net(qnet, optim, experiences, 3)
#   _, rewards = Trainer.generate_trajectories(env, qnet, 100, epsilon=0)
#   cumulative_rewards.append(rewards)
#   print("Training step ", n+1, "\treward ", rewards)
#   print()


# env.close()

# # Show the training graph
# plt.plot(range(NUM_TRAINING_STEPS), cumulative_rewards)

In [None]:
#  class UnityEnv(gym.Env):
#     """
#     Provides Gym wrapper for Unity Learning Environments.
#     Multi-agent environments use lists for object types, as done here:
#     https://github.com/openai/multiagent-particle-envs
#     """
 
#     def __init__(
#         self,
#         environment_filename: str,
#         dimensions: int = [],   #Added
#         timescale: int = 1,     #Added
#         worker_id: int = 0,
#         use_visual: bool = False,
#         uint8_visual: bool = False,
#         multiagent: bool = False,
#         flatten_branched: bool = False,
#         no_graphics: bool = False,
#         allow_multiple_visual_obs: bool = False,
#         set_config: bool = True,    #Added
#     ):
#         """
#         Environment initialization
#         :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
#         :param worker_id: Worker number for environment.
#         :param use_visual: Whether to use visual observation or vector observation.
#         :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
#         :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
#         :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than
#             MultiDiscrete.
#         :param no_graphics: Whether to run the Unity simulator in no-graphics mode
#         :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
#         """
#         base_port = 5005
#         if environment_filename is None:
#             base_port = UnityEnvironment.DEFAULT_EDITOR_PORT
 
#         channel = EngineConfigurationChannel()        # Added
 
 
#         #Added
#         if set_config == True:
#             channel.set_configuration_parameters(time_scale=timescale, width=dimensions[0], height=dimensions[1])
#         #Added
 
#         self._env = UnityEnvironment(
#             environment_filename,
#             worker_id,
#             base_port=base_port,
#             no_graphics=no_graphics,
#             side_channels=[channel],        # Added
#         )