# 🚀 Install, Import, and Log In

In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.environment import ActionTuple, BaseEnv
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
# import matplotlib.pyplot as plt
# %matplotlib inline
import random
from functools import wraps
from time import time


import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from ipywidgets import IntProgress

from typing import Tuple, Dict
from math import floor

# Ensure deterministic behavior
import zlib  # imported next to its only use: a hash that is stable across processes


def stable_seed(text):
    """Return a seed in ``[0, 2**32 - 2]`` that is the same on every run.

    The built-in ``hash`` of a ``str`` is salted per interpreter process
    (see ``PYTHONHASHSEED``), so seeds derived from it differ between runs
    and silently defeat reproducibility. CRC-32 of the UTF-8 bytes is
    deterministic.

    The modulus is parenthesised on purpose: ``x % 2**32 - 1`` parses as
    ``(x % 2**32) - 1`` and can yield ``-1``, which NumPy rejects as a seed.
    """
    return zlib.crc32(text.encode("utf-8")) % (2**32 - 1)


torch.backends.cudnn.deterministic = True
random.seed(stable_seed("setting random seeds"))
np.random.seed(stable_seed("improves reproducibility"))
torch.manual_seed(stable_seed("by removing stochasticity"))
torch.cuda.manual_seed_all(stable_seed("so runs are repeatable"))

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

############################### Import libraries ###############################


import os
import glob
import time
from datetime import datetime

from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

# import roboschool
# import pybullet_envs


################################## set device ##################################

print("============================================================================================")


# Use the first CUDA GPU when one is visible; otherwise stay on the CPU.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("Device set to : cpu")
else:
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    gpu_name = torch.cuda.get_device_name(device)
    print("Device set to : " + str(gpu_name))

print("============================================================================================")


device(type='cuda', index=0)

In [2]:
def measure(func):
    """Decorator that prints the wall-clock duration of each call, in minutes.

    The wrapped function's return value and any exception it raises pass
    through unchanged; the timing line is printed in both cases.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same signature and metadata as *func*.
    """
    # Bound under a private alias inside the decorator: this file binds the
    # global name ``time`` twice (``from time import time`` and, later,
    # ``import time``), so a bare ``time()`` ends up calling the module and
    # raises ``TypeError: 'module' object is not callable``.
    from time import time as _now

    @wraps(func)
    def _time_it(*args, **kwargs):
        start = int(round(_now() * 1000))
        try:
            return func(*args, **kwargs)
        finally:
            # Elapsed milliseconds; reported as 0 when the call took < 1 ms.
            end_ = int(round(_now() * 1000)) - start
            print(f"Total execution time: {end_/(1000*60) if end_ > 0 else 0} minutes")
    return _time_it

### 0️⃣ Step 0: Install W&B

In [3]:
%%capture
!pip install wandb

### 1️⃣ Step 1: Import W&B and Login

In [4]:
import wandb

# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mademord[0m (use `wandb login --relogin` to force relogin)


True

# 👩‍🔬 Define the Experiment and Pipeline

### 2️⃣ Step 2: Track metadata and hyperparameters with `wandb.init`

In [5]:
# Hyperparameters for a run; passed to `model_pipeline`, which forwards them
# to `wandb.init` so that what is logged matches what is executed.
config = {
    "epochs": 5,
    "classes": 10,
    "kernels": [16, 32],
    "batch_size": 128,
    "learning_rate": 0.001,
    "dataset": "MNIST",
    "architecture": "CNN",
    "NUM_TRAINING_STEPS": int(1e4),  # 10000000
    "NUM_TEST_STEPS": 10,
    "NUM_NEW_EXP": 1000,
    "BUFFER_SIZE": 10000,
    "worker_id": 2,
    "time_scale": 20,
    "no_graphics": True,
}
# Module-level handle to the environment; starts out unset.
env = None

In [6]:
def model_pipeline(hyperparameters):
    """Run one tracked experiment end to end and return the trained model.

    Opens a W&B run in project ``"pytorch-DQN"``, builds the model,
    environment, loss and optimizer with ``make``, then calls ``train`` and
    ``test``. The environment is stored in the module-level ``env`` and is
    closed before returning, including when training or testing raises.

    Args:
        hyperparameters: Mapping handed to ``wandb.init(config=...)``.

    Returns:
        The trained model (only the model; the environment is already closed).
    """
    global env

    # Best-effort shutdown of an environment left over from an earlier call.
    # A failure here must not block the new run, but it is reported rather
    # than silently swallowed.
    if env is not None:
        try:
            env.close()
        except Exception as exc:
            print(f"Ignoring error while closing previous environment: {exc}")

    # tell wandb to get started
    with wandb.init(project="pytorch-DQN", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, env, criterion, optimizer = make(config)
        try:
            print(model)

            # and use them to train the model
            train(model, env, criterion, optimizer, config)

            # and test its final performance
            test(model, env, config)
        finally:
            # Release the environment even if train/test fails, so the next
            # call does not find a stale one still running.
            env.close()

    return model

In [7]:
def make(config):
    """Assemble the pieces of a run: network, environment, loss and optimizer.

    Args:
        config: Object exposing ``learning_rate`` plus whatever ``make_env``
            reads from it.

    Returns:
        Tuple ``(model, env, criterion, optimizer)``.
    """
    # Environment first; the spec returned alongside it is not used here.
    env, _spec = make_env(config)

    # Network built with fixed constructor arguments (passed positionally).
    # model = Agent(config.kernels, config.classes).to(device)
    q_network = QNetwork((44, 1), 126, 3)

    # Mean-squared-error loss, optimised with Adam over all network parameters.
    loss_fn = torch.nn.MSELoss()  # None # nn.CrossEntropyLoss()
    adam = torch.optim.Adam(q_network.parameters(), lr=config.learning_rate)

    return q_network, env, loss_fn, adam

# 📡 Define the Env Loading and Model

In [8]:
def make_env(config):
    """Launch the ``run32_training`` Unity build and return it with its spec.

    Args:
        config: Object exposing ``worker_id``, ``no_graphics`` and
            ``time_scale``.

    Returns:
        Tuple ``(env, spec)`` where ``spec`` is the behavior spec of the first
        behavior the environment reports after a reset.
    """
    engine_channel = EngineConfigurationChannel()
    env = UE(
        "run32_training",
        seed=1,
        worker_id=config.worker_id,
        no_graphics=config.no_graphics,
        side_channels=[engine_channel],
    )
    engine_channel.set_configuration_parameters(time_scale=config.time_scale)
    print("Environment created.")

    # Behaviors are only looked up after the first reset.
    env.reset()
    behavior_name = list(env.behavior_specs)[0]
    print(f"Name of the behavior : {behavior_name}")

    spec = env.behavior_specs[behavior_name]
    print(f"Type of the spec : {spec}")

    return env, spec

In [None]:
################################## PPO Policy ##################################
class RolloutBuffer:
    """Five parallel lists holding one rollout: actions, states, log-probs,
    rewards and terminal flags."""

    # Attribute names, in the order they are created and cleared.
    _FIELDS = ("actions", "states", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, [])

    def clear(self):
        """Empty every list in place (existing references stay valid)."""
        for field_name in self._FIELDS:
            getattr(self, field_name).clear()


class ActorCritic(nn.Module):
    """Separate actor and critic MLPs (two hidden layers of 64 Tanh units each).

    The actor ends in ``Tanh`` for a continuous action space (its output is
    the mean of a Gaussian with fixed diagonal covariance) and in ``Softmax``
    for a discrete one (its output is a categorical distribution). The critic
    maps a state to a single value.

    Args:
        state_dim: Input width of both networks.
        action_dim: Output width of the actor.
        has_continuous_action_space: Selects the actor head and distribution.
        action_std_init: Initial action standard deviation; only read when the
            action space is continuous.
    """

    def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init):
        super().__init__()

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_dim = action_dim
            # Per-dimension variance (std squared), stored as a plain tensor on `device`.
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)

        # actor: identical trunk for both modes, only the last layer differs
        actor_head = nn.Tanh() if has_continuous_action_space else nn.Softmax(dim=-1)
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim),
            actor_head,
        )

        # critic: state -> scalar value estimate
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
        )

    def set_action_std(self, new_action_std):
        """Replace the action variance; prints a warning for discrete policies."""
        if not self.has_continuous_action_space:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")
            return

        self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)

    def forward(self):
        """Not supported; use ``act`` or ``evaluate`` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for *state*.

        Returns:
            Tuple ``(action, log_prob)``, both detached from the graph.
        """
        actor_out = self.actor(state)

        if self.has_continuous_action_space:
            covariance = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(actor_out, covariance)
        else:
            dist = Categorical(actor_out)

        sampled = dist.sample()
        sampled_logprob = dist.log_prob(sampled)

        return sampled.detach(), sampled_logprob.detach()

    def evaluate(self, state, action):
        """Score given actions under the current policy.

        Returns:
            Tuple ``(action_logprobs, state_values, dist_entropy)``.
        """
        actor_out = self.actor(state)

        if self.has_continuous_action_space:
            variance = self.action_var.expand_as(actor_out)
            covariance = torch.diag_embed(variance).to(device)
            dist = MultivariateNormal(actor_out, covariance)

            # for single action continuous environments
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            dist = Categorical(actor_out)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


class PPO:
    """PPO agent with a clipped surrogate objective.

    Holds two ``ActorCritic`` networks: ``policy`` (trained) and
    ``policy_old`` (used to act; synced to ``policy`` after every update),
    plus a ``RolloutBuffer`` that ``select_action`` fills with states, actions
    and log-probs. The caller appends rewards and terminal flags to the same
    buffer.

    Args:
        state_dim: Observation width.
        action_dim: Number of actions (discrete) or action width (continuous).
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: Discount factor.
        K_epochs: Gradient steps per call to ``update``.
        eps_clip: Clip range for the probability ratio.
        has_continuous_action_space: Selects the policy type.
        action_std_init: Initial action std; only used for continuous policies.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        """Set the action std on both networks; warns for discrete policies."""
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)

        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        """Lower the action std by a fixed step, never below ``min_action_std``."""
        print("--------------------------------------------------------------------------------------------")

        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)

        else:
            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")

        print("--------------------------------------------------------------------------------------------")

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record it in the buffer.

        Returns:
            A flat NumPy array for continuous policies, a Python scalar for
            discrete ones.
        """
        # Sampling and bookkeeping are identical for both action-space types;
        # only the return format differs.
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        if self.has_continuous_action_space:
            return action.detach().cpu().numpy().flatten()
        return action.item()

    def update(self):
        """Run ``K_epochs`` optimisation steps on the buffered rollout, then
        sync ``policy_old`` and clear the buffer.

        Does nothing when no rewards have been recorded yet: normalising an
        empty reward tensor would produce NaNs or a shape error.
        """
        if not self.buffer.rewards:
            return

        # Monte Carlo estimate of returns, accumulated back-to-front.
        # Appending and reversing once is O(n); inserting at index 0 on every
        # step is O(n^2).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                # Episode boundary: do not bootstrap across it.
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the rewards
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write the weights of ``policy_old`` to *checkpoint_path*."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load weights from *checkpoint_path* into both networks.

        NOTE: ``torch.load`` unpickles the file; only load checkpoints from a
        trusted source.
        """
        # Deserialize once (onto CPU storage) and reuse for both networks.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)
        
        
       



# 👟 Define Training Logic

### 3️⃣ Step 3. Track gradients with `wandb.watch` and everything else with `wandb.log`

In [12]:
@measure
def train(model, env, criterion, optimizer, config):
    """Train *model* for ``config.NUM_TRAINING_STEPS`` steps, logging to W&B.

    Each step collects ``config.NUM_NEW_EXP`` new experiences with
    ``epsilon=0.1``, folds them into a shuffled buffer, updates the network
    from that buffer, then collects 100 experiences with ``epsilon=0`` and
    logs the resulting reward.

    Args:
        model: Network to train; also registered with ``wandb.watch``.
        env: Environment passed to ``Trainer.generate_trajectories``.
        criterion: Loss passed to ``Trainer.update_q_net``.
        optimizer: Optimizer passed to ``Trainer.update_q_net``.
        config: Object exposing ``NUM_TRAINING_STEPS``, ``NUM_NEW_EXP`` and
            ``BUFFER_SIZE``.
    """
    wandb.watch(model, criterion, log="all", log_freq=10)

    # Plain lists: the previous local annotations named `Buffer` and `List`,
    # neither of which is defined or imported in the visible file.
    experiences = []          # experience buffer fed to the Q-network update
    cumulative_rewards = []   # one greedy-evaluation reward per training step

    NUM_NEW_EXP = config.NUM_NEW_EXP
    BUFFER_SIZE = config.BUFFER_SIZE

    for episode in range(config.NUM_TRAINING_STEPS):
        new_exp, _ = Trainer.generate_trajectories(env, model, NUM_NEW_EXP, epsilon=0.1)

        # Shuffle before truncating so the dropped experiences are a random
        # subset; the buffer is trimmed before the new batch is added.
        random.shuffle(experiences)
        if len(experiences) > BUFFER_SIZE:
            experiences = experiences[:BUFFER_SIZE]
        experiences.extend(new_exp)

        Trainer.update_q_net(model, criterion, optimizer, experiences, 3)

        # Greedy rollout (epsilon=0) to measure current performance.
        _, rewards = Trainer.generate_trajectories(env, model, 100, epsilon=0)
        cumulative_rewards.append(rewards)
        # print("Training step ", episode + 1, "\treward ", episode_rewards)
        train_log(rewards, episode)

In [None]:


print("============================================================================================")


################################### Training ###################################

# Hyperparameters for the PPO run. Every entry ends with a comma: the
# previous version omitted three of them, which made the cell a SyntaxError.
config = dict(
    # Env HP
    worker_id=0,
    time_scale=20,
    no_graphics=True,
    # Generic HP
    NUM_TEST_STEPS=10,
    NUM_TRAINING_STEPS=int(1e4),
    # Algorithm HP
    max_ep_len=800,
    has_continuous_action_space=False,
    K_epochs=40,               # update policy for K epochs
    eps_clip=0.2,              # clip parameter for PPO
    gamma=0.9,                 # discount factor
    lr_actor=0.0003,
    lr_critic=0.001,
    )
####### initialize environment hyperparameters ######

# `config` is a plain dict in this cell, so values are read by key;
# attribute access (config.max_ep_len) raises AttributeError on a dict.
has_continuous_action_space = config["has_continuous_action_space"]

max_ep_len = config["max_ep_len"]                       # max timesteps in one episode
max_training_timesteps = config["NUM_TRAINING_STEPS"]   # break training loop if timesteps > max_training_timesteps

print_freq = max_ep_len * 4     # print avg reward in the interval (in num timesteps)
log_freq = max_ep_len * 2       # log avg reward in the interval (in num timesteps)
# NOTE(review): 2e4 is larger than NUM_TRAINING_STEPS (1e4) above, so with these
# values no checkpoint would be written during training - confirm this is intended.
save_model_freq = int(2e4)      # save model frequency (in num timesteps)

action_std = None


#####################################################


## Note : print/log frequencies should be greater than max_ep_len


################ PPO hyperparameters ################

update_timestep = max_ep_len * 4      # update policy every n timesteps
K_epochs = config["K_epochs"]         # update policy for K epochs
eps_clip = config["eps_clip"]         # clip parameter for PPO
gamma = config["gamma"]               # discount factor

lr_actor = config["lr_actor"]         # learning rate for actor network
lr_critic = config["lr_critic"]       # learning rate for critic network

random_seed = 0         # set random seed if required (0 = no random seed)

#####################################################



# NOTE(review): `env_name` is not assigned anywhere in the visible part of this
# file - confirm it is defined before this cell runs.
print("training environment name : " + env_name)


# state space dimension
# NOTE(review): `observation_space` / `action_space` are Gym-style attributes,
# while `make_env` in this file returns an ML-Agents `UnityEnvironment` and the
# module-level `env` starts as None - confirm which object `env` is here.
state_dim = env.observation_space.shape[0]

# action space dimension
if has_continuous_action_space:
    action_dim = env.action_space.shape[0]
else:
    action_dim = env.action_space.n



###################### logging ######################

#### log files for multiple runs are NOT overwritten

# One makedirs call creates both "PPO_logs" and the per-environment folder;
# exist_ok=True replaces the check-then-create pattern, which can race with
# another process creating the same directory.
log_dir = "PPO_logs" + '/' + env_name + '/'
os.makedirs(log_dir, exist_ok=True)


#### get number of log files in log directory
# Third element of the first os.walk() entry = files directly inside log_dir.
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)


#### create new log file for each run
# log_dir already ends with '/', so no extra separator is added here.
log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

#####################################################


################### checkpointing ###################

run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

directory = "PPO_preTrained" + '/' + env_name + '/'
os.makedirs(directory, exist_ok=True)


checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
print("save checkpoint path : " + checkpoint_path)

#####################################################


############# print all hyperparameters #############

# Echo the run configuration so the console output is self-describing.
print("--------------------------------------------------------------------------------------------")

print("max training timesteps : ", max_training_timesteps)
print("max timesteps per episode : ", max_ep_len)

print("model saving frequency : " + str(save_model_freq) + " timesteps")
print("log frequency : " + str(log_freq) + " timesteps")
print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")

print("--------------------------------------------------------------------------------------------")

print("state space dimension : ", state_dim)
print("action space dimension : ", action_dim)

print("--------------------------------------------------------------------------------------------")

if has_continuous_action_space:
    # NOTE(review): `action_std_decay_rate`, `min_action_std` and
    # `action_std_decay_freq` are not assigned in the visible part of this file -
    # this branch would raise NameError unless they are defined elsewhere.
    print("Initializing a continuous action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("starting std of action distribution : ", action_std)
    print("decay rate of std of action distribution : ", action_std_decay_rate)
    print("minimum std of action distribution : ", min_action_std)
    print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")

else:
    print("Initializing a discrete action space policy")

print("--------------------------------------------------------------------------------------------")

print("PPO update frequency : " + str(update_timestep) + " timesteps") 
print("PPO K epochs : ", K_epochs)
print("PPO epsilon clip : ", eps_clip)
print("discount factor (gamma) : ", gamma)

print("--------------------------------------------------------------------------------------------")

print("optimizer learning rate actor : ", lr_actor)
print("optimizer learning rate critic : ", lr_critic)

# Seeding is skipped when random_seed is 0 (falsy).
if random_seed:
    print("--------------------------------------------------------------------------------------------")
    print("setting random seed to ", random_seed)
    torch.manual_seed(random_seed)
    # NOTE(review): `env.seed` is a Gym-style call - confirm `env` supports it.
    env.seed(random_seed)
    np.random.seed(random_seed)

#####################################################

print("============================================================================================")

################# training procedure ################

# initialize a PPO agent
ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)


# track total training time
# NOTE: datetime.now() is local wall-clock time, so the "(GMT)" label printed
# below is only accurate when the machine's clock is set to UTC.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)

print("============================================================================================")


# printing and logging variables
print_running_reward = 0
print_running_episodes = 0

log_running_reward = 0
log_running_episodes = 0

time_step = 0
i_episode = 0


# The log file (via `with`) and the environment (via `finally`) are released
# even when the training loop raises or is interrupted.
try:
    # logging file
    with open(log_f_name, "w+") as log_f:
        log_f.write('episode,timestep,reward\n')

        # training loop
        while time_step <= max_training_timesteps:

            state = env.reset()
            current_ep_reward = 0

            for t in range(1, max_ep_len+1):

                # select action with policy
                action = ppo_agent.select_action(state)
                state, reward, done, _ = env.step(action)

                # saving reward and is_terminals
                ppo_agent.buffer.rewards.append(reward)
                ppo_agent.buffer.is_terminals.append(done)

                time_step += 1
                current_ep_reward += reward

                # update PPO agent
                if time_step % update_timestep == 0:
                    ppo_agent.update()

                # if continuous action space; then decay action std of output action distribution
                if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                    ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

                # log in logging file
                if time_step % log_freq == 0:

                    # log average reward till last episode
                    log_avg_reward = log_running_reward / log_running_episodes
                    log_avg_reward = round(log_avg_reward, 4)

                    log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                    log_f.flush()

                    log_running_reward = 0
                    log_running_episodes = 0

                # printing average reward
                if time_step % print_freq == 0:

                    # print average reward till last episode
                    print_avg_reward = print_running_reward / print_running_episodes
                    print_avg_reward = round(print_avg_reward, 2)

                    print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                    print_running_reward = 0
                    print_running_episodes = 0

                # save model weights
                if time_step % save_model_freq == 0:
                    print("--------------------------------------------------------------------------------------------")
                    print("saving model at : " + checkpoint_path)
                    ppo_agent.save(checkpoint_path)
                    print("model saved")
                    print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                    print("--------------------------------------------------------------------------------------------")

                # break; if the episode is over
                if done:
                    break

            # Per-episode totals feed both the console and the CSV averages.
            print_running_reward += current_ep_reward
            print_running_episodes += 1

            log_running_reward += current_ep_reward
            log_running_episodes += 1

            i_episode += 1
finally:
    env.close()




# print total training time
print("============================================================================================")
end_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)
print("Finished training at (GMT) : ", end_time)
print("Total training time  : ", end_time - start_time)
print("============================================================================================")







In [14]:
def train_log(reward, episode):
    """Report one reward value to W&B and to stdout.

    Args:
        reward: Value convertible with ``float``.
        episode: Episode index; it is incremented by one before being logged
            and printed.
    """
    reward_value = float(reward)
    episode_number = episode + 1

    # where the magic happens
    wandb.log({"episode": episode_number, "reward": reward_value})  # , step=example_ct
    print(f"Reward after {str(episode_number).zfill(5)} episodes: {reward_value:.3f}")

# 🧪 Define Testing Logic

#### 4️⃣ Optional Step 4: Call `wandb.save`

In [15]:
def test(model, env, config):
    """Evaluate *model* for ``config.NUM_TEST_STEPS`` rounds and log the mean reward.

    Each round mirrors the data collection of ``train`` (without the network
    update), then records the reward of a greedy (``epsilon=0``) rollout.
    The mean over all rounds is printed and logged to W&B as
    ``test_average_reward``. Finally the model is exported to ``model.onnx``
    and uploaded with ``wandb.save``.

    Args:
        model: Network to evaluate; switched to eval mode.
        env: Environment passed to ``Trainer.generate_trajectories``.
        config: Object exposing ``NUM_TEST_STEPS``, ``NUM_NEW_EXP`` and
            ``BUFFER_SIZE``.
    """
    model.eval()

    # Run the model on some test examples
    with torch.no_grad():
        # Plain lists: the previous local annotations named `Buffer` and
        # `List`, which are not defined or imported in the visible file.
        experiences = []
        cumulative_rewards = []
        NUM_NEW_EXP = config.NUM_NEW_EXP
        BUFFER_SIZE = config.BUFFER_SIZE

        for _ in range(config.NUM_TEST_STEPS):
            new_exp, _unused = Trainer.generate_trajectories(env, model, NUM_NEW_EXP, epsilon=0.1)
            random.shuffle(experiences)
            if len(experiences) > BUFFER_SIZE:
                experiences = experiences[:BUFFER_SIZE]
            experiences.extend(new_exp)
#                 Trainer.update_q_net(model, criterion, optimizer, experiences, 3)
            _unused, rewards = Trainer.generate_trajectories(env, model, 100, epsilon=0)
            cumulative_rewards.append(rewards)
            # print("Training step ", episode + 1, "\treward ", episode_rewards)

        # NumPy is imported as `np` in this file; the bare name `numpy` is
        # not bound and raised NameError here.
        average_reward = np.mean(cumulative_rewards)

        print(f"Average reward of the model after {config.NUM_TEST_STEPS} " +
              f"test episodes: {average_reward}%")

        wandb.log({"test_average_reward": average_reward})

    # Save the model in the exchangeable ONNX format
    # NOTE(review): the export is given an empty argument list as example
    # input - confirm this works for the model being exported.
    torch.onnx.export(model, [], "model.onnx")
    wandb.save("model.onnx")

# 🏃‍♀️ Run training and watch your metrics live on [wandb.ai](https://wandb.ai)!

In [None]:
# Build, train and analyze the model with the pipeline
# `model_pipeline` returns only the model (it stores the environment in the
# global `env` and closes it itself), so there is a single value to bind;
# unpacking two values from it raises TypeError.
model = model_pipeline(config)

Environment created.
Name of the behavior : Hummingbird?team=0
Type of the spec : BehaviorSpec(observation_specs=[ObservationSpec(shape=(44,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='RayPerceptionSensor'), ObservationSpec(shape=(3,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='StackingSensor_size3_VectorSensor_size1')], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3)))
QNetwork(
  (dense1): Linear(in_features=44, out_features=126, bias=True)
  (dense2): Linear(in_features=126, out_features=126, bias=True)
  (dense2_x1): Linear(in_features=126, out_features=3, bias=True)
  (dense2_x2): Linear(in_features=126, out_features=3, bias=True)
  (dense2_x3): Linear(in_features=126, out_features=3, bias=True)
  (act): ReLU()
  (act_out): Sigmoid()
)
Reward after 00001 episodes: -1.000
Reward after 00002 episodes: 21.312
Reward after 00003 episodes: 36.

# 🧹 Test Hyperparameters with Sweeps


## [Check out Hyperparameter Optimization in PyTorch using W&B Sweep $\rightarrow$](https://colab.research.google.com/drive/1QTIK23LBuAkdejbrvdP5hwBGyYlyEJpT?usp=sharing)

Running a hyperparameter sweep with Weights & Biases is very easy. There are just 3 simple steps:

1. **Define the sweep:** We do this by creating a dictionary or a [YAML file](https://docs.wandb.com/library/sweeps/configuration) that specifies the parameters to search through, the search strategy, the optimization metric, and so on.

2. **Initialize the sweep:** 
`sweep_id = wandb.sweep(sweep_config)`

3. **Run the sweep agent:** 
`wandb.agent(sweep_id, function=train)`

And voila! That's all there is to running a hyperparameter sweep!
<img src="https://imgur.com/UiQKg0L.png" alt="Weights & Biases" />

# 🤓 Advanced Setup
1. [Environment variables](https://docs.wandb.com/library/environment-variables): Set API keys in environment variables so you can run training on a managed cluster.
2. [Offline mode](https://docs.wandb.com/library/technical-faq#can-i-run-wandb-offline): Use `dryrun` mode to train offline and sync results later.
3. [On-prem](https://docs.wandb.com/self-hosted): Install W&B in a private cloud or air-gapped servers in your own infrastructure. We have local installations for everyone from academics to enterprise teams.
4. [Sweeps](https://docs.wandb.com/sweeps): Set up hyperparameter search quickly with our lightweight tool for tuning.

In [None]:
# for n in range(NUM_TRAINING_STEPS):
#   new_exp,_ = Trainer.generate_trajectories(env, qnet, NUM_NEW_EXP, epsilon=0.1)
#   random.shuffle(experiences)
#   if len(experiences) > BUFFER_SIZE:
#     experiences = experiences[:BUFFER_SIZE]
#   experiences.extend(new_exp)
#   Trainer.update_q_net(qnet, optim, experiences, 3)
#   _, rewards = Trainer.generate_trajectories(env, qnet, 100, epsilon=0)
#   cumulative_rewards.append(rewards)
#   print("Training step ", n+1, "\treward ", rewards)
#   print()


# env.close()

# # Show the training graph
# plt.plot(range(NUM_TRAINING_STEPS), cumulative_rewards)

In [None]:
#  class UnityEnv(gym.Env):
#     """
#     Provides Gym wrapper for Unity Learning Environments.
#     Multi-agent environments use lists for object types, as done here:
#     https://github.com/openai/multiagent-particle-envs
#     """
 
#     def __init__(
#         self,
#         environment_filename: str,
#         dimensions: int = [],   #Added
#         timescale: int = 1,     #Added
#         worker_id: int = 0,
#         use_visual: bool = False,
#         uint8_visual: bool = False,
#         multiagent: bool = False,
#         flatten_branched: bool = False,
#         no_graphics: bool = False,
#         allow_multiple_visual_obs: bool = False,
#         set_config: bool = True,    #Added
#     ):
#         """
#         Environment initialization
#         :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
#         :param worker_id: Worker number for environment.
#         :param use_visual: Whether to use visual observation or vector observation.
#         :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
#         :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
#         :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than
#             MultiDiscrete.
#         :param no_graphics: Whether to run the Unity simulator in no-graphics mode
#         :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
#         """
#         base_port = 5005
#         if environment_filename is None:
#             base_port = UnityEnvironment.DEFAULT_EDITOR_PORT
 
#         channel = EngineConfigurationChannel()        # Added
 
 
#         #Added
#         if set_config == True:
#             channel.set_configuration_parameters(time_scale=timescale, width=dimensions[0], height=dimensions[1])
#         #Added
 
#         self._env = UnityEnvironment(
#             environment_filename,
#             worker_id,
#             base_port=base_port,
#             no_graphics=no_graphics,
#             side_channels=[channel],        # Added
#         )