# Optical RL-Gym

## Training the Stable Baselines agents using the DeepRMSA environment

This file contains examples of how to train agents for the DeepRMSA environment.

The agents used in this file come from the [Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3) framework.

This notebook is based upon the one available [here](https://github.com/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb).

Before running this notebook, make sure to install Stable Baselines and the Optical RL-Gym in your Python environment.

### General imports

In [2]:
import os
import pickle
import numpy as np
from IPython.display import clear_output

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [3]:
import tensorflow as tf
# Silence TensorFlow warnings: only messages at FATAL level are let through
# by the 'tensorflow' logger.
import logging
logging.getLogger('tensorflow').setLevel(logging.FATAL)
tf.__version__ # printing out the TensorFlow version used

'2.10.0'

### Stable Baselines imports

In [4]:
import stable_baselines3
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

from stable_baselines3 import DQN
from stable_baselines3.dqn.policies import MlpPolicy 

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter

stable_baselines3.__version__ # printing out stable_baselines version used

  from .autonotebook import tqdm as notebook_tqdm


'1.7.0'

### Environment imports

In this particular example, there is no need to import anything specific to the Optical RL-Gym. Importing OpenAI Gym below already provides all the functionality needed.

In [5]:
import gym

### Define a callback function

In [6]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    The model is saved whenever the mean training reward improves, regardless
    of the verbosity level; ``verbose`` only controls what is printed.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        # Any finite mean reward beats the initial value.
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Create the folder at ``save_path`` if it does not exist yet."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Check the training reward every ``check_freq`` calls and save the
        model when the mean reward reaches a new best.

        :return: (bool) Always ``True``.
        """
        # Only run the check every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward from the monitor files in ``log_dir``
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # Nothing has been logged yet, so there is nothing to compare.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {} - ".format(self.num_timesteps),end="")
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: record the reward and save the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            # Saving must not depend on verbosity, otherwise a silent run
            # (verbose=0) would never store the best model.
            self.model.save(self.save_path)

        if self.verbose > 0:
            # wait=True defers clearing until new output is available.
            clear_output(wait=True)

        return True

### Setting up the environment

The parameters are set as in the [DeepRMSA](https://doi.org/10.1109/JLT.2019.2923615) work and its [available repository](https://github.com/xiaoliangchenUCD/DeepRMSA).

In [7]:
# Identifiers of the experiment; top_name and k_path select the topology file.
alg_name = 'DQN'
top_name = 'cost239'
k_path = 10

# Path of the pickled topology, resolved below relative to the parent directory.
topology_dir = ''.join(['/topologies/demo/', top_name, f'_{k_path}.h5'])
with open(f'..{topology_dir}', 'rb') as topology_file:
    topology = pickle.load(topology_file)

# Eleven equal request probabilities of 1/11 each.
node_request_probabilities = np.full(11, 1/11)

# Additional info entries that the Monitor wrapper records on top of its defaults.
monitored_info_keys = (
    'episode_service_blocking_rate', 'bit_rate_blocking_rate',
    'failure', 'episode_failure',
    'failure_slots', 'episode_failure_slots',
    'failure_disjointness', 'episode_failure_disjointness',
    'failure_shared_disjointness', 'episode_failure_shared_disjointness',
    'shared_counter', 'episode_shared_counter',
    'dpp_counter', 'episode_dpp_counter',
    'compactness', 'throughput',
    'available_slots_working', 'available_slots_backup',
)

# One training run per mean service holding time.
for ht in [50]:
    env_args = {
        'topology': topology,
        'seed': 10,
        'allow_rejection': False,  # the agent cannot proactively reject a request
        'j': 1,  # consider only the first suitable spectrum block for the spectrum assignment
        'mean_service_holding_time': ht,  # value is not set as in the paper to achieve comparable reward values
        'episode_length': 50,
        'node_request_probabilities': node_request_probabilities,
        'num_spectrum_resources': 358,
    }

    # Folder holding the monitor log and the best model of this run.
    log_dir = "./tmp/deeprmsa-dqn-sbpp-agent-{}-cost239/".format(ht)
    os.makedirs(log_dir, exist_ok=True)
    callback = SaveOnBestTrainingRewardCallback(check_freq=50, log_dir=log_dir)

    # The Monitor wrapper is given the path prefix log_dir + 'training' for its
    # log file and also records the extra info entries listed above.
    env = Monitor(
        gym.make('DeepRMSA-v0', **env_args),
        log_dir + 'training',
        info_keywords=monitored_info_keys,
    )

    # kwargs = {'double_q': True, 'prioritized_replay': True, 'policy_kwargs': dict(dueling=True)} # set of parameters for testing
    # Four layers of 128 units each.
    policy_kwargs = {'net_arch': [128] * 4}

    model = DQN(
        MlpPolicy,
        env,
        verbose=0,
        tensorboard_log="./tb/DQN-sbpp-agent-{}-cost239-DeepRMSA-v0/".format(ht),
        gamma=0.85,
        policy_kwargs=policy_kwargs,
        learning_rate=0.0001,
        exploration_fraction=0.01,
        batch_size=256,
        buffer_size=50000,
    )

    training_m = model.learn(total_timesteps=200000, callback=callback)


Num timesteps: 200000 - Best mean reward: 22.17 - Last mean reward per episode: 20.71


In [7]:
# Identifiers of the experiment; top_name and k_path select the topology file.
alg_name = 'DQN'
top_name = 'cost239'
k_path = 10

# Path of the pickled topology, resolved below relative to the parent directory.
topology_dir = ''.join(['/topologies/demo/', top_name, f'_{k_path}.h5'])
with open(f'..{topology_dir}', 'rb') as topology_file:
    topology = pickle.load(topology_file)

# Eleven equal request probabilities of 1/11 each.
node_request_probabilities = np.full(11, 1/11)

# Additional info entries that the Monitor wrapper records on top of its defaults.
monitored_info_keys = (
    'episode_service_blocking_rate', 'bit_rate_blocking_rate',
    'failure', 'episode_failure',
    'failure_slots', 'episode_failure_slots',
    'failure_disjointness', 'episode_failure_disjointness',
    'failure_shared_disjointness', 'episode_failure_shared_disjointness',
    'shared_counter', 'episode_shared_counter',
    'dpp_counter', 'episode_dpp_counter',
    'compactness', 'throughput',
    'available_slots_working', 'available_slots_backup',
)

# One short training run per mean service holding time.
for ht in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    env_args = {
        'topology': topology,
        'seed': 10,
        'allow_rejection': False,  # the agent cannot proactively reject a request
        'j': 1,  # consider only the first suitable spectrum block for the spectrum assignment
        'mean_service_holding_time': ht,  # value is not set as in the paper to achieve comparable reward values
        'episode_length': 50,
        'node_request_probabilities': node_request_probabilities,
        'num_spectrum_resources': 358,
    }

    # Folder holding the monitor log and the best model of this run.
    log_dir = "./tmp/deeprmsa-dqn-sbpp-heuristic-{}-cost239/".format(ht)
    os.makedirs(log_dir, exist_ok=True)
    callback = SaveOnBestTrainingRewardCallback(check_freq=50, log_dir=log_dir)

    # The Monitor wrapper is given the path prefix log_dir + 'training' for its
    # log file and also records the extra info entries listed above.
    env = Monitor(
        gym.make('DeepRMSAKSP-v0', **env_args),
        log_dir + 'training',
        info_keywords=monitored_info_keys,
    )

    # kwargs = {'double_q': True, 'prioritized_replay': True, 'policy_kwargs': dict(dueling=True)} # set of parameters for testing
    # Four layers of 128 units each.
    policy_kwargs = {'net_arch': [128] * 4}

    model = DQN(
        MlpPolicy,
        env,
        verbose=0,
        tensorboard_log="./tb/DQN-sbpp-heuristic-{}-cost239-DeepRMSAKSP-v0/".format(ht),
        gamma=0.85,
        policy_kwargs=policy_kwargs,
        learning_rate=0.0001,
        exploration_fraction=0.01,
        batch_size=256,
        buffer_size=50000,
    )

    training_m = model.learn(total_timesteps=1000, callback=callback)



Num timesteps: 1000 - Best mean reward: 69.00 - Last mean reward per episode: 12.25
