In [1]:
import torch
import torch.nn as nn
import math
import numpy as np

np.random.seed(42)

import sys
print(sys.executable) # just to check which python

import gym
from gym import spaces

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

/usr/local/opt/python@3.9/bin/python3.9


In [2]:
class RequestType:
    def __init__(self, request_type, bandwidth, service_rate, arrival_rate, source, sink, distribution, switch_rate=None):
        # distribution is 1x2 if elastic and 1x1 if static
        
        self.type = request_type
        self.bw = bandwidth
        self.service_rate = service_rate
        self.arrival_rate = arrival_rate
        self.source = source
        self.sink = sink
        self.distribution = distribution
        self.switch_rate = switch_rate
        
        self.num_made = 0
        self.num_accepted = 0

class Request:
    def __init__(self, request_type, service_time, arrival_time, source, sink, transfer_rate, distribution=None, parent_elastic=None, bw_dist=None, request_type_template=None):
        self.type = request_type
        self.service_time = service_time
        self.arrival_time = arrival_time
        self.source = source
        self.sink = sink
        self.bw = transfer_rate
        self.request_type = request_type
        self.parent_elastic = parent_elastic
        self.accepted = None
        self.path = None
        self.bw_dist = bw_dist
        
        self.blueprint = request_type_template
        
        if request_type == "elastic":
            self.distribution = distribution
            self.scale_requests = []
                   
        if request_type_template is not None:
            request_type_template.num_made += 1
            
    def add_scale_request(self, req): 
        # we store related scale requests for elastic requests
        # not used if static request
        self.scale_requests.append(req)
            
    def get_encoding(self, nodes_in_environment):
        # as per our notes, this SHOULD return 1x5 tensor,
        # but we have one hot encodings INSIDE this tensor,
        # so we will flatten this and return, so the size will be
        # larger than 1x5
        
        # nodes_in_environment is a list of all the nodes in our graph
        # eg ["a", "b", "c"]
        
        # request is [one hot source, one hot destination, bw, service time, one hot type]
                
        one_hot_source = nn.functional.one_hot(torch.tensor([nodes_in_environment.index(self.source)]), num_classes=len(nodes_in_environment)).flatten()
        one_hot_dest   = nn.functional.one_hot(torch.tensor([nodes_in_environment.index(self.sink)]), num_classes=len(nodes_in_environment)).flatten()
    
        if self.request_type == "static" or self.request_type == "scale":
            one_hot_type = torch.tensor([1, 0])
        elif self.request_type == "elastic":
            one_hot_type = torch.tensor([0, 1])
            
        encoding = torch.cat([one_hot_source, 
                             one_hot_dest,
                             torch.tensor([self.bw]), 
                             torch.tensor([self.service_time]),
                             one_hot_type])
        
        return encoding

In [3]:
class Link:
    def __init__(self, node_1, node_2, bw_capacity):
        self.serving_requests = []
        self.nodes = [node_1, node_2]
        self.total_bw = bw_capacity
        
    def reset(self):
        self.serving_requests = []
        
    def add_request(self, request_obj):
        self.serving_requests.append(request_obj)
        
    def remove_request(self, request_obj):
        self.serving_requests.remove(request_obj)
        
    def remaining_bw(self): 
        # subtracting bw being used from total bw capacity
        bw_being_used = 0
        for req in self.serving_requests:
            bw_being_used += req.bw
            
        return (self.total_bw - bw_being_used)

In [4]:
class Environment(gym.Env):
    # requests_in_service_encoder = nn.RNN(????, 7)
    metadata = {'render.modes': ['human']}
    
    def __init__(self, nodes, links, request_blueprints, use_RNN=False, sb3_compat=False):
        super(Environment, self).__init__()
                
        """
        nodes: list of strings where each string is just a name or identifier of a node
        links: list of tuples where in tuple t, t[0] is first node, t[1] is another node, and t[2] is bw capacity of the link
        request_blueprints: list of DeploymentRequest objects
        """
        self.nodes = nodes
        self.links = {}
        self.request_history = []
        self.E_history = []
        self.past_distributions = []
        self.request_blueprints = request_blueprints
        self.last_time = 0
        self.episode_timesteps = 600
        self.use_RNN = use_RNN
        self.sb3_compat = sb3_compat
        
        if sb3_compat:
            self.request_being_considered = None
        
        for link in links:
            if link[0] not in self.nodes or link[1] not in self.nodes:
                raise Exception("Node in link " + str(link) + " doesn't exist")
            
            link_obj = Link(*link)

            self.links[link[0] + link[1]] = link_obj
            self.links[link[1] + link[0]] = link_obj
            
        self.request_list = self.create_requests()
        self.request_queue = iter(self.request_list)
        
        
        # Setup gym-specific code
        env_encoding_size = self.get_encoding(increment_iterator=False).size()
        req_encoding_size = self.request_list[0].get_encoding(self.nodes).size()
        # print(env_encoding_size[0] + req_encoding_size[0])
        
        self.action_space = spaces.Box(low=0, high=math.inf,
                                      shape=(4,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-math.inf, high=math.inf,
                                      shape=(env_encoding_size[0] + req_encoding_size[0],), dtype=np.float32)
        
        # TODO, WRITE RNN logic
        
        #if use_RNN:
        #    self.requests_in_service_encoder = nn.RNN
        
    def add_request(self, request, path=None): # we want to add this request to a link or path
        # path: a list of nodes that the request traverses including source and sink
        # if no path is specified, path is assumed to be [req.source, req.sink]
        
        if path is not None: 
            nodes = [[path[i], path[i + 1]] for i in range(len(path) - 1)]
            for node_pair in nodes:
                env.links[node_pair[0] + node_pair[1]].add_request(request)
        
        else:
            self.links[request.source + request.sink].add_request(request)
        
        request.accepted = True
        request.blueprint.num_accepted += 1
        self.request_history.append(request)
        # print(self.links[request.source + request.sink])
    
    def reset(self):
        for link in self.links.values():
            link.reset()
        self.request_history = []
        self.E_history = []
        self.past_distributions = []
        self.last_time = 0
        self.request_list = env.create_requests()
        self.request_queue = iter(self.request_list)
        
        return env.get_encoding()
        
    def reward(self, request, decision):
        base_rate = 1         # 1 when static
        type_bonus = 0.9      # 0.9 when static
        bw = request.bw
        if request.type == "elastic":
            base_rate = request.bw
            type_bonus = 1.1                # 1.1 when elastic
            bw = np.array(request.bw_dist).dot(request.distribution)
            
            
        r = bw * base_rate * request.service_time * type_bonus
        
        # if remaining bandwidth on link(s) < 0, very "bad" reward
        if request.path is not None:
            path_length = len(request.path)
            
            r *= math.pow(0.9, path_length - 2)
            
            nodes = [[request.path[i], request.path[i + 1]] for i in range(len(request.path) - 1)]
            for node_pair in nodes:
                if self.links[node_pair[0] + node_pair[1]].remaining_bw() < 0:
                    return (-r * 10)
        else:
            # path is direct, so no decrease of reward needed
            remaining_bw = self.links[request.source + request.sink].remaining_bw()
            if remaining_bw < 0:
                return (-r * 10)
        
        if decision == "accept":
            return r
        
        if decision == "reject":
            if request.type == "static" or request.type == "scale":
                return 0
            elif request.type == "elastic":
                if len(self.past_distributions) == 0:
                    return -1 * r
                
                else:
                    current_sum = torch.from_numpy(np.sum(self.past_distributions, axis=0))

                    average_past_distribution = current_sum / len(self.past_distributions)
                    current_req_distribution = torch.tensor(request.distribution)

                    return -1 * r * math.exp(-nn.functional.kl_div(average_past_distribution, current_req_distribution))

                """
                past_distributions = []
                for req in self.request_history:
                    if req.request_type == "elastic":
                        past_distributions.append(req.distribution)
                
                average_past_distribution = torch.mean(past_distributions, dim=1)
                current_req_distribution = torch.tensor(request.distribution)
                
                if bool(average_past_distribution[0] < current_req_distribution[0]):
                    return -1 * r * math.exp(-nn.functional.kl_div(average_past_distribution, current_req_distribution))
                else:
                    return 0
                """
                
    def next_req(self):
        next_req = next(self.request_queue)
        if self.sb3_compat:
            self.request_being_considered = next_req
        return next_req
                
    def step(self, action, req=None):
        # what happens if we have two requests that come in on the same timestep but there is only enough bandwidth for one?
        # do we the decision on the second request with knowledge of the first request
        # essentially, after we accept the first request, will we submit an updated encoding of the network to the policy network?
 
        # actions is a Nx2 matrix where the first column in the request and second is the decision
        # decision is either "accept" or "reject"
        # this is given by our agent
        
        # if req is None, that means we are using sb3_compat=True and we can get the req from self.request_being_considered
                
        if req is None:
            req = self.request_being_considered
            
        if action[0] > 0.5:
            # accept request
            paths = (env.search(req.source, req.sink, [], []))
            paths.sort(key=lambda x: len(x)) # sort by shortest path
            # select the path we are using
            path = paths[action[1:4].argmax()]
            
            self.add_request(req, path)
        
            reward = env.reward(req, "accept")
        elif action[0] < 0.5:
            # reject
            reward = env.reward(req, "reject")
        
        obs = env.get_encoding()
        
        done = req.arrival_time > 600
        info = {}
        
        return obs, reward, done, info
        
    def update_requests(self, current_time):
        # here, we remove expired requests and update E_history based off of the request stats
        
        for link in self.links.values():
            for request in link.serving_requests.copy():
                if (request.arrival_time + request.service_time) > self.last_time and (request.arrival_time + request.service_time) < current_time:
                    # request has expired, let's remove it from the links
                    for link in self.links.values():
                        if request in link.serving_requests:
                            link.remove_request(request)

                    if request.type == "elastic":
                        time_on_higher_bw = 0
                        for scale_req in request.scale_requests:
                            time_on_higher_bw += scale_req.service_time

                        time_on_lower_bw = request.service_time - time_on_higher_bw

                        # calculate E[history]
                        request_time = np.array([time_on_lower_bw, time_on_higher_bw])
                        request_bw = request.bw
                        result = (request_time / request_time.sum()).dot(request_bw)
                        self.past_distributions.append(request_time / request_time.sum())
                        self.E_history.append(result)

    def get_encoding(self, increment_iterator=True):
        links_processed = [] 
        # these will store links that we have already encoded so we don't encode them again
        
        current_encoding = []
        
        # h = torch.zeros(7) # assuming 7 for h0 size
        # last_out = None
        
        env_encoding = []
        
        if increment_iterator:
            next_req = self.next_req()
            """
            while next_req.type == "scale":
                if next_req.parent_elastic.accepted:
                    next_req.accepted = True # we must accept since we accepted elastic req
                    self.add_request(next_req, next_req.parent_elastic.path)
                next_req = self.next_req()
            """
            self.update_requests(next_req.arrival_time)

        for link in self.links.values():
            if link in links_processed:
                continue

                        
            # Commented because we don't want to encode any queue for phase 1
            
            # for req in link.serving_requests
                # request is [one hot source, one hot destination, bw, service time, one hot type]
                
                # one_hot_source = nn.functional.one_hot(torch.tensor([self.nodes.index(req.source)]), num_classes=len(self.nodes))
                # one_hot_dest   = nn.functional.one_hot(torch.tensor([self.nodes.index(req.sink)]), num_classes=len(self.nodes))

                # req_tensor = torch.Tensor([]) # mismatched dimensions??!
                # last_out, h = self.requests_in_service_encoder(req_tensor, h)

            # current_encoding.append(torch.cat(torch.Tensor([link.remaining_bw]), last_out))
            # torch.stack(current_encoding)
            
            # check implementation later
            
            env_encoding.append(link.remaining_bw())
            
            links_processed.append(link)
            
        if not increment_iterator:
            return torch.tensor(env_encoding)
        
        if self.sb3_compat:
            return torch.cat([torch.tensor(env_encoding), torch.tensor(next_req.get_encoding(self.nodes))])
        else:
            return torch.tensor(env_encoding), torch.tensor(next_req.get_encoding(self.nodes)), next_req
    
    def create_requests(self):
        requests = []
        
        for request_type in self.request_blueprints:
            arrival_times = []
            service_times = []
            last_arrival = 0
        
            while last_arrival < self.episode_timesteps: # we want to generate requests till we reach episode end
                last_arrival += np.random.exponential(request_type.arrival_rate)
                arrival_times.append(last_arrival)
                                
            for _ in arrival_times:
                service_times.append(np.random.exponential(request_type.service_rate))
                
            for arrival_time, service_time in zip(arrival_times, service_times):
                # start creating requests
                
                new_request = Request(request_type.type, service_time, arrival_time, request_type.source, request_type.sink, 
                                      request_type.bw[0], request_type.distribution, bw_dist=request_type.bw, request_type_template=request_type)
                requests.append(new_request)
                
                if request_type.type == "elastic": 
                    # we will start with the first bandwidth element as starting bw
                    # WE ASSUME that bw[0] < bw[1]
                    timesteps_from_deployment = 0
                    current_bw = request_type.bw[0]
                    while timesteps_from_deployment < service_time:
                        if current_bw == request_type.bw[0]:
                            # we want to generate a scale request to increase bw
                            scale_bw = request_type.bw[1] - current_bw
                            scale_service_time = np.random.exponential(request_type.switch_rate[1])
                            scale_request = Request("scale", scale_service_time, \
                                                    arrival_time + timesteps_from_deployment, request_type.source, \
                                                   request_type.sink, scale_bw, parent_elastic=new_request,
                                                   request_type_template=request_type)
                            requests.append(scale_request)
                            new_request.add_scale_request(scale_request)
                            
                            timesteps_from_deployment += scale_service_time
                            current_bw = request_type.bw[1] # request_type.bw[0] + scale_bw
                        elif current_bw == request_type.bw[1]:
                            # we want to go to lower bw and spend some time there
                            time_spent_on_lower_bw = np.random.exponential(request_type.switch_rate[0])
                            timesteps_from_deployment += time_spent_on_lower_bw
                            current_bw = request_type.bw[0]
                            
        # sort requests by arrival time
        requests.sort(key=lambda x: x.arrival_time)
        return requests
    
    def search(self, source, dest, visited_a, paths):
        visited_a.append(source)
        # print(visited_a)

        for link in set(env.links.values()):
            visited = visited_a.copy()
            if source in link.nodes:
                if dest in link.nodes:
                    visited.append(dest)
                    paths.append(visited)

                x = link.nodes.copy()
                x.remove(source)
                if x[0] not in visited:
                    self.search(x[0], dest, visited.copy(), paths)
        return paths
    
    def print_statistics(self):
        for req_type in self.request_blueprints:
            print(req_type.source + " | " +
                   req_type.sink + " | " + 
                 "BW: " + str(req_type.bw) + " | " +
                 "Arrival rate: " + str(req_type.arrival_rate) + " | " +
                 "Acceptance rate: " + str(req_type.num_accepted / req_type.num_made))

In [5]:
env = Environment(["a", "b", "c", "d", "e", "f"], [["a", "b", 10], ["a", "c", 10], ["b", "d", 10], \
                                                   ["c", "d", 20], ["c", "e", 10], ["d", "f", 10], \
                                                   ["e", "f", 10]], \
                  [RequestType("static", [2], 0.5, 0.75, "a", "b", [1]), \
                  RequestType("static", [8], 1, 1.5, "a", "b", [1]), \
                  RequestType("elastic", [4, 9], 1, 1.5, "a", "b", [0.8, 0.2], switch_rate=[0.08, 0.02]), \
                  RequestType("static", [1], 1, 1.5, "c", "d", [1]), \
                  RequestType("static", [7], 0.5, 0.75, "c", "d", [1]), \
                  RequestType("elastic", [3, 13], 2, 3, "c", "d", [0.9, 0.1], switch_rate=[0.09, 0.01]), \
                  RequestType("static", [3], 0.5, 0.75, "e", "f", [1]), \
                   RequestType("static", [6], 1, 1.5, "e", "f", [1]), \
                    RequestType("elastic", [5, 8], 2, 3, "e", "f", [0.7, 0.3], switch_rate=[0.07, 0.03])],
                 sb3_compat=True)


                # self, request_type, bandwidth, service_rate, arrival_rate, source, sink, distribution, switch_rate=None

### Choose shortest viable path

In [None]:
def policy(env_encoding, next_req_encoding, next_req_obj):    
    # find all paths between source and sink
    paths = (env.search(next_req_obj.source, next_req_obj.sink, [], []))
    paths.sort(key=lambda x: len(x)) # sort by shortest path
    selection = 0
    for path in paths:
        # check if this path works
        works = True
        nodes = [[path[i], path[i + 1]] for i in range(len(path) - 1)]
        for node_pair in nodes:
            if env.links[node_pair[0] + node_pair[1]].remaining_bw() < next_req_obj.bw:
                works = False
                
        if works:
            selection = paths.index(path)
            selection_one_hot = nn.functional.one_hot(torch.tensor([selection]), num_classes=3).flatten()
            next_req_obj.path = path
            return torch.cat([torch.tensor([1]), selection_one_hot])
        
    return torch.cat([torch.tensor([0]), torch.tensor([0,0,0])])

In [None]:
total_reward = 0
env_encoding, next_req_encoding, next_req_obj = env.reset()
done = False

while not done:
    decision = policy(env_encoding, next_req_encoding, next_req_obj)
    
    obs, reward, done, info = env.step(next_req_obj, decision)
    env_encoding, next_req_encoding, next_req_obj = obs
    
    total_reward += reward

print(total_reward)

In [None]:
env.print_statistics()

### PPO

In [6]:
model = PPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=600000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  return torch.cat([torch.tensor(env_encoding), torch.tensor(next_req.get_encoding(self.nodes))])


-----------------------------
| time/              |      |
|    fps             | 1789 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1481         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0090737585 |
|    clip_fraction        | 0.0643       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.64        |
|    explained_variance   | -0.0021      |
|    learning_rate        | 0.0003       |
|    loss                 | 1.95e+03     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0122      |
|    std                  | 0.986        |
|    value_loss           | 7.21e+03     |
----------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.74e+04    |
|    ep_rew_mean          | -4.8e+04    |
| time/                   |             |
|    fps                  | 1087        |
|    iterations           | 12          |
|    time_elapsed         | 22          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.048880793 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.65       |
|    explained_variance   | -8.34e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.4e+03     |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00271    |
|    std                  | 0.997       |
|    value_loss           | 2.76e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.79e+04     |
|    ep_rew_mean          | -4.78e+04    |
| time/                   |              |
|    fps                  | 1016         |
|    iterations           | 21           |
|    time_elapsed         | 42           |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0060024885 |
|    clip_fraction        | 0.0473       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.67        |
|    explained_variance   | -3.58e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 7.14e+03     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00351     |
|    std                  | 0.999        |
|    value_loss           | 1.63e+04     |
------------------------------------------
-----------------------------------------
| rollout/  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.76e+04     |
|    ep_rew_mean          | -4.56e+04    |
| time/                   |              |
|    fps                  | 1019         |
|    iterations           | 30           |
|    time_elapsed         | 60           |
|    total_timesteps      | 61440        |
| train/                  |              |
|    approx_kl            | 0.0031201765 |
|    clip_fraction        | 0.00532      |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.66        |
|    explained_variance   | 0.0633       |
|    learning_rate        | 0.0003       |
|    loss                 | 4.59e+03     |
|    n_updates            | 290          |
|    policy_gradient_loss | -0.00317     |
|    std                  | 0.997        |
|    value_loss           | 1.33e+04     |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.77e+04    |
|    ep_rew_mean          | -4.24e+04   |
| time/                   |             |
|    fps                  | 1036        |
|    iterations           | 39          |
|    time_elapsed         | 77          |
|    total_timesteps      | 79872       |
| train/                  |             |
|    approx_kl            | 0.006341949 |
|    clip_fraction        | 0.0429      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.64       |
|    explained_variance   | 0.0304      |
|    learning_rate        | 0.0003      |
|    loss                 | 1.29e+03    |
|    n_updates            | 380         |
|    policy_gradient_loss | -0.00732    |
|    std                  | 0.992       |
|    value_loss           | 8.18e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.78e+04    |
|    ep_rew_mean          | -4.02e+04   |
| time/                   |             |
|    fps                  | 1041        |
|    iterations           | 48          |
|    time_elapsed         | 94          |
|    total_timesteps      | 98304       |
| train/                  |             |
|    approx_kl            | 0.007325098 |
|    clip_fraction        | 0.0555      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.6        |
|    explained_variance   | 0.119       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.33e+03    |
|    n_updates            | 470         |
|    policy_gradient_loss | -0.0113     |
|    std                  | 0.984       |
|    value_loss           | 2.08e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.77e+04    |
|    ep_rew_mean          | -3.58e+04   |
| time/                   |             |
|    fps                  | 1049        |
|    iterations           | 57          |
|    time_elapsed         | 111         |
|    total_timesteps      | 116736      |
| train/                  |             |
|    approx_kl            | 0.007209536 |
|    clip_fraction        | 0.0618      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.57       |
|    explained_variance   | 0.197       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.2e+03     |
|    n_updates            | 560         |
|    policy_gradient_loss | -0.0112     |
|    std                  | 0.974       |
|    value_loss           | 5.05e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.77e+04    |
|    ep_rew_mean          | -3.28e+04   |
| time/                   |             |
|    fps                  | 1047        |
|    iterations           | 66          |
|    time_elapsed         | 129         |
|    total_timesteps      | 135168      |
| train/                  |             |
|    approx_kl            | 0.008146528 |
|    clip_fraction        | 0.08        |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.48       |
|    explained_variance   | 0.177       |
|    learning_rate        | 0.0003      |
|    loss                 | 866         |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.012      |
|    std                  | 0.953       |
|    value_loss           | 1.68e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.75e+04    |
|    ep_rew_mean          | -2.91e+04   |
| time/                   |             |
|    fps                  | 1038        |
|    iterations           | 75          |
|    time_elapsed         | 147         |
|    total_timesteps      | 153600      |
| train/                  |             |
|    approx_kl            | 0.008563705 |
|    clip_fraction        | 0.0834      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.43       |
|    explained_variance   | 0.253       |
|    learning_rate        | 0.0003      |
|    loss                 | 419         |
|    n_updates            | 740         |
|    policy_gradient_loss | -0.0115     |
|    std                  | 0.941       |
|    value_loss           | 1.4e+03     |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.74e+04     |
|    ep_rew_mean          | -2.66e+04    |
| time/                   |              |
|    fps                  | 1034         |
|    iterations           | 84           |
|    time_elapsed         | 166          |
|    total_timesteps      | 172032       |
| train/                  |              |
|    approx_kl            | 0.0047636284 |
|    clip_fraction        | 0.0269       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.32        |
|    explained_variance   | 0.000851     |
|    learning_rate        | 0.0003       |
|    loss                 | 5.47e+03     |
|    n_updates            | 830          |
|    policy_gradient_loss | -0.00551     |
|    std                  | 0.915        |
|    value_loss           | 1.89e+04     |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -2.06e+04   |
| time/                   |             |
|    fps                  | 1028        |
|    iterations           | 93          |
|    time_elapsed         | 185         |
|    total_timesteps      | 190464      |
| train/                  |             |
|    approx_kl            | 0.007105916 |
|    clip_fraction        | 0.0621      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.24       |
|    explained_variance   | 0.223       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.32e+03    |
|    n_updates            | 920         |
|    policy_gradient_loss | -0.00867    |
|    std                  | 0.9         |
|    value_loss           | 4.99e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -1.83e+04   |
| time/                   |             |
|    fps                  | 1024        |
|    iterations           | 102         |
|    time_elapsed         | 203         |
|    total_timesteps      | 208896      |
| train/                  |             |
|    approx_kl            | 0.007940619 |
|    clip_fraction        | 0.0896      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.17       |
|    explained_variance   | 0.177       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.12e+03    |
|    n_updates            | 1010        |
|    policy_gradient_loss | -0.0103     |
|    std                  | 0.885       |
|    value_loss           | 4.32e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -1.62e+04   |
| time/                   |             |
|    fps                  | 1024        |
|    iterations           | 111         |
|    time_elapsed         | 221         |
|    total_timesteps      | 227328      |
| train/                  |             |
|    approx_kl            | 0.012335058 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.03       |
|    explained_variance   | 0.149       |
|    learning_rate        | 0.0003      |
|    loss                 | 804         |
|    n_updates            | 1100        |
|    policy_gradient_loss | -0.0125     |
|    std                  | 0.855       |
|    value_loss           | 2.09e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -1.43e+04   |
| time/                   |             |
|    fps                  | 1023        |
|    iterations           | 120         |
|    time_elapsed         | 240         |
|    total_timesteps      | 245760      |
| train/                  |             |
|    approx_kl            | 0.009453473 |
|    clip_fraction        | 0.0876      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.93       |
|    explained_variance   | 0.167       |
|    learning_rate        | 0.0003      |
|    loss                 | 6.72e+03    |
|    n_updates            | 1190        |
|    policy_gradient_loss | -0.0109     |
|    std                  | 0.835       |
|    value_loss           | 1.07e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.71e+04    |
|    ep_rew_mean          | -1.23e+04   |
| time/                   |             |
|    fps                  | 1023        |
|    iterations           | 129         |
|    time_elapsed         | 258         |
|    total_timesteps      | 264192      |
| train/                  |             |
|    approx_kl            | 0.012607036 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.7        |
|    explained_variance   | 0.342       |
|    learning_rate        | 0.0003      |
|    loss                 | 279         |
|    n_updates            | 1280        |
|    policy_gradient_loss | -0.0157     |
|    std                  | 0.789       |
|    value_loss           | 632         |
-----------------------------------------
-----------------------------------------
| rollout/                |       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.7e+04      |
|    ep_rew_mean          | -1.04e+04    |
| time/                   |              |
|    fps                  | 1019         |
|    iterations           | 138          |
|    time_elapsed         | 277          |
|    total_timesteps      | 282624       |
| train/                  |              |
|    approx_kl            | 0.0037485238 |
|    clip_fraction        | 0.0323       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.57        |
|    explained_variance   | 0.172        |
|    learning_rate        | 0.0003       |
|    loss                 | 441          |
|    n_updates            | 1370         |
|    policy_gradient_loss | -0.00604     |
|    std                  | 0.77         |
|    value_loss           | 2.95e+03     |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.7e+04     |
|    ep_rew_mean          | -9e+03      |
| time/                   |             |
|    fps                  | 1014        |
|    iterations           | 147         |
|    time_elapsed         | 296         |
|    total_timesteps      | 301056      |
| train/                  |             |
|    approx_kl            | 0.014728146 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.47       |
|    explained_variance   | 0.544       |
|    learning_rate        | 0.0003      |
|    loss                 | 439         |
|    n_updates            | 1460        |
|    policy_gradient_loss | -0.0181     |
|    std                  | 0.748       |
|    value_loss           | 952         |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.7e+04     |
|    ep_rew_mean          | -7.68e+03   |
| time/                   |             |
|    fps                  | 997         |
|    iterations           | 156         |
|    time_elapsed         | 320         |
|    total_timesteps      | 319488      |
| train/                  |             |
|    approx_kl            | 0.009899649 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.43       |
|    explained_variance   | 0.323       |
|    learning_rate        | 0.0003      |
|    loss                 | 634         |
|    n_updates            | 1550        |
|    policy_gradient_loss | -0.0146     |
|    std                  | 0.743       |
|    value_loss           | 1.64e+03    |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.7e+04     |
|    ep_rew_mean          | -7.31e+03   |
| time/                   |             |
|    fps                  | 988         |
|    iterations           | 165         |
|    time_elapsed         | 341         |
|    total_timesteps      | 337920      |
| train/                  |             |
|    approx_kl            | 0.012614315 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.29       |
|    explained_variance   | 0.12        |
|    learning_rate        | 0.0003      |
|    loss                 | 615         |
|    n_updates            | 1640        |
|    policy_gradient_loss | -0.0187     |
|    std                  | 0.715       |
|    value_loss           | 2.26e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.71e+04   |
|    ep_rew_mean          | -6.06e+03  |
| time/                   |            |
|    fps                  | 986        |
|    iterations           | 174        |
|    time_elapsed         | 361        |
|    total_timesteps      | 356352     |
| train/                  |            |
|    approx_kl            | 0.01721825 |
|    clip_fraction        | 0.182      |
|    clip_range           | 0.2        |
|    entropy_loss         | -4.17      |
|    explained_variance   | 0.494      |
|    learning_rate        | 0.0003     |
|    loss                 | 230        |
|    n_updates            | 1730       |
|    policy_gradient_loss | -0.0144    |
|    std                  | 0.697      |
|    value_loss           | 666        |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_me

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.71e+04    |
|    ep_rew_mean          | -4.73e+03   |
| time/                   |             |
|    fps                  | 979         |
|    iterations           | 183         |
|    time_elapsed         | 382         |
|    total_timesteps      | 374784      |
| train/                  |             |
|    approx_kl            | 0.012060663 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.09       |
|    explained_variance   | 0.254       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.77e+03    |
|    n_updates            | 1820        |
|    policy_gradient_loss | -0.0135     |
|    std                  | 0.685       |
|    value_loss           | 3.12e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.71e+04    |
|    ep_rew_mean          | -3.9e+03    |
| time/                   |             |
|    fps                  | 977         |
|    iterations           | 192         |
|    time_elapsed         | 402         |
|    total_timesteps      | 393216      |
| train/                  |             |
|    approx_kl            | 0.018630851 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.94       |
|    explained_variance   | 0.421       |
|    learning_rate        | 0.0003      |
|    loss                 | 311         |
|    n_updates            | 1910        |
|    policy_gradient_loss | -0.0114     |
|    std                  | 0.66        |
|    value_loss           | 996         |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.71e+04    |
|    ep_rew_mean          | -2.88e+03   |
| time/                   |             |
|    fps                  | 978         |
|    iterations           | 201         |
|    time_elapsed         | 420         |
|    total_timesteps      | 411648      |
| train/                  |             |
|    approx_kl            | 0.010061822 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.0891      |
|    learning_rate        | 0.0003      |
|    loss                 | 3.33e+03    |
|    n_updates            | 2000        |
|    policy_gradient_loss | -0.00388    |
|    std                  | 0.636       |
|    value_loss           | 3.44e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -1.13e+03   |
| time/                   |             |
|    fps                  | 982         |
|    iterations           | 210         |
|    time_elapsed         | 437         |
|    total_timesteps      | 430080      |
| train/                  |             |
|    approx_kl            | 0.014563609 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.68       |
|    explained_variance   | 0.157       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.35e+03    |
|    n_updates            | 2090        |
|    policy_gradient_loss | -0.0146     |
|    std                  | 0.62        |
|    value_loss           | 5.12e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | -291        |
| time/                   |             |
|    fps                  | 986         |
|    iterations           | 219         |
|    time_elapsed         | 454         |
|    total_timesteps      | 448512      |
| train/                  |             |
|    approx_kl            | 0.013243984 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.51       |
|    explained_variance   | 0.551       |
|    learning_rate        | 0.0003      |
|    loss                 | 460         |
|    n_updates            | 2180        |
|    policy_gradient_loss | -0.0162     |
|    std                  | 0.59        |
|    value_loss           | 1.23e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 618         |
| time/                   |             |
|    fps                  | 990         |
|    iterations           | 228         |
|    time_elapsed         | 471         |
|    total_timesteps      | 466944      |
| train/                  |             |
|    approx_kl            | 0.012868851 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.3        |
|    explained_variance   | 0.25        |
|    learning_rate        | 0.0003      |
|    loss                 | 148         |
|    n_updates            | 2270        |
|    policy_gradient_loss | -0.00827    |
|    std                  | 0.56        |
|    value_loss           | 1.01e+03    |
-----------------------------------------
------------------------------------------
| rollout/                |      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 1.03e+03    |
| time/                   |             |
|    fps                  | 990         |
|    iterations           | 237         |
|    time_elapsed         | 490         |
|    total_timesteps      | 485376      |
| train/                  |             |
|    approx_kl            | 0.018718215 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.28       |
|    explained_variance   | 0.331       |
|    learning_rate        | 0.0003      |
|    loss                 | 415         |
|    n_updates            | 2360        |
|    policy_gradient_loss | -0.0178     |
|    std                  | 0.561       |
|    value_loss           | 1.25e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 1.76e+03    |
| time/                   |             |
|    fps                  | 989         |
|    iterations           | 246         |
|    time_elapsed         | 509         |
|    total_timesteps      | 503808      |
| train/                  |             |
|    approx_kl            | 0.021527331 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.21       |
|    explained_variance   | 0.365       |
|    learning_rate        | 0.0003      |
|    loss                 | 490         |
|    n_updates            | 2450        |
|    policy_gradient_loss | -0.0153     |
|    std                  | 0.553       |
|    value_loss           | 1.17e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 2.28e+03    |
| time/                   |             |
|    fps                  | 989         |
|    iterations           | 255         |
|    time_elapsed         | 527         |
|    total_timesteps      | 522240      |
| train/                  |             |
|    approx_kl            | 0.009567765 |
|    clip_fraction        | 0.0831      |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.15       |
|    explained_variance   | 0.274       |
|    learning_rate        | 0.0003      |
|    loss                 | 7.9e+03     |
|    n_updates            | 2540        |
|    policy_gradient_loss | -0.0081     |
|    std                  | 0.545       |
|    value_loss           | 1.54e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 2.82e+03    |
| time/                   |             |
|    fps                  | 989         |
|    iterations           | 264         |
|    time_elapsed         | 546         |
|    total_timesteps      | 540672      |
| train/                  |             |
|    approx_kl            | 0.019911189 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.2         |
|    entropy_loss         | -3          |
|    explained_variance   | 0.197       |
|    learning_rate        | 0.0003      |
|    loss                 | 648         |
|    n_updates            | 2630        |
|    policy_gradient_loss | -0.0136     |
|    std                  | 0.527       |
|    value_loss           | 1.53e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04    |
|    ep_rew_mean          | 3.42e+03    |
| time/                   |             |
|    fps                  | 988         |
|    iterations           | 273         |
|    time_elapsed         | 565         |
|    total_timesteps      | 559104      |
| train/                  |             |
|    approx_kl            | 0.021495178 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.82       |
|    explained_variance   | 0.196       |
|    learning_rate        | 0.0003      |
|    loss                 | 393         |
|    n_updates            | 2720        |
|    policy_gradient_loss | -0.0158     |
|    std                  | 0.504       |
|    value_loss           | 1.04e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |       

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.72e+04   |
|    ep_rew_mean          | 3.92e+03   |
| time/                   |            |
|    fps                  | 986        |
|    iterations           | 282        |
|    time_elapsed         | 585        |
|    total_timesteps      | 577536     |
| train/                  |            |
|    approx_kl            | 0.02076364 |
|    clip_fraction        | 0.235      |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.74      |
|    explained_variance   | 0.211      |
|    learning_rate        | 0.0003     |
|    loss                 | 319        |
|    n_updates            | 2810       |
|    policy_gradient_loss | -0.0158    |
|    std                  | 0.496      |
|    value_loss           | 771        |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_me

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.72e+04  |
|    ep_rew_mean          | 4.53e+03  |
| time/                   |           |
|    fps                  | 986       |
|    iterations           | 291       |
|    time_elapsed         | 603       |
|    total_timesteps      | 595968    |
| train/                  |           |
|    approx_kl            | 0.0181052 |
|    clip_fraction        | 0.215     |
|    clip_range           | 0.2       |
|    entropy_loss         | -2.54     |
|    explained_variance   | 0.251     |
|    learning_rate        | 0.0003    |
|    loss                 | 230       |
|    n_updates            | 2900      |
|    policy_gradient_loss | -0.0128   |
|    std                  | 0.474     |
|    value_loss           | 845       |
---------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.72e+04

<stable_baselines3.ppo.ppo.PPO at 0x136aefa00>

In [7]:
total_reward = 0
observation = env.reset()
done = False
while not done:
    action, _states = model.predict(observation)
        
    observation, reward, done, info = env.step(action)
    total_reward += reward
    
print(total_reward)

  return torch.cat([torch.tensor(env_encoding), torch.tensor(next_req.get_encoding(self.nodes))])


20951.978287932187
