In [1]:
## VISSIM Modules
import win32com.client as com
import os

## RL Modules
import tensorflow as tf

from tensorflow.keras.models import load_model
    
## Data Management Modules
import pickle

## User Defined Modules
import math
import Simulator_Functions as SF

from Actor_Critic_Agents import ACAgent



from NParser import NetworkParser
from COMServer import COMServerDispatch, COMServerReload
from TupleToList import toList
from Utilities import log_progress, pltlive
## Other Modules
import numpy as np
import random
import matplotlib.pyplot as plt
import PER

%matplotlib inline

In [2]:
## RL hyperparameters

# Training length: number of simulated episodes, and how often (in episodes)
# a partial save of the agents is written.
episodes = 400
partial_save_at = 25  # 50 was used previously

# Feature switches
Surtrac = False
AC = True
PER_activated = False

# Learning settings
alpha = 5e-7
gamma = 0.85    # 0.99 was also tried
entropy = 1e-6  # exploration
value = 5       # weight attributed to the value loss during gradient descent (0.5 was also tried)

# Value check: number of sampled states, and the horizon, i.e. the number of
# steps forward used to compute the return.
n_sample = 10
horizon = 100

# Intended to reduce the entropy during training (not implemented yet)
reduce_entropy = False
reduce_entropy_every = 1000

# Number of steps in the n-step learning (16 and 32 were also noted).
# Reported not to work when n_step_size is over 31.
n_step_size = 8

# Simulation timing. The length used for random population is a multiple of an episode.
# NOTE(review): the original note said "1 timestep = 0.1 sec", which does not match
# timesteps_per_second = 1 -- confirm which one is intended.
timesteps_per_second = 1
seconds_per_green = 6
seconds_per_yellow = 3
simulation_length = 60 * 60 + 1  # also worked with 2400

## State-Action Parameters
action_type = "phases"   # options are "phases" and "programs"
state_size = (2, 4, 22)  # earlier notes: 5 (4 queues, or 5 queues + signal state), 49, 53, (1,8,6) for conv
action_size = 2

# Demand schedule (times in seconds, demand in cars/hour as PPP); 600 also worked
# as the change interval.
demand_change_timesteps = 450
demand = dict(h=600, m=300, l=150)
# One [first, second] demand pair per schedule entry, cycling through the
# low / medium / high combinations in a fixed order.
demand_list = [
    [demand[first], demand[second]]
    for first, second in (
        ('l', 'l'), ('m', 'l'), ('h', 'l'), ('h', 'm'),
        ('h', 'h'), ('m', 'h'), ('l', 'h'), ('l', 'm'),
    )
]


In [3]:
## Operation mode (selects functionalities)
#   "training"   = training agents, maximum speed, frozen UI, mid amount of messages
#   "retraining" = continue the training of a previous agent
#   "debug"      = trains for 1 episode, minimum speed, working UI, all messages
#   "demo"       = loads pretrained agent, minimum speed, working UI
#   "test"       = executes evaluation, maximum speed
mode = "training"

## Network Model Parameters
# Known model names: 'Single_Cross_Straight', 'Single_Cross_Triple',
# 'Single_Cross_Triple_Mod', 'Single_Cross_Mod2', 'Balance'
model_name = 'Single_Cross_Mod2'

# NOTE(review): machine-specific absolute path; it has to be adapted on any other computer.
vissim_working_directory = 'C:\\Users\\Rzhang\\Desktop\\MLforFlowOptimisationOrigine\\Vissim\\'

agent_type = 'AC'  # DQN, DuelingDQN, DDQN, DuelingDDQN, AC
reward_type = 'Queues'

# State types noted by the author: 'Queues', 'Delays', 'QueuesDifference',
# 'QueuesSpeedavrOccuperateSig', 'QueuesSig', 'CellsSpeedSig', 'CellsSpeedOccSig',
# 'CellsOccSig', 'CellsT'
state_type = 'CellsT'
Random_Seed = 42

## Use of additional files?
flag_read_additionally = True
SaveResultsAgent = True

# Random demand
Random_Demand = False

# Whether the best agent is the one loaded (forwarded as `best` to SF.load_agents)
best = True

# Session ID, including the state type
Session_ID = 'Ep_{}_A_{}_State_{}_Act_{}_Rew_{}'.format(
    episodes, agent_type, state_type, action_type, reward_type)
print(Session_ID)

# Mode-specific overrides of the simulation settings
if mode == 'demo':
    # A single low/low demand entry held for the whole run
    simulation_length = 3601
    demand_list = [[demand['l'], demand['l']]]
    demand_change_timesteps = simulation_length
elif mode == 'test':
    # Evaluation uses its own demand levels and its own seed
    simulation_length = 3601
    demand_change_timesteps = 450
    demand = {"h": 800, 'm': 400, 'l': 200}
    demand_list = [
        [demand[first], demand[second]]
        for first, second in (
            ('l', 'l'), ('m', 'l'), ('h', 'l'), ('h', 'm'),
            ('h', 'h'), ('m', 'h'), ('l', 'h'), ('l', 'm'),
        )
    ]
    Random_Seed = 1
    # NOTE(review): the original comment here read "Loading the best agent" although
    # the flag is switched off -- confirm which checkpoint the test run should load.
    best = False


Ep_400_A_AC_State_CellsT_Act_phases_Rew_Queues


In [4]:
# TODO: find a way to reduce the entropy over time (entropy = exploration).

## Converging network
# - reward queue, state queue
# converging with updates every steps and entropy = 0.00001 and 1 core layer of 42
# converging well with updates every steps and entropy = 0.00001 and no core

# - reward queue state queues + sig
# converging well with updates every steps and entropy = 0.00001 and no core

# 64 is a good number

if __name__ == "__main__":
    # Entry point of the notebook: dispatches the VISSIM COM server, deploys the
    # agents and then either runs a single demo/debug/test episode or the full
    # training loop, depending on `mode`.
    #
    # Everything after the COM server dispatch runs inside try/finally so that the
    # COM handle is released even when the run aborts (for instance when
    # COMServerReload gives up after its retries).

    # Initialize storage
    reward_storage = []
    best_agent_weights = []
    best_agent_memory = []
    reward_plot = np.zeros([episodes,])
    loss_plot = np.zeros([episodes,])

    # Initialize simulation
    Vissim, Simulation, Network, cache_flag = COMServerDispatch(model_name, vissim_working_directory,
                                                                simulation_length, timesteps_per_second,
                                                                delete_results = True, verbose = True)
    try:
        SF.Select_Vissim_Mode(Vissim, mode)

        runflag = True
        # Setting Random Seed
        Vissim.Simulation.SetAttValue('RandSeed', Random_Seed)
        print ('Random seed set in simulator. Random Seed = '+str(Random_Seed))

        # Deploy Network Parser (crawl network)
        npa = NetworkParser(Vissim)
        print('NetworkParser has succesfully crawled the model network.')

        # Initialize agents: one agent per signal controller found by the parser
        if agent_type in ['AC'] :
            Agents = [ACAgent(state_size, action_size, ID, state_type, npa, n_step_size,
                              gamma, alpha, entropy, value, Vissim)
                      for ID in npa.signal_controllers_ids]
            for agent in Agents:
                # Called once to initialise the computational graph of the model
                # (the author notes there is probably a better way to do this).
                agent.test()
            agents_deployed = True
        else:
            print("Incorrect Agent Class selected. Deployment could not be completed.")
            quit()
        if agents_deployed:
            print("Deployed {} agent(s) of the Class {}.".format(len(Agents), agent_type))

        ## EXECUTION OF A DEMONSTRATION RUN (slow, choice of best available agent)
        if mode in ("demo", "populate", "debug", "test"):
            if mode in ("demo", "debug"):
                # Demo or debug: slow simulation
                timesteps_per_second = 10
                Vissim.Simulation.SetAttValue('SimRes', timesteps_per_second)
            else:
                # Memory population or test mode: quick simulation
                SF.Set_Quickmode(Vissim, timesteps_per_second)

            # On a test or a demo, load the saved agent and set exploration to zero
            if mode in ("demo", "test"):
                Agents, reward_storage = SF.load_agents(vissim_working_directory, model_name, Agents,
                                                        Session_ID, best = best)
                for agent in Agents:
                    agent.epsilon = 0

            # Run the episode. "populate" only configures the simulator and runs nothing;
            # the other three modes share exactly the same call.
            if mode != "populate":
                SF.run_simulation_episode(Agents, Vissim, state_type, reward_type, state_size, simulation_length,
                                          timesteps_per_second, seconds_per_green, seconds_per_yellow,
                                          demand_list, demand_change_timesteps, mode, PER_activated)

        ## EXECUTION OF THE NORMAL TRAINING LOOP
        elif mode in ("training", "retraining"):
            print("Training")
            # NOTE(review): nothing in this cell loads a previously saved agent when
            # mode == "retraining"; confirm whether that happens elsewhere.

            try:
                # Iterations of the simulation
                for episode in log_progress(range(episodes), every=1):

                    # Reload map if it has already been run (previous episode or prepopulation).
                    # runflag is always True here, so the map is reloaded on every episode.
                    if episode != 0 or runflag:
                        Simulation, Network = COMServerReload(Vissim, model_name, vissim_working_directory,
                                                              simulation_length, timesteps_per_second,
                                                              delete_results = True)

                    # Change the random seed
                    Random_Seed += 1
                    Vissim.Simulation.SetAttValue('RandSeed', Random_Seed)

                    # Run Episode at maximum speed
                    SF.Select_Vissim_Mode(Vissim, mode)
                    SF.run_simulation_episode(Agents, Vissim, state_type, reward_type, state_size,
                                              simulation_length, timesteps_per_second,
                                              seconds_per_green, seconds_per_yellow,
                                              demand_list, demand_change_timesteps, mode,
                                              PER_activated, Surtrac = Surtrac)

                    # Calculate episode average reward and keep track of the best agent
                    reward_storage, average_reward = SF.average_reward(reward_storage, Agents, episode, episodes)
                    best_agent_weights, best_agent_memory = SF.best_agent(reward_storage, average_reward,
                                                                          best_agent_weights, best_agent_memory,
                                                                          vissim_working_directory, model_name,
                                                                          Agents, Session_ID)

                    # Diagnostic printout: predicted values against returns, plus the logits
                    for index, agent in enumerate(Agents):
                        predicted_values, true_values, logit0, logits = agent.value_check(horizon, n_sample)
                        print ("Agent {} : Predicted Values and True Return : \n {} \n {}" .format(index, predicted_values, true_values))
                        print ("Agent {} : Logits on those states : \n {}" .format(index, logits))
                        print ("Agent {} : Logits on the 0 state : \n {}" .format(index, logit0))

                    # Security save for long trainings
                    if SaveResultsAgent and (episode+1)%partial_save_at == 0:
                        SF.save_agents(vissim_working_directory, model_name, Agents, Session_ID, reward_storage)
                        print('Saved Partial results at the end of episode {}.'.format(episode+1))

                    # Placeholder for reducing the entropy of the actor_critic (not implemented).
                    if reduce_entropy:
                        pass
            except Exception:
                # The run aborted mid-training. Best-effort save of what was learned since
                # the last partial save; the original error is always re-raised.
                if SaveResultsAgent and len(reward_storage) > 0:
                    try:
                        SF.save_agents(vissim_working_directory, model_name, Agents, Session_ID, reward_storage)
                        print('Training interrupted. Saved agents and results of the completed episodes.')
                    except Exception as save_error:
                        print('Training interrupted and the emergency save failed: {}'.format(save_error))
                raise

            # Saving agents memory, weights and optimizer
            if SaveResultsAgent:
                SF.save_agents(vissim_working_directory, model_name, Agents, Session_ID, reward_storage)
                # Adjacent literals instead of a backslash inside the string, which used to
                # embed the source indentation in the printed message.
                print("Model, architecture, weights, optimizer, memory and training results succesfully saved. "
                      "Succesfully Terminated.")

        else:
            print("ERROR: Mode selected not recognized. TERMINATING.")
    finally:
        # Close Vissim: drop the reference to the COM object on success and on failure alike
        Vissim = None

Working Directory set to: C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\Vissim\
Generating Cache...
Cache generated.

****************************
*   COM Server dispatched  *
****************************

Attempting to load Model File: Single_Cross_Mod2.inpx ...
Load process successful
Simulation length set to 3601 seconds.
Results from Previous Simulations: Deleted. Fresh Start Available.
Fetched and containerized Simulation Object
Fetched and containerized Network Object 

*******************************************************
*                                                     *
*                 SETUP COMPLETE                      *
*                                                     *
*******************************************************

Random seed set in simulator. Random Seed = 42
NetworkParser has succesfully crawled the model network.
Deploying instance of Actor_Critic Agent(s) !!! TENSORFLOW 2 IS NEEDED !!! 
Model: "modelconv"
________________________________

VBox(children=(HTML(value=''), IntProgress(value=0, max=400)))

Episode: 1/400, Epsilon:0, Average reward: -65.38
Saving architecture, weights, optimizer state for best agent-0
New best agent found. Saved in C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\Vissim\Single_Cross_Mod2\Agents_Results\Ep_400_A_AC_State_CellsT_Act_phases_Rew_Queues\BestAgent0_Memory.p
Dumping Training Results into pickle file.
Dumping Loss Results into pickle file.
Agent 0 : Predicted Values and True Return : 
 [-0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0] 
 [-206.0, -1225.0, -407.0, -175.0, -616.0, -323.0, -176.0, -345.0, -1100.0, -37.0]
Agent 0 : Logits on those states : 
 [[-0.51, 0.31], [-0.01, 0.06], [0.09, -0.39], [0.23, -0.14], [-0.14, 0.36], [0.0, -0.0], [0.01, 0.23], [0.05, -0.1], [0.59, -0.53], [0.01, -0.01]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 2/400, Epsilon:0, Average reward: -56.24
Saving architecture, weights, optimizer state for best agent-0
New best agent found. Saved in C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\

Episode: 17/400, Epsilon:0, Average reward: -60.67
Agent 0 : Predicted Values and True Return : 
 [-1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0] 
 [-682.0, -56.0, -391.0, -56.0, -264.0, -299.0, -1110.0, -162.0, -62.0, -760.0]
Agent 0 : Logits on those states : 
 [[-0.1, 0.15], [-0.19, -0.11], [0.13, -0.18], [0.24, -0.26], [-0.12, 0.06], [0.84, -0.64], [0.12, 0.08], [0.04, 0.07], [-0.03, -0.19], [-0.27, -0.12]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 18/400, Epsilon:0, Average reward: -67.58
Agent 0 : Predicted Values and True Return : 
 [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0] 
 [-827.0, -55.0, -721.0, -55.0, -58.0, -1368.0, -57.0, -67.0, -281.0, -56.0]
Agent 0 : Logits on those states : 
 [[0.17, -0.25], [-0.1, 0.03], [0.56, -0.43], [0.09, -0.07], [0.01, -0.0], [0.68, -0.45], [-0.5, 0.11], [0.38, -0.25], [0.06, -0.01], [0.32, -0.32]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 19/400, Epsilon:0, Average reward: -46.7
Agent 0 

Episode: 34/400, Epsilon:0, Average reward: -55.12
Agent 0 : Predicted Values and True Return : 
 [-2.0, -2.0, -3.0, -1.0, -0.0, -1.0, -1.0, -0.0, -2.0, -1.0] 
 [-876.0, -486.0, -516.0, -27.0, -32.0, -479.0, -615.0, -290.0, -112.0, -541.0]
Agent 0 : Logits on those states : 
 [[0.07, -0.04], [-0.14, 0.27], [0.09, -0.22], [0.06, -0.22], [0.0, -0.0], [0.25, -0.37], [0.28, -0.23], [-0.03, 0.15], [0.13, -0.04], [0.23, -0.12]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 35/400, Epsilon:0, Average reward: -42.01
Agent 0 : Predicted Values and True Return : 
 [-1.0, -2.0, -2.0, -0.0, -1.0, -1.0, -2.0, -2.0, -3.0, -2.0] 
 [-745.0, -112.0, -350.0, -18.0, -321.0, -758.0, -425.0, -349.0, -282.0, -150.0]
Agent 0 : Logits on those states : 
 [[0.58, -0.56], [0.2, -0.34], [0.13, -0.12], [0.04, -0.03], [0.08, -0.15], [0.09, -0.06], [0.47, -0.22], [0.27, -0.28], [-0.04, 0.16], [-0.11, -0.14]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 36/400, Epsilon:0, Average reward: -51.81
Ag

Episode: 51/400, Epsilon:0, Average reward: -79.69
Agent 0 : Predicted Values and True Return : 
 [-0.0, -3.0, -2.0, -0.0, -6.0, -3.0, -2.0, -1.0, -2.0, -3.0] 
 [-53.0, -62.0, -525.0, -25.0, -403.0, -62.0, -1109.0, -537.0, -45.0, -908.0]
Agent 0 : Logits on those states : 
 [[0.0, -0.0], [0.28, -0.35], [0.14, -0.29], [0.05, -0.02], [0.05, 0.02], [0.06, 0.04], [-0.21, -0.05], [0.1, -0.14], [0.32, -0.24], [0.04, -0.17]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 52/400, Epsilon:0, Average reward: -89.01
Agent 0 : Predicted Values and True Return : 
 [-3.0, -6.0, -1.0, -2.0, -2.0, -6.0, -0.0, -2.0, -2.0, -2.0] 
 [-1571.0, -1986.0, -1720.0, -236.0, -27.0, -1986.0, -49.0, -1501.0, -397.0, -328.0]
Agent 0 : Logits on those states : 
 [[-0.08, -0.06], [0.4, -0.15], [0.04, -0.0], [0.06, -0.02], [0.03, -0.09], [0.4, -0.15], [0.0, -0.0], [0.04, 0.01], [0.25, -0.44], [0.27, -0.21]]
Agent 0 : Logits on the 0 state : 
 [-0. -0.]
Episode: 53/400, Epsilon:0, Average reward: -49.41
Agent 0

Episode: 68/400, Epsilon:0, Average reward: -46.09
Agent 0 : Predicted Values and True Return : 
 [-6.0, -9.0, -8.0, -5.0, -12.0, -4.0, -1.0, -7.0, -4.0, -10.0] 
 [-113.0, -385.0, -163.0, -60.0, -484.0, -119.0, -55.0, -56.0, -58.0, -125.0]
Agent 0 : Logits on those states : 
 [[0.05, -0.12], [0.11, -0.18], [0.47, -0.43], [0.56, -0.29], [0.34, 0.32], [0.04, -0.12], [0.09, -0.23], [0.0, 0.11], [-0.1, -0.01], [-0.22, 0.06]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 69/400, Epsilon:0, Average reward: -42.81
Agent 0 : Predicted Values and True Return : 
 [-9.0, -7.0, -0.0, -1.0, -7.0, -4.0, -7.0, -6.0, -12.0, -1.0] 
 [-37.0, -342.0, -107.0, -45.0, -542.0, -27.0, -172.0, -58.0, -172.0, -175.0]
Agent 0 : Logits on those states : 
 [[0.23, -0.39], [0.07, 0.11], [0.0, 0.0], [0.11, -0.07], [0.24, -0.31], [0.57, -0.04], [-0.01, -0.35], [-0.0, -0.1], [-0.02, -0.14], [0.26, -0.21]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 70/400, Epsilon:0, Average reward: -139.61
Agent 0

Episode: 85/400, Epsilon:0, Average reward: -76.42
Agent 0 : Predicted Values and True Return : 
 [-0.0, -9.0, -12.0, -21.0, -1.0, -6.0, -9.0, -7.0, -2.0, -15.0] 
 [-33.0, -1178.0, -656.0, -698.0, -436.0, -644.0, -1565.0, -93.0, -103.0, -43.0]
Agent 0 : Logits on those states : 
 [[0.0, -0.0], [0.07, -0.07], [0.23, -0.19], [0.09, 0.09], [0.05, -0.0], [0.15, -0.11], [0.04, -0.19], [0.15, -0.28], [0.1, -0.15], [-0.07, -0.06]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 86/400, Epsilon:0, Average reward: -35.9
Agent 0 : Predicted Values and True Return : 
 [-17.0, -23.0, -13.0, -0.0, -12.0, -14.0, -17.0, -20.0, -0.0, -9.0] 
 [-531.0, -493.0, -316.0, -89.0, -82.0, -549.0, -248.0, -246.0, -89.0, -103.0]
Agent 0 : Logits on those states : 
 [[0.68, -0.23], [0.14, -0.18], [-0.04, -0.11], [0.0, -0.0], [0.19, 0.15], [0.76, -0.59], [-0.1, 0.18], [0.46, -0.57], [0.0, -0.0], [0.37, -0.26]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 87/400, Epsilon:0, Average reward: -39.06
A

Episode: 102/400, Epsilon:0, Average reward: -72.18
Agent 0 : Predicted Values and True Return : 
 [-37.0, -12.0, -40.0, -49.0, -13.0, -35.0, -13.0, -1.0, -24.0, -25.0] 
 [-536.0, -84.0, -442.0, -476.0, -229.0, -475.0, -16.0, -124.0, -1882.0, -1631.0]
Agent 0 : Logits on those states : 
 [[0.12, -0.07], [0.08, -0.25], [-0.1, -0.2], [0.07, -0.01], [0.46, -0.03], [-0.11, -0.04], [0.21, 0.13], [0.09, -0.04], [0.56, -0.63], [0.24, -0.33]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 103/400, Epsilon:0, Average reward: -49.0
Agent 0 : Predicted Values and True Return : 
 [-14.0, -36.0, -29.0, -22.0, -8.0, -12.0, -26.0, -18.0, -13.0, -10.0] 
 [-199.0, -920.0, -787.0, -335.0, -57.0, -87.0, -508.0, -94.0, -628.0, -63.0]
Agent 0 : Logits on those states : 
 [[-0.01, -0.09], [0.75, -0.53], [0.25, -0.4], [0.07, -0.1], [-0.0, -0.06], [-0.02, -0.28], [-0.32, 0.2], [-0.09, 0.04], [0.19, -0.28], [-0.16, -0.08]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 104/400, Epsilon:0, Avera

Episode: 120/400, Epsilon:0, Average reward: -49.88
Agent 0 : Predicted Values and True Return : 
 [-101.0, -16.0, -38.0, -26.0, -14.0, -4.0, -20.0, -34.0, -51.0, -0.0] 
 [-720.0, -855.0, -788.0, -43.0, -75.0, -191.0, -17.0, -353.0, -901.0, -18.0]
Agent 0 : Logits on those states : 
 [[0.04, -0.38], [0.23, -0.11], [-0.14, 0.03], [-0.25, -0.2], [0.0, 0.02], [0.14, -0.14], [-0.03, -0.32], [0.13, -0.1], [-0.39, 0.35], [0.0, -0.0]]
Agent 0 : Logits on the 0 state : 
 [-0. -0.]
Episode: 121/400, Epsilon:0, Average reward: -51.83
Agent 0 : Predicted Values and True Return : 
 [-47.0, -77.0, -41.0, -51.0, -0.0, -32.0, -38.0, -60.0, -40.0, -12.0] 
 [-1026.0, -579.0, -196.0, -1160.0, -35.0, -187.0, -298.0, -540.0, -538.0, -430.0]
Agent 0 : Logits on those states : 
 [[0.18, -0.28], [0.24, -0.18], [-0.18, -0.14], [-0.02, 0.19], [-0.0, 0.0], [0.17, -0.08], [0.08, 0.01], [0.16, -0.21], [0.53, -0.31], [0.05, -0.03]]
Agent 0 : Logits on the 0 state : 
 [-0. -0.]
Episode: 122/400, Epsilon:0, Average 

Episode: 136/400, Epsilon:0, Average reward: -45.03
Agent 0 : Predicted Values and True Return : 
 [-22.0, -53.0, -102.0, -21.0, -54.0, -33.0, -62.0, -29.0, -71.0, -14.0] 
 [-47.0, -85.0, -472.0, -686.0, -718.0, -121.0, -479.0, -222.0, -858.0, -446.0]
Agent 0 : Logits on those states : 
 [[0.18, -0.2], [-0.19, -0.14], [0.19, -0.23], [0.01, 0.08], [-0.05, 0.09], [-0.05, -0.03], [0.25, -0.58], [-0.02, -0.04], [0.03, -0.19], [0.06, 0.09]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 137/400, Epsilon:0, Average reward: -60.46
Agent 0 : Predicted Values and True Return : 
 [-48.0, -29.0, -66.0, -27.0, -31.0, -86.0, -116.0, -7.0, -13.0, -21.0] 
 [-281.0, -54.0, -284.0, -43.0, -126.0, -264.0, -267.0, -71.0, -52.0, -911.0]
Agent 0 : Logits on those states : 
 [[-0.1, 0.05], [0.05, -0.07], [0.77, -0.27], [0.17, 0.11], [-0.03, 0.0], [0.09, -0.26], [0.25, -0.33], [-0.02, -0.04], [-0.01, -0.05], [0.02, 0.03]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 138/400, Epsilon:0, Aver

Episode: 153/400, Epsilon:0, Average reward: -48.1
Agent 0 : Predicted Values and True Return : 
 [-100.0, -76.0, -132.0, -71.0, -9.0, -68.0, -31.0, -153.0, -67.0, -109.0] 
 [-145.0, -83.0, -419.0, -681.0, -63.0, -839.0, -178.0, -418.0, -71.0, -240.0]
Agent 0 : Logits on those states : 
 [[0.04, -0.24], [-0.14, 0.07], [0.28, -0.13], [0.07, -0.18], [0.25, -0.21], [-0.09, -0.21], [-0.03, -0.15], [0.63, -0.35], [0.2, -0.15], [0.32, -0.33]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 154/400, Epsilon:0, Average reward: -49.77
Agent 0 : Predicted Values and True Return : 
 [-36.0, -142.0, -103.0, -71.0, -130.0, -52.0, -31.0, -132.0, -2.0, -8.0] 
 [-281.0, -256.0, -323.0, -1258.0, -910.0, -804.0, -45.0, -409.0, -38.0, -71.0]
Agent 0 : Logits on those states : 
 [[-0.03, 0.09], [-0.41, 0.29], [-0.02, 0.18], [-0.06, -0.02], [0.33, 0.07], [0.5, 0.22], [-0.08, 0.05], [0.26, -0.04], [0.05, -0.06], [0.02, -0.03]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 155/400, Epsilon:0,

Episode: 170/400, Epsilon:0, Average reward: -69.13
Agent 0 : Predicted Values and True Return : 
 [-199.0, -154.0, -165.0, -94.0, -79.0, -112.0, -90.0, -83.0, -128.0, -64.0] 
 [-1280.0, -980.0, -1059.0, -1166.0, -210.0, -1084.0, -978.0, -1442.0, -1232.0, -1475.0]
Agent 0 : Logits on those states : 
 [[-0.24, -0.2], [0.25, 0.58], [-0.32, 0.2], [0.14, 0.22], [0.28, 0.05], [0.11, -0.06], [0.21, 0.13], [0.08, 0.1], [-0.07, -0.03], [0.06, -0.04]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Failed load attempt 1/5. Re-attempting.
Failed load attempt 2/5. Re-attempting.
Failed load attempt 3/5. Re-attempting.
Failed load attempt 4/5. Re-attempting.


Exception: Failed 5th loading attempt. Please restart program. TERMINATING NOW.

In [None]:
# Plotting training progress: per-episode average reward plus a linear trend line
plt.figure(figsize=(8,4.5))
x_series = range(1,len(reward_storage)+1)
plt.plot(x_series, reward_storage, '-b', label='Episode Reward')
# np.polyfit raises on an empty series and a line through a single point is
# meaningless, so the trend is only drawn once there are at least two episodes.
if len(reward_storage) >= 2:
    fit = np.polyfit(x_series,reward_storage,1)
    fit_fn = np.poly1d(fit)
    plt.plot(x_series, fit_fn(x_series), '--r', label='Linear Trend')
plt.xlabel('Episodes')
plt.ylabel('Average agent reward in episode')
plt.title('Training evolution and trend')
plt.legend()
plt.show()

# Plotting training loss of the first agent
plt.figure(figsize=(8,4.5))
x_series = range(1,len(Agents[0].loss)+1)
plt.plot(x_series,Agents[0].loss, '-b')
plt.xlabel('Training Epoch')
plt.ylabel('Loss')
plt.title('Model Loss')
# One-element tuple: ('Loss') is just the string 'Loss', which matplotlib iterates
# character by character, so the legend entry used to read "L".
plt.gca().legend(('Loss',))
plt.show()

print(reward_storage)

In [None]:
# Plotting test progress: queue lengths and accumulated delay recorded by the first agent
queues_over_time = Agents[0].queues_over_time

# Queue Lengths
# Each recorded sample is indexed 0..3; the West/South/East/North order follows the
# original assignment in this cell.
West_queue = [sample[0] for sample in queues_over_time]
South_queue = [sample[1] for sample in queues_over_time]
East_queue = [sample[2] for sample in queues_over_time]
North_queue = [sample[3] for sample in queues_over_time]

plt.figure(figsize=(8,4.5))
x_series = range(1,len(queues_over_time)+1)
plt.plot(x_series, West_queue, '-b',
         x_series, South_queue, '-r',
         x_series, East_queue, '-g',
         x_series, North_queue, '-y')
plt.xlabel('Time [s]')
plt.ylabel('Queue Length')
# The title used to be the copy-pasted 'Training evolution and trend'.
plt.title('Queue lengths over time')
plt.gca().legend(('West Queue','South Queue', 'East Queue', 'North Queue'))
plt.show()

# Accumulated delay over time
plt.figure(figsize=(8,4.5))
x_series = range(1,len(Agents[0].accumulated_delay)+1)
plt.plot(x_series,Agents[0].accumulated_delay, '-b')
plt.xlabel('Time [s]')
plt.ylabel('Accumulated global Delay')
plt.title('Global Delay')
# Labels must be a sequence: a bare string is iterated character by character,
# which produced a legend entry reading "G".
plt.gca().legend(('Global accumulated delay',))
plt.show()

average_queue_length = np.mean(queues_over_time)
print("Average queue size is {}".format(np.round(average_queue_length,2)))