In [1]:
## VISSIM Modules
import win32com.client as com
import os

## RL Modules
import tensorflow as tf

from tensorflow.keras.models import load_model
    
## Data Management Modules
import pickle

## User Defined Modules
import math
import Simulator_Functions as SF

from Actor_Critic_Agents import ACAgent



from NParser import NetworkParser
from COMServer import COMServerDispatch, COMServerReload
from TupleToList import toList
from Utilities import log_progress, pltlive
## Other Modules
import numpy as np
import random
import matplotlib.pyplot as plt
import PER

%matplotlib inline

In [2]:
## RL Hyperparameters
# Training length and checkpoint cadence: agents are saved every `partial_save_at` episodes
episodes = 400
partial_save_at = 25 #50

# Feature switches
Surtrac = False
AC = True
PER_activated = False

# Optimisation settings
alpha   = 5e-7   # learning rate
gamma   = 0.85   # discount factor (0.99 was also tried)
entropy = 1e-6   # entropy weight, i.e. exploration
value   = 5      # weight attributed to the value loss during gradient descent (0.5 was also tried)

# Value check performed after every training episode
n_sample = 10    # number of states sampled for the check
horizon  = 100   # number of forward steps used to compute the return

# Entropy reduction during training (not implemented yet)
reduce_entropy = False
reduce_entropy_every = 1000

# Number of steps used by the n-step learning (16 and 32 were also tried).
# Does not work if n_step_size is over 31
n_step_size = 8

# Simulation timing: resolution in timesteps per simulated second, phase durations in seconds,
# and the episode length handed to the simulator (2400 was also used)
timesteps_per_second = 1
seconds_per_green, seconds_per_yellow = 6, 3
simulation_length = 60 * 60 + 1

## State-Action Parameters
action_type = "phases"   # options are "phases" and "programs"
state_size  = (2, 4, 11) # earlier variants: 5 (4 queues + signal state), 49, 53, (1,8,6) for conv
action_size = 2

# Demand schedule (demand in cars/hour): one [first input, second input] pair per period,
# a new period starting every `demand_change_timesteps` (600 was also used)
demand_change_timesteps = 450
demand = {"h": 600, 'm': 300, 'l': 150}
demand_list = [[demand[first], demand[second]]
               for first, second in (('l', 'l'), ('m', 'l'), ('h', 'l'), ('h', 'm'),
                                     ('h', 'h'), ('m', 'h'), ('l', 'h'), ('l', 'm'))]



In [3]:
## Operation mode (selects functionalities)
mode = "training"
# "training" = training agents, maximum speed, frozen UI, mid amount of messages
# "retraining" = continue the training of previous agent
# "debug"    = trains for 1 episode, minimum speed, working UI, all messages
# "demo"     = loads pretrained agent, minimum speed, working UI
# "test"     = executes evaluation, maximum speed


## Network Model Parameters
model_name  = 'Single_Cross_Mod2'
# 'Single_Cross_Straight'
# 'Single_Cross_Triple'
# 'Single_Cross_Triple_Mod'
# 'Single_Cross_Mod2'
# 'Balance'

# Folder holding the Vissim models. It can be overridden with the VISSIM_WORKING_DIRECTORY
# environment variable so the notebook runs on another machine without editing this cell;
# the original location remains the default.
vissim_working_directory = os.environ.get(
    'VISSIM_WORKING_DIRECTORY',
    'C:\\Users\\Rzhang\\Desktop\\MLforFlowOptimisationOrigine\\Vissim\\')
# The default ends with a path separator; keep that property for overridden values too
# (NOTE(review): downstream helpers presumably rely on the trailing separator - confirm).
if not vissim_working_directory.endswith(('\\', '/')):
    vissim_working_directory += '\\'

agent_type = 'AC' # DQN, DuelingDQN, DDQN, DuelingDDQN AC
reward_type = 'Queues'

state_type  = 'CellsT'
# Other state types: 'Queues', 'Delays', 'QueuesDifference', 'QueuesSpeedavrOccuperateSig',
# 'QueuesSig', 'CellsSpeedSig', 'CellsSpeedOccSig', 'CellsOccSig', 'CellsT'
Random_Seed = 42

## Use of additional files?
flag_read_additionally  = True
SaveResultsAgent = True
# Random demand
Random_Demand = False

# Passed as `best` to SF.load_agents when a saved agent is loaded (demo and test modes)
best = True


# Session ID, including the state type
Session_ID = 'Ep_'+str(episodes)+'_A_'+agent_type+"_State_"+state_type+"_Act_"+action_type+"_Rew_"+reward_type
print(Session_ID)

# Demo: one hour at constant low demand (a single demand period covering the whole run)
if mode == 'demo' :
    simulation_length = 3601
    demand_list = [[demand['l'], demand['l']]]
    demand_change_timesteps = simulation_length

# Test: one hour with a heavier demand schedule and a fixed evaluation seed
if mode == 'test' :
    simulation_length = 3601
    demand_change_timesteps = 450
    demand = {"h":800, 'm':400, 'l':200}
    demand_list = [[demand['l'], demand['l']], [demand['m'], demand['l']],
                   [demand['h'], demand['l']], [demand['h'], demand['m']],
                   [demand['h'], demand['h']], [demand['m'], demand['h']],
                   [demand['l'], demand['h']], [demand['l'], demand['m']]]
    Random_Seed = 1
    # NOTE(review): best = False presumably makes SF.load_agents load the last saved agent
    # instead of the best one - confirm this is the intended agent for evaluation.
    best = False


Ep_400_A_AC_State_CellsT_Act_phases_Rew_Queues


In [None]:
# TODO: find a way to reduce the entropy over time (entropy = exploration)

## Converging network
# - reward queue, state queue
# converging with updates every steps and entropy = 0.00001 and 1 core layer of 42
# converging well with updates every steps and entropy = 0.00001 and no core

# - reward queue state queues + sig
# converging well with updates every steps and entropy = 0.00001 and no core

# 64 is a good number

# Driver cell: dispatches the Vissim COM server, deploys one agent per signal controller and then,
# depending on `mode`, either runs a single demo/debug/test episode or the full training loop.
if __name__ == "__main__":
    # Initialize storage
    reward_storage = []
    best_agent_weights = []
    best_agent_memory = []
    reward_plot = np.zeros([episodes,])
    loss_plot = np.zeros([episodes,])

    # Initialize simulation
    Vissim, Simulation, Network, cache_flag = COMServerDispatch(model_name, vissim_working_directory,
                                                                simulation_length, timesteps_per_second,
                                                                delete_results = True, verbose = True)
    # Everything that talks to Vissim runs inside try/finally so the handle is dropped
    # even when an episode raises, instead of only on the success path.
    try:
        SF.Select_Vissim_Mode(Vissim, mode)

        runflag = True
        # Setting Random Seed
        Vissim.Simulation.SetAttValue('RandSeed', Random_Seed)
        print ('Random seed set in simulator. Random Seed = '+str(Random_Seed))

        # Deploy Network Parser (crawl network)
        npa = NetworkParser(Vissim)
        print('NetworkParser has succesfully crawled the model network.')

        # Initialize agents
        if agent_type in ['AC']:
            Agents = [ACAgent(state_size, action_size, ID, state_type, npa, n_step_size, gamma, alpha,
                              entropy, value, Vissim)
                      for ID in npa.signal_controllers_ids]
            for agent in Agents:
                # Initialises the computational graph of the model (there is probably a better way)
                agent.test()
            agents_deployed = True
        else:
            # Raise instead of calling quit(): in a notebook quit() shuts the kernel down
            raise ValueError("Incorrect Agent Class selected. Deployment could not be completed.")
        print("Deployed {} agent(s) of the Class {}.".format(len(Agents), agent_type))

        ## EXECUTION OF A DEMONSTRATION RUN (slow, choice of best available agent)
        if mode in ("demo", "populate", "debug", "test"):
            # If demo or debug, set slow simulation
            if mode in ("demo", "debug"):
                timesteps_per_second = 10
                Vissim.Simulation.SetAttValue('SimRes', timesteps_per_second)
            # If memory population or test mode, set quick simulation
            else:
                SF.Set_Quickmode(Vissim, timesteps_per_second)

            # If on a test or a demo, load a saved agent and set exploration to zero
            if mode in ("demo", "test"):
                Agents, reward_storage = SF.load_agents(vissim_working_directory, model_name, Agents,
                                                        Session_ID, best = best)
                for agent in Agents:
                    agent.epsilon = 0

            # Run the episode (the "populate" mode only configures the simulator here)
            if mode in ("demo", "debug", "test"):
                SF.run_simulation_episode(Agents, Vissim, state_type, reward_type, state_size, simulation_length,
                                          timesteps_per_second, seconds_per_green, seconds_per_yellow,
                                          demand_list, demand_change_timesteps, mode, PER_activated)

        ## EXECUTION OF THE NORMAL TRAINING LOOP
        elif mode in ("training", "retraining"):
            # NOTE(review): no previously saved agent is loaded in this cell for "retraining";
            # confirm whether SF.run_simulation_episode handles it or a load step is missing.
            print("Training")

            # Iterations of the simulation
            for episode in log_progress(range(episodes), every=1):

                # Reload map if it has already been run (previous episode or prepopulation)
                if episode != 0 or runflag:
                    Simulation, Network = COMServerReload(Vissim, model_name, vissim_working_directory,
                                                          simulation_length, timesteps_per_second,
                                                          delete_results = True)

                # Change the random seed. The seed is derived from the configured Random_Seed rather
                # than incrementing the global, so re-running this cell reproduces the same seeds
                # (first episode uses Random_Seed + 1, as before).
                episode_seed = Random_Seed + episode + 1
                Vissim.Simulation.SetAttValue('RandSeed', episode_seed)

                # Run Episode at maximum speed
                SF.Select_Vissim_Mode(Vissim, mode)

                SF.run_simulation_episode(Agents, Vissim, state_type, reward_type, state_size, simulation_length,
                                          timesteps_per_second, seconds_per_green, seconds_per_yellow,
                                          demand_list, demand_change_timesteps, mode,
                                          PER_activated, Surtrac = Surtrac)

                # Calculate episode average reward and keep track of the best agent so far
                reward_storage, average_reward = SF.average_reward(reward_storage, Agents, episode, episodes)
                best_agent_weights, best_agent_memory = SF.best_agent(reward_storage, average_reward,
                                                                      best_agent_weights, best_agent_memory,
                                                                      vissim_working_directory, model_name,
                                                                      Agents, Session_ID)

                # Compare the critic's predictions with the observed returns on sampled states
                for index, agent in enumerate(Agents):
                    predicted_values, true_values, logit0, logits = agent.value_check(horizon, n_sample)
                    print ("Agent {} : Predicted Values and True Return : \n {} \n {}" .format(index, predicted_values, true_values))
                    print ("Agent {} : Logits on those states : \n {}" .format(index, logits))
                    print ("Agent {} : Logits on the 0 state : \n {}" .format(index, logit0))

                # Security save for long trainings
                if SaveResultsAgent and (episode+1) % partial_save_at == 0:
                    SF.save_agents(vissim_working_directory, model_name, Agents, Session_ID, reward_storage)
                    print('Saved Partial results at the end of episode {}.'.format(episode+1))

                # Placeholder: reducing the entropy of the actor-critic is not implemented yet
                if reduce_entropy:
                    pass

            # Saving agents memory, weights and optimizer
            if SaveResultsAgent:
                SF.save_agents(vissim_working_directory, model_name, Agents, Session_ID, reward_storage)
                # Adjacent literals instead of a backslash continuation inside the string, which
                # embedded the indentation of the next source line into the printed message
                print("Model, architecture, weights, optimizer, memory and training results succesfully saved. "
                      "Succesfully Terminated.")

        else:
            print("ERROR: Mode selected not recognized. TERMINATING.")
    finally:
        # Close Vissim
        Vissim = None

Working Directory set to: C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\Vissim\
Generating Cache...
Cache generated.

****************************
*   COM Server dispatched  *
****************************

Attempting to load Model File: Single_Cross_Mod2.inpx ...
Load process successful
Simulation length set to 3601 seconds.
Results from Previous Simulations: Deleted. Fresh Start Available.
Fetched and containerized Simulation Object
Fetched and containerized Network Object 

*******************************************************
*                                                     *
*                 SETUP COMPLETE                      *
*                                                     *
*******************************************************

Random seed set in simulator. Random Seed = 42
NetworkParser has succesfully crawled the model network.
Deploying instance of Actor_Critic Agent(s) !!! TENSORFLOW 2 IS NEEDED !!! 
Model: "modelconv"
________________________________

VBox(children=(HTML(value=''), IntProgress(value=0, max=400)))

Episode: 1/400, Epsilon:0, Average reward: -47.24
Saving architecture, weights, optimizer state for best agent-0
New best agent found. Saved in C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\Vissim\Single_Cross_Mod2\Agents_Results\Ep_400_A_AC_State_CellsT_Act_phases_Rew_Queues\BestAgent0_Memory.p
Dumping Training Results into pickle file.
Dumping Loss Results into pickle file.
Agent 0 : Predicted Values and True Return : 
 [-4.0, -4.0, -3.0, -2.0, 1.0, -7.0, -6.0, -7.0, -0.0, -6.0] 
 [-53.0, -281.0, -33.0, -55.0, -53.0, -80.0, -144.0, -121.0, -1776.0, -140.0]
Agent 0 : Logits on those states : 
 [[0.87, 0.05], [2.09, 3.29], [2.42, -0.42], [0.26, 1.84], [2.64, -1.93], [2.56, 1.82], [-0.15, 0.41], [-1.74, 4.55], [-1.61, 1.93], [3.5, -0.64]]
Agent 0 : Logits on the 0 state : 
 [ 0. -0.]
Episode: 2/400, Epsilon:0, Average reward: -30.12
Saving architecture, weights, optimizer state for best agent-0
New best agent found. Saved in C:\Users\Rzhang\Desktop\MLforFlowOptimisationOrigine\Vi

Episode: 17/400, Epsilon:0, Average reward: -35.67
Agent 0 : Predicted Values and True Return : 
 [-6.0, -22.0, -5.0, -13.0, -21.0, -18.0, -13.0, -6.0, -20.0, -15.0] 
 [-74.0, -274.0, -10.0, -514.0, -438.0, -97.0, -36.0, -85.0, -244.0, -164.0]
Agent 0 : Logits on those states : 
 [[3.69, -0.68], [9.29, -4.33], [2.14, -0.81], [2.58, 0.44], [0.7, 5.32], [3.14, 0.63], [6.23, -1.92], [2.4, 0.63], [1.41, 0.01], [-0.22, 5.39]]
Agent 0 : Logits on the 0 state : 
 [0. 0.]
Episode: 18/400, Epsilon:0, Average reward: -45.03
Agent 0 : Predicted Values and True Return : 
 [-9.0, -17.0, -31.0, -12.0, -20.0, -17.0, -28.0, -18.0, -12.0, -7.0] 
 [-61.0, -113.0, -746.0, -102.0, -610.0, -131.0, -193.0, -871.0, -189.0, -194.0]
Agent 0 : Logits on those states : 
 [[-0.38, 1.86], [3.84, -0.65], [2.09, -1.07], [1.91, 0.59], [1.93, 7.14], [-1.64, 2.29], [-4.25, 4.36], [1.26, -1.0], [4.0, -0.99], [2.78, -0.65]]
Agent 0 : Logits on the 0 state : 
 [0. 0.]
Episode: 19/400, Epsilon:0, Average reward: -33.04
Age

Episode: 34/400, Epsilon:0, Average reward: -59.16
Agent 0 : Predicted Values and True Return : 
 [-29.0, -40.0, -39.0, -41.0, -19.0, -37.0, -14.0, -22.0, -44.0, -66.0] 
 [-332.0, -200.0, -683.0, -250.0, -210.0, -53.0, -1889.0, -279.0, -569.0, -811.0]
Agent 0 : Logits on those states : 
 [[5.43, -3.44], [2.36, 3.31], [4.89, -2.2], [4.35, -1.21], [3.69, 1.02], [2.15, 1.06], [1.11, 0.19], [-1.31, 3.03], [-2.27, 7.79], [-2.38, 7.19]]
Agent 0 : Logits on the 0 state : 
 [0. 0.]
Episode: 35/400, Epsilon:0, Average reward: -30.64
Agent 0 : Predicted Values and True Return : 
 [-30.0, -57.0, -35.0, -52.0, -17.0, -50.0, -30.0, -40.0, -31.0, -12.0] 
 [-138.0, -634.0, -346.0, -189.0, -219.0, -155.0, -205.0, -478.0, -156.0, -53.0]
Agent 0 : Logits on those states : 
 [[1.78, 0.76], [-0.97, 4.01], [3.05, 0.36], [-1.77, 2.37], [1.69, 0.29], [5.12, 3.39], [1.92, -2.38], [1.15, -0.65], [-1.82, 2.9], [2.38, -0.29]]
Agent 0 : Logits on the 0 state : 
 [0. 0.]
Episode: 36/400, Epsilon:0, Average reward:

Episode: 51/400, Epsilon:0, Average reward: -52.99
Agent 0 : Predicted Values and True Return : 
 [-67.0, -45.0, -73.0, -55.0, -78.0, -44.0, -53.0, -33.0, -26.0, -45.0] 
 [-431.0, -136.0, -1179.0, -240.0, -936.0, -308.0, -109.0, -51.0, -53.0, -136.0]
Agent 0 : Logits on those states : 
 [[0.94, 0.95], [3.36, 0.08], [-0.43, 2.53], [6.01, -2.86], [3.54, -1.53], [1.99, 3.11], [4.24, -0.47], [3.05, -0.72], [3.22, -0.01], [3.36, 0.08]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 52/400, Epsilon:0, Average reward: -43.35
Agent 0 : Predicted Values and True Return : 
 [-37.0, -50.0, -78.0, -91.0, -66.0, -54.0, -60.0, -73.0, -73.0, -80.0] 
 [-35.0, -223.0, -1439.0, -953.0, -215.0, -203.0, -244.0, -233.0, -478.0, -300.0]
Agent 0 : Logits on those states : 
 [[0.51, 4.41], [4.41, 0.56], [4.14, 2.61], [3.03, -1.64], [-0.01, 4.23], [0.95, 6.1], [2.36, 2.13], [1.45, -0.26], [2.28, -0.9], [3.84, -2.61]]
Agent 0 : Logits on the 0 state : 
 [-0.  0.]
Episode: 53/400, Epsilon:0, Average rewa

In [None]:
# Plotting training progress: average reward per episode plus a linear trend line
plt.figure(figsize=(8,4.5))
x_series = np.arange(1, len(reward_storage)+1)
plt.plot(x_series, reward_storage, '-b', label='Episode Reward')
# A degree-1 fit needs at least two points; skip the trend line for shorter runs
# (np.polyfit raises on empty input and is ill-conditioned with a single point)
if len(reward_storage) >= 2:
    fit = np.polyfit(x_series, reward_storage, 1)
    fit_fn = np.poly1d(fit)
    plt.plot(x_series, fit_fn(x_series), '--r', label='Linear Trend')
plt.xlabel('Episodes')
plt.ylabel('Average agent reward in episode')
plt.title('Training evolution and trend')
plt.gca().legend()
plt.show()

# Plotting training loss of the first agent
plt.figure(figsize=(8,4.5))
x_series = range(1,len(Agents[0].loss)+1)
plt.plot(x_series,Agents[0].loss, '-b')
plt.xlabel('Training Epoch')
plt.ylabel('Loss')
plt.title('Model Loss')
# ('Loss') is just the string 'Loss', not a tuple: the labels must be passed as a real sequence
plt.gca().legend(('Loss',))
plt.show()

print(reward_storage)

In [None]:
# Plotting test progress of the first agent

# Queue Lengths: each record of queues_over_time is indexed 0..3, plotted as
# West, South, East and North respectively
queues_over_time = Agents[0].queues_over_time
West_queue  = [queues[0] for queues in queues_over_time]
South_queue = [queues[1] for queues in queues_over_time]
East_queue  = [queues[2] for queues in queues_over_time]
North_queue = [queues[3] for queues in queues_over_time]

plt.figure(figsize=(8,4.5))
x_series = range(1,len(queues_over_time)+1)
plt.plot(x_series, West_queue, '-b',
         x_series, South_queue, '-r',
         x_series, East_queue, '-g',
         x_series, North_queue, '-y')
plt.xlabel('Time [s]')
plt.ylabel('Queue Length')
# The title was copy-pasted from the training plot; this figure shows queue lengths
plt.title('Queue lengths over time')
plt.gca().legend(('West Queue','South Queue', 'East Queue', 'North Queue'))
plt.show()

# Accumulated delay over time
plt.figure(figsize=(8,4.5))
x_series = range(1,len(Agents[0].accumulated_delay)+1)
plt.plot(x_series,Agents[0].accumulated_delay, '-b')
plt.xlabel('Time [s]')
plt.ylabel('Accumulated global Delay')
plt.title('Global Delay')
# The label must be wrapped in a sequence: a bare string is not a list of labels
plt.gca().legend(('Global accumulated delay',))
plt.show()

# Mean over every recorded queue value (all approaches, all records)
average_queue_length = np.mean(queues_over_time)
print("Average queue size is {}".format(np.round(average_queue_length,2)))