## Imports

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import itertools

from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.losses import CategoricalCrossentropy

import collections
import random

from Intersection import Lane, Approach, Intersection, Exit, Trafficlight

# Let TensorFlow grow GPU memory on demand instead of reserving it all upfront.
# The list is empty on CPU-only machines, so iterate rather than indexing [0];
# TensorFlow also requires the memory-growth setting to match on every GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, True)

## Intersection configuration

In [None]:
# Build the four-way intersection that the agent controls.
#
# NOTE(review): Lane, Exit, Trafficlight, Approach and Intersection come from
# Intersection.py, which is not visible here, so argument meanings are inferred
# from usage in this cell only — confirm against that module:
#   * Lane(a, light, flag, targets): `flag` is True for every exit lane and
#     False for every approach lane; `light` is None for exit lanes and a
#     Trafficlight for approach lanes; `targets` is None for exit lanes and a
#     list of exit lanes for approach lanes. `a` is a list of small ints
#     (presumably direction codes).
#   * Trafficlight(idx, a, 0, 1, 2): `idx` runs 0..9 across the cell and `a`
#     always equals the first argument of the Lane the light is attached to.
#   * Approach(idx, lanes, angle) / Exit(idx, lanes): `idx` is 0..3 in the
#     order N, E, S, W.

# Exit N (one lane)
exit_n_r = Lane([0], None, True, None)
exit_n = Exit(0, [exit_n_r])

# Exit E (two lanes)
exit_e_r = Lane([0], None, True, None)
exit_e_l = Lane([0], None, True, None)
exit_e = Exit(1, [exit_e_r, exit_e_l])

# Exit S (one lane)
exit_s_r = Lane([0], None, True, None)
exit_s = Exit(2, [exit_s_r])

# Exit W (two lanes)
exit_w_r = Lane([0], None, True, None)
exit_w_l = Lane([0], None, True, None)
exit_w = Exit(3, [exit_w_r, exit_w_l])

# Approach N: two lanes, traffic lights 0 and 1
traffic_light_n_r = Trafficlight(0, [1], 0, 1, 2)
traffic_light_n_l = Trafficlight(1, [0, 2], 0, 1, 2)

lane_n_r = Lane([1], traffic_light_n_r, False, [exit_w_r])
lane_n_l = Lane([0, 2], traffic_light_n_l, False, [exit_s_r, exit_e_l])

approach_n = Approach(0, [lane_n_r, lane_n_l], 90)

# Approach E: three lanes, traffic lights 2, 3 and 4
traffic_light_e_r = Trafficlight(2, [0, 1], 0, 1, 2)
traffic_light_e_m = Trafficlight(3, [0], 0, 1, 2)
traffic_light_e_l = Trafficlight(4, [2], 0, 1, 2)

lane_e_r = Lane([0, 1], traffic_light_e_r, False, [exit_n_r, exit_w_r])
lane_e_m = Lane([0], traffic_light_e_m, False, [exit_w_l])
lane_e_l = Lane([2], traffic_light_e_l, False, [exit_s_r])

approach_e = Approach(1, [lane_e_r, lane_e_m, lane_e_l], 0)

# Approach S: two lanes, traffic lights 5 and 6
traffic_light_s_r = Trafficlight(5, [0, 1], 0, 1, 2)
traffic_light_s_l = Trafficlight(6, [2], 0, 1, 2)

lane_s_r = Lane([0, 1], traffic_light_s_r, False, [exit_e_r, exit_n_r])
lane_s_l = Lane([2], traffic_light_s_l, False, [exit_w_l])

approach_s = Approach(2, [lane_s_r, lane_s_l], 270)

# Approach W: three lanes, traffic lights 7, 8 and 9
traffic_light_w_r = Trafficlight(7, [0, 1], 0, 1, 2)
traffic_light_w_m = Trafficlight(8, [0], 0, 1, 2)
traffic_light_w_l = Trafficlight(9, [2], 0, 1, 2)

lane_w_r = Lane([0, 1], traffic_light_w_r, False, [exit_s_r, exit_e_r])
lane_w_m = Lane([0], traffic_light_w_m, False, [exit_e_l])
lane_w_l = Lane([2], traffic_light_w_l, False, [exit_n_r])

approach_w = Approach(3, [lane_w_r, lane_w_m, lane_w_l], 180)

# `u` and `i` each hold ten values, one per traffic light index 0..9.
# NOTE(review): their meaning (timings? capacities?) is defined by Intersection — confirm.
u =  [8, 8, 9, 9, 9, 8, 8, 9, 9, 9]
i =  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

# 3 x 10 binary matrix: one column per traffic light, and every column has
# exactly one 1, so each light belongs to exactly one of the three rows
# (presumably signal phase groups — confirm against Intersection).
V = [[1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
     [0, 0, 0, 1, 1, 0, 0, 0, 1, 1],
     [0, 0, 1, 0, 0, 0, 0, 1, 0, 0]]

# Number of rows in V; used later as the number of components in each action.
m = len(V)
# Exclusive upper bound for each action component when the action table is
# built with np.arange(0, a_max); also handed to Intersection.
a_max = 5

intersection = Intersection([approach_n, approach_e, approach_s, approach_w], [exit_n, exit_e, exit_s, exit_w], u, i, V, a_max)

## DQN
### Properties

In [None]:
# Hyperparameters of the DQN training run.
DISCOUNT = 0.70                 # gamma applied to the bootstrapped next-state value
REPLAY_MEMORY_CAPACITY = 10000  # maximum number of transitions kept in replay memory
BATCH_SIZE = 64                 # transitions sampled per training step
UPDATE_TARGET_INTERVAL = 50     # training steps between target-network syncs
EPSILON = 0.95                  # initial exploration probability
MIN_EPSILON = 0.01              # floor for the exploration probability
DECAY = 0.9999                  # multiplicative epsilon decay per training step
NR_EPISODES = 30                # number of episodes to train for

state = intersection.reset()
done = False

# Enumerate every m-tuple with components in [0, a_max) and drop each tuple
# that contains a 0, leaving only actions whose components are all >= 1.
# np.any replaces np.sometrue, which was deprecated in NumPy 1.25 and removed
# in NumPy 2.0.
action_space = itertools.product(np.arange(0, a_max), repeat=m)
action_space = np.array(list(action_space))
action_space = action_space[~np.any(action_space == 0, axis=1)]

# The agent picks an index into `action_space`; these are the valid indices.
POSSIBLE_ACTIONS = np.arange(0, len(action_space)).tolist()

### DQN Class

In [None]:
class DQAgent:
    """Deep Q-learning agent: replay memory plus a Q-network and a target network.

    Both networks are built by ``buildNetwork`` and the target network starts
    as an exact copy of the Q-network's weights.
    """

    def __init__(self, replayCapacity, action_space, state_shape=None):
        """Create the replay memory and both networks.

        Args:
            replayCapacity: Maximum number of steps kept in the replay memory;
                the oldest steps are dropped once it is full.
            action_space: Array-like table of actions; the network gets one
                output unit per row.
            state_shape: Input shape of the networks. Defaults to ``[10]``,
                the value that was previously hard-coded.
        """
        ## Initialize replay memory
        self.capacity = replayCapacity
        self.memory = collections.deque(maxlen=self.capacity)
        self.populated = False

        ## Q network
        self.state_shape = [10] if state_shape is None else list(state_shape)
        self.action_space = np.array(action_space)
        self.action_shape = len(self.action_space)

        self.q_model = self.buildNetwork()
        ## Target network
        self.target_model = self.buildNetwork()
        self.target_model.set_weights(self.q_model.get_weights())

    def addToReplayMemory(self, step):
        """Append one step to the replay memory (also kept as ``self.step``)."""
        self.step = step
        self.memory.append(self.step)

    def sampleFromReplayMemory(self, batchSize):
        """Return ``batchSize`` distinct steps drawn at random from the memory.

        Returns ``False`` when the memory holds fewer than ``batchSize`` steps.
        ``self.populated`` records whether the request could be served; it was
        previously never set to ``True``.
        """
        self.batchSize = batchSize
        self.populated = self.batchSize <= len(self.memory)
        if not self.populated:
            return False
        return random.sample(self.memory, self.batchSize)

    def buildNetwork(self):
        """Build and compile a fresh network with one linear output per action."""
        model = Sequential()
        model.add(Dense(32, input_shape=self.state_shape, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

        model.add(Dense(48, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

        model.add(Dense(self.action_shape, activation='linear'))
        # `learning_rate` is the supported keyword; `lr` is deprecated in
        # TF 2.x and rejected by newer Keras releases.
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=['MeanSquaredError'])
        return model

    def q_network_fit(self, batch, batchSize):
        """Record ``batch`` and ``batchSize`` on the instance.

        NOTE(review): this is a stub — no training happens here; the training
        loop calls ``q_model.fit`` directly.
        """
        self.batchSize = batchSize
        self.batch = batch

    def q_network_predict(self, state):
        """Return the Q-network's prediction for ``state`` (a batch of states)."""
        self.state = state
        self.qPolicy = self.q_model.predict(self.state)
        return self.qPolicy

    def target_network_predict(self, state):
        """Return the target network's prediction for ``state`` (a batch of states)."""
        self.state = state
        self.qTarget = self.target_model.predict(self.state)
        return self.qTarget

    def update_target_network(self):
        """Copy the Q-network's current weights into the target network."""
        self.target_model.set_weights(self.q_model.get_weights())

In [None]:
# Create the agent with the configured replay capacity and action table.
agent = DQAgent(
    replayCapacity=REPLAY_MEMORY_CAPACITY,
    action_space=action_space,
)

### Training

In [None]:
# Train the agent with epsilon-greedy exploration, experience replay and a
# periodically synchronised target network.
updateCounter = 0   # training steps since the last target-network sync
rewardHistory = []  # total reward collected in each episode

# Rebuild the action table so this cell can run on its own: all m-tuples with
# components in [0, a_max), minus every tuple containing a 0.
# np.any replaces np.sometrue (removed in NumPy 2.0).
action_space = itertools.product(np.arange(0, a_max), repeat=m)
action_space = np.array(list(action_space))
action_space = action_space[~np.any(action_space == 0, axis=1)]


for episode in range(NR_EPISODES):
    episodeReward = 0
    stepCounter = 0  # count the number of successful steps within the episode

    state = intersection.reset()
    done = False

    while not done:
        r = random.random()

        if r <= EPSILON:
            # Explore: pick a random action index.
            action = random.sample(POSSIBLE_ACTIONS, 1)[0]
            print(action)
        else:
            # Exploit: pick the action with the highest predicted Q-value.
            qValues = agent.q_network_predict(state.reshape(1, -1))
            action = np.argmax(qValues[0])

        newState, reward, done, info = intersection.step(action_space[action])

        stepCounter += 1

        # Store the step in replay memory.
        step = (state, action, reward, newState, done)
        agent.addToReplayMemory(step)
        state = newState
        episodeReward += reward

        # Training only starts once the memory can fill a whole minibatch.
        if len(agent.memory) < BATCH_SIZE:
            continue

        # Decay epsilon once per training step, never below MIN_EPSILON.
        EPSILON = DECAY * EPSILON
        print(EPSILON)
        EPSILON = max(EPSILON, MIN_EPSILON)

        # Sample a minibatch and transpose it once into per-field columns.
        miniBatch = agent.sampleFromReplayMemory(BATCH_SIZE)
        batch_states, batch_actions, batch_rewards, batch_next_states, batch_done = zip(*miniBatch)
        miniBatch_states = np.asarray(batch_states, dtype=float)
        miniBatch_actions = np.asarray(batch_actions, dtype=int)
        miniBatch_rewards = np.asarray(batch_rewards, dtype=float)
        miniBatch_next_state = np.asarray(batch_next_states, dtype=float)
        miniBatch_done = np.asarray(batch_done, dtype=bool)

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        current_state_q_values = agent.q_network_predict(miniBatch_states)
        y = current_state_q_values

        next_state_q_values = agent.target_network_predict(miniBatch_next_state)
        max_q_next_state = np.max(next_state_q_values, axis=1)

        # Target is the reward for terminal steps, otherwise reward plus the
        # discounted best next-state value. Vectorised so that no loop
        # variable overwrites the notebook-global `i` passed to Intersection.
        td_targets = np.where(
            miniBatch_done,
            miniBatch_rewards,
            miniBatch_rewards + DISCOUNT * max_q_next_state,
        )
        y[np.arange(BATCH_SIZE), miniBatch_actions] = td_targets

        agent.q_model.fit(miniBatch_states, y, batch_size=BATCH_SIZE, verbose=0)

        # Sync the target network every UPDATE_TARGET_INTERVAL training steps.
        if updateCounter == UPDATE_TARGET_INTERVAL:
            agent.update_target_network()
            print('target updated')
            updateCounter = 0
        updateCounter += 1
    print('episodeReward for episode ', episode, '= ', episodeReward, 'with epsilon = ', EPSILON)
    rewardHistory.append(episodeReward)

intersection.close()

## Reward visualization

In [None]:
# Plot the total reward collected in each training episode.
plt.plot(rewardHistory)
plt.gca().set(xlabel='Episode', ylabel='Reward', title='Reward per episode')
plt.show()

In [None]:
# Plot the history recorded in `intersection.drukte_hist` over time.
plt.plot(intersection.drukte_hist)
plt.gca().set(xlabel='Time', ylabel='Aantal autos', title='Drukte - DQN')
plt.show()