In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
from state import get_state, queue
import os
import traci

In [46]:
from dotenv import load_dotenv
load_dotenv()


sumo_bin = os.getenv("SUMO")
sumo_gui_bin = os.getenv("SUMO-GUI")
simulConfig = os.getenv("SIMUL-CONFIG")

In [47]:
import tensorflow as tf
import numpy as np
from collections import deque

In [48]:
# @tf.keras.saving.register_keras_serializable()
class DuelingDQN(tf.keras.Model):
    def __init__(self, input_shape, n_outputs):
        super(DuelingDQN, self).__init__()

        # Couche d'entrée commune
        self.shared_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation="relu", input_shape=input_shape),
            tf.keras.layers.Dense(128, activation="relu"),
        ])

        # **Stream Valeur** V(s)
        self.value_stream = tf.keras.layers.Dense(1)  # Une seule sortie : la valeur de l'état

        # **Stream Avantage** A(s, a)
        self.advantage_stream = tf.keras.layers.Dense(n_outputs)  # Une sortie par action

    def call(self, inputs):
        x = self.shared_layers(inputs)

        V = self.value_stream(x)  # Calcul de la valeur de l’état
        A = self.advantage_stream(x)  # Calcul des avantages

        # Normalisation de A pour éviter le biais
        Q = V + (A - tf.reduce_mean(A, axis=1, keepdims=True))

        return Q


In [49]:
tf.random.set_seed(42)
input_shape = [48]  # Taille de l'état (exemple)
n_outputs = 4  # Nombre d'actions possibles

# Création du modèle principal et du réseau cible
model_action = DuelingDQN(input_shape, n_outputs)
target = DuelingDQN(input_shape, n_outputs)
target.set_weights(model_action.get_weights())  # Copie des poids initiaux

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [50]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)  # random action
    else:
        Q_values = model_action.predict(state[np.newaxis], verbose=0)[0]
        print(f"state : {state}")
        print(f"Q_values : {Q_values}")
        return Q_values.argmax()  # optimal action according to the DQN

In [51]:
def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_states = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(4)
    ]
    return states, actions, rewards, next_states

In [None]:
batch_size = 32
discount_factor = 0.5
optimizer = tf.keras.optimizers.Adam(learning_rate=0.05)
loss_fn = tf.keras.losses.MeanSquaredError()

def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states = experiences

    # Sélection des meilleures actions avec le modèle d'action
    next_Q_values = model_action.predict(next_states, verbose=0)
    best_next_actions = next_Q_values.argmax(axis=1)  # Meilleures actions

    # Évaluation de ces actions avec le modèle cible (target)
    next_Q_values_target = target.predict(next_states, verbose=0)
    max_next_Q_values = tf.reduce_sum(next_Q_values_target * tf.one_hot(best_next_actions, n_outputs), axis=1)

    # Calcul de la cible des Q-values
    target_Q_values = rewards + discount_factor * max_next_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1)

    # Masque pour ne mettre à jour que les actions prises
    mask = tf.one_hot(actions, n_outputs)

    with tf.GradientTape() as tape:
        all_Q_values = model_action(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))

    grads = tape.gradient(loss, model_action.trainable_variables)
    optimizer.apply_gradients(zip(grads, model_action.trainable_variables))

In [52]:
batch_size = 32
discount_factor = 0.5
optimizer = tf.keras.optimizers.Adam(learning_rate=0.05)
loss_fn = tf.keras.losses.MeanSquaredError()


def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states = experiences  # a changer
    #################### CHANGED SECTION ####################
    next_Q_values = model_action.predict(next_states, verbose=0)  # ≠ target.predict()
    best_next_actions = next_Q_values.argmax(axis=1)
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    max_next_Q_values = (target.predict(next_states, verbose=0) * next_mask
                        ).sum(axis=1)
    #########################################################
    next_Q_values = target.predict(next_states, verbose=0)
    max_next_Q_values = next_Q_values.max(axis=1)
    # runs = 1.0 - (dones | truncateds)  # episode is not done or truncated
    target_Q_values = rewards + discount_factor * max_next_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model_action(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))

    grads = tape.gradient(loss, model_action.trainable_variables)
    optimizer.apply_gradients(zip(grads, model_action.trainable_variables))

In [53]:
sumoConfig3 = r"Traditional_traffic/traditional_traffic.sumo.cfg"

replay_buffer = deque(maxlen=10000)


sumoCmd = [sumo_bin, "-c",simulConfig,'--start','--no-warnings']
print(sumoCmd)
for episode in range(40):
    if traci.isLoaded():
        traci.close()
    traci.start(sumoCmd)
    lane_ids =  traci.lane.getIDList()
    # print(lane_ids[0])

    trafic_light_ids = traci.trafficlight.getIDList()

    # state = np.array(queue(lane_ids))
    state=np.array(get_state(lane_ids))
    action=-1
    # print(state)
    for step in range(130000): ## TO CHANGED
        epsilon = max(1 - episode+120 / 160, 0.01)

        if step%2000 == 0:
            #######################################################################
            next_state = np.array(get_state(lane_ids))
            reward = np.sum(state[:24])- np.sum(next_state[:24])
            replay_buffer.append((state, action, reward, next_state))
            #######################################################################

            state=next_state
            action = epsilon_greedy_policy(state, epsilon)
            traci.trafficlight.setPhase(trafic_light_ids[0],2*action)

            if len(replay_buffer) >= batch_size*10:
                training_step(batch_size)

        traci.simulationStep()

    if episode%5==0:
        target.set_weights(model_action.get_weights())
    print(f'episode : {episode}')
    traci.close()




 Retrying in 1 seconds
Step #307.00 (1ms ~= 10.00*RT, ~72000.00UPS, TraCI: 0ms, vehicl72 BUF 0)                  

KeyboardInterrupt: 

Interrupt signal received, trying to exit gracefully.T 492 ACT 103 BUF 0)                 s, vehicles TOT 432 ACT 74 BUF 0)                  


Another interrupt signal received, hard exit.


In [None]:
# model_action.save_weights("model_3DQN.weights.h5")
# model_action.save("full_model.keras")

In [None]:
# model_action = tf.keras.models.load_model("full_model.keras")

TypeError: Could not locate class 'DuelingDQN'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'DuelingDQN', 'config': {'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}, 'registered_name': 'DuelingDQN', 'build_config': {'input_shape': [1, 48]}}

In [None]:
sumoCmd = [sumo_bin, "-c",simulConfig,"--start", "--no-warnings"]
os.environ["SUMO_HOME"] = sumo_bin
if traci.isLoaded():
    traci.close()
traci.start(sumoCmd)
lane_ids =  traci.lane.getIDList()
trafic_light_ids = traci.trafficlight.getIDList()
print(lane_ids)
print([lane_id for lane_id in lane_ids if lane_id[0]!=":"])
state = np.array(get_state(lane_ids))
action=1
# print(state)
alpha=0.1
for step in range(130000): ## TO CHANGED
    nom_du_feu= traci.trafficlight.getIDList()[0]
    print()
    if step%2000 == 0:
        state=np.array(get_state(lane_ids))
        action = epsilon_greedy_policy(state,0)*2
        traci.trafficlight.setPhase(trafic_light_ids[0],action)
    traci.simulationStep()

traci.close()
traci.simulationStep()
traci.simulationStep()
traci.simulationStep()
traci.simulationStep()
traci.simulationStep()
traci.simulationStep()
if traci.isLoaded():
   traci.close()
os.system("pkill sumo")


 Retrying in 1 seconds




Step #0.06 (0ms ?*RT. ?UPS, TraCI: 0ms, vehicles TOT 4 ACT 4 BUF 20)                      


256

'/Users/arseneclaustre/sumo/bin/sumo'