In [1]:
import time
from collections import deque, namedtuple
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers.legacy import Adam
from force3env1 import Force3Env

In [2]:
# Seed handed to the random number generator setup further down.
SEED = 0
# Number of experiences drawn from the replay buffer for one learning step.
MINIBATCH_SIZE = 64
# Soft-update factor: weight given to the online network when the target
# network's weights are blended in update_target_network.
TAU = 1e-3
# Multiplicative decay applied to epsilon by get_new_eps.
E_DECAY = 0.99
# Lower bound below which epsilon is never decayed.
E_MIN = 0.01

In [3]:
def get_experiences(memory_buffer):
    """Draw a random minibatch from the replay buffer and return it as tensors.

    Args:
        memory_buffer: Sequence of experience records exposing the attributes
            ``state``, ``action``, ``reward``, ``next_state`` and ``done``.

    Returns:
        Tuple ``(states, actions, rewards, next_states, done_vals)`` of float32
        tensors built from the MINIBATCH_SIZE sampled records (``None`` entries
        are skipped). ``done`` flags go through uint8 before the float cast.
    """
    batch = [
        record
        for record in random.sample(memory_buffer, k=MINIBATCH_SIZE)
        if record is not None
    ]

    def _column(field, via_dtype=None):
        # Stack one attribute of every sampled record into a float32 tensor.
        values = np.array([getattr(record, field) for record in batch])
        if via_dtype is not None:
            values = values.astype(via_dtype)
        return tf.convert_to_tensor(values, dtype=tf.float32)

    return (
        _column("state"),
        _column("action"),
        _column("reward"),
        _column("next_state"),
        _column("done", np.uint8),
    )

In [4]:
def get_new_eps(epsilon):
    """Return the next exploration rate: epsilon decayed by E_DECAY, floored at E_MIN."""
    decayed = epsilon * E_DECAY
    return max(E_MIN, decayed)

In [5]:
def update_target_network(q_network, target_q_network):
    """Soft-update the target network towards the online network.

    Every target weight is overwritten in place with
    ``TAU * online + (1 - TAU) * target``; the online network is not modified.

    Args:
        q_network: Network whose weights are blended in.
        target_q_network: Network whose weights are updated via ``assign``.
    """
    weight_pairs = zip(target_q_network.weights, q_network.weights)
    for target_var, online_var in weight_pairs:
        blended = TAU * online_var + (1.0 - TAU) * target_var
        target_var.assign(blended)

In [7]:
def check_update_conditions(t, num_steps_upd, memory_buffer):
    """Tell whether a learning step should run at time step ``t``.

    Args:
        t: Zero-based index of the current step within the episode.
        num_steps_upd: Learn only on every ``num_steps_upd``-th step.
        memory_buffer: Replay buffer; it must hold more than MINIBATCH_SIZE
            experiences before learning starts.

    Returns:
        True when both conditions hold, False otherwise.
    """
    on_update_step = (t + 1) % num_steps_upd == 0
    if not on_update_step:
        return False
    return len(memory_buffer) > MINIBATCH_SIZE

In [15]:
# Seed every random number generator this notebook relies on so that a
# Restart & Run All is reproducible:
#   * TensorFlow,
#   * Python's `random`, used for replay sampling (get_experiences) and for
#     epsilon-greedy exploration (get_action),
#   * NumPy's global generator (not drawn from in the cells shown here; seeded
#     in case Force3Env uses it — TODO confirm).
tf.random.set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

ALPHA = 1e-3  # learning rate handed to the Adam optimizer
MEMORY_SIZE = 100_000  # maximum number of experiences kept in the replay buffer
GAMMA = 0.99  # discount factor used when building the TD targets
NUM_STEPS_FOR_UPDATE = 2  # run a learning step every this many environment steps

In [16]:
# Instantiate the environment (Force3Env is imported from the project-local
# module force3env1).
env = Force3Env()

# Shape tuple of a single observation; used as the networks' input_shape.
state_size = env.observation_space.shape
# Number of entries in env.valid_actions; the networks emit one Q-value per entry.
num_actions = len(env.valid_actions)


print('State Shape:', state_size)
print('Number of actions:', num_actions)

State Shape: (9,)
Number of actions: 251


In [18]:
def _build_q_network(input_shape, n_outputs):
    """Return a new, independently initialised MLP that outputs one Q-value per action.

    Args:
        input_shape: Shape tuple of a single observation.
        n_outputs: Number of actions, i.e. the width of the linear output layer.
    """
    return Sequential([
        Dense(128, activation='relu', input_shape=input_shape),
        Dense(256, activation='relu'),
        Dense(512, activation='relu'),
        Dense(n_outputs, activation='linear'),
    ])


# Both networks come from the same builder so their layer layouts cannot drift
# apart: copying weights with set_weights and the per-weight soft update both
# require the two models to have identically shaped weights.
q_network = _build_q_network(state_size, num_actions)
target_q_network = _build_q_network(state_size, num_actions)

optimizer = Adam(learning_rate=ALPHA)

In [19]:
# Record type for one transition stored in the replay buffer.
experience = namedtuple(
    "Experience",
    ("state", "action", "reward", "next_state", "done"),
)

In [21]:
def compute_loss(experiences, gamma, q_network, target_q_network):
    """Compute the mean-squared TD error of the Q-network on a minibatch.

    Args:
        experiences: Tuple ``(states, actions, rewards, next_states, done_vals)``
            of tensors, as returned by ``get_experiences``. ``actions`` is split
            into four columns along axis 1.
        gamma: Discount factor applied to the bootstrapped next-state value.
        q_network: Network being trained; maps states to one Q-value per action.
        target_q_network: Network used to evaluate the next states.

    Returns:
        The MSE between the TD targets and the Q-values of the actions taken.
    """
    states, actions, rewards, next_states, done_vals = experiences

    # The fourth action component is split off but not used by the encoding below.
    action_types, start_pos, target_pos, _ = tf.split(actions, num_or_size_splits=4, axis=1)

    # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
    # dropped for terminal transitions (done_vals == 1).
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    y_targets = rewards + (1 - done_vals) * gamma * max_qsa

    # Turn the action components into a single output index.
    #   type 0 (place):             target_pos                         -> 0..8
    #   type 1 (move a round pawn): 9 + start_pos * N + target_pos     -> 9..89
    #   type 2 (move a square):     90 + start_pos * N + target_pos * 2
    # Rows whose type is none of 0/1/2 keep the raw type value as their index.
    #
    # NOTE(review): get_action maps network output i to env.valid_actions[i], so
    # the index computed here has to equal the position of the taken action in
    # that list. This encoding never reads the fourth action component and tops
    # out at 90 + 8*9 + 8*2 = 178, whereas the original comments describe type-2
    # indices as spanning 90..251 — confirm it against Force3Env's action list.
    N = 9  # number of possible values for start_pos and target_pos
    type1_offset = N          # 9
    type2_offset = N + N * N  # 90
    action_indices = tf.where(action_types == 0, target_pos, action_types)
    action_indices = tf.where(
        action_types == 1, type1_offset + start_pos * N + target_pos, action_indices
    )
    action_indices = tf.where(
        action_types == 2, type2_offset + start_pos * N + target_pos * 2, action_indices
    )
    # Drop the column axis left over from tf.split and make the values usable as indices.
    action_indices = tf.cast(tf.squeeze(action_indices, axis=-1), tf.int32)

    q_values = q_network(states)
    # tf.shape gives the runtime batch size, so this also works when the static
    # batch dimension is unknown while tracing.
    batch_indices = tf.range(tf.shape(q_values)[0])
    q_values = tf.gather_nd(q_values, tf.stack([batch_indices, action_indices], axis=1))

    # Keras losses take (y_true, y_pred).
    loss = MSE(y_targets, q_values)

    return loss

In [22]:
@tf.function
def agent_learn(experiences, gamma):
    """Run one learning step on a minibatch.

    Computes the loss under a gradient tape, applies the resulting gradients to
    the online Q-network with the module-level optimizer, then soft-updates the
    target network.

    Args:
        experiences: Minibatch tuple as returned by ``get_experiences``.
        gamma: Discount factor forwarded to ``compute_loss``.
    """
    with tf.GradientTape() as grad_tape:
        td_loss = compute_loss(experiences, gamma, q_network, target_q_network)

    trainable = q_network.trainable_variables
    grads = grad_tape.gradient(td_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # Move the target network a small step towards the freshly updated weights.
    update_target_network(q_network, target_q_network)

In [23]:
def get_action(q_values, epsilon=0.0):
    """Choose an action with an epsilon-greedy policy.

    Args:
        q_values: Network output for the current state; must expose ``.numpy()``.
        epsilon: Exploration rate compared against a uniform draw from [0, 1).

    Returns:
        An element of ``env.valid_actions``: the entry at the argmax of
        ``q_values`` when exploiting, a uniformly random entry when exploring.
    """
    if random.random() > epsilon:
        # Exploit: take the action with the highest predicted Q-value.
        greedy_index = np.argmax(q_values.numpy())
        return env.valid_actions[greedy_index]
    # Explore: take a random action.
    return random.choice(env.valid_actions)

In [24]:
# Wall-clock start, used for the runtime report printed after training.
start = time.time()

num_episodes = 1000  # upper bound on the number of training episodes
max_num_timesteps = 100  # upper bound on the number of steps per episode

# Sum of rewards collected in each finished episode.
total_point_history = []

num_p_av = 100    # window, in episodes, of the running average that is reported
epsilon = 1.0     # initial exploration rate of the epsilon-greedy policy


# Replay buffer; bounded to MEMORY_SIZE entries by the deque's maxlen.
memory_buffer = deque(maxlen=MEMORY_SIZE)


# Start the target network from the same weights as the online network.
target_q_network.set_weights(q_network.get_weights())

for i in range(num_episodes):
    
    # Begin a new episode.
    state = env.reset()
    total_points = 0
    
    for t in range(max_num_timesteps):
        
        # Add a leading batch axis, query the online network and pick an
        # action epsilon-greedily.
        state_qn = np.expand_dims(state, axis=0)  
        q_values = q_network(state_qn)
        action = get_action(q_values, epsilon)
        
        # Apply the action; the fourth value returned by step is ignored.
        next_state, reward, done, _ = env.step(action)
        
        # Store the transition for later replay.
        memory_buffer.append(experience(state, action, reward, next_state, done))
        
        # Learn only on every NUM_STEPS_FOR_UPDATE-th step and once the buffer
        # holds more than MINIBATCH_SIZE experiences.
        update = check_update_conditions(t, NUM_STEPS_FOR_UPDATE, memory_buffer)
        
        if update:
            
            # Sample a random minibatch from the replay buffer.
            experiences = get_experiences(memory_buffer)
            
            # One gradient step on the online network followed by a soft
            # update of the target network.
            agent_learn(experiences, GAMMA)
        
        state = next_state.copy()
        total_points += reward
        
        if done:
            break
            
    total_point_history.append(total_points)
    # Mean episode return over the last num_p_av episodes (fewer early on).
    av_latest_points = np.mean(total_point_history[-num_p_av:])
    
    # Epsilon is decayed once per episode, not once per step.
    epsilon = get_new_eps(epsilon)

    # Carriage return without newline: keeps rewriting the same console line.
    print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")

    # Every num_p_av episodes print with a newline so that line stays visible.
    if (i+1) % num_p_av == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")

    # NOTE(review): 200.0 is a hard-coded "solved" threshold — confirm it fits
    # Force3Env's reward scale. The model is saved only on this path, so a run
    # that never reaches the threshold ends without writing force3.h5.
    if av_latest_points >= 200.0:
        print(f"\n\nEnvironment solved in {i+1} episodes!")
        q_network.save('force3.h5')
        break
        
tot_time = time.time() - start

print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")

Episode 100 | Total point average of the last 100 episodes: -1792.80
Episode 200 | Total point average of the last 100 episodes: -1766.40
Episode 300 | Total point average of the last 100 episodes: -1776.40
Episode 400 | Total point average of the last 100 episodes: -1897.00
Episode 500 | Total point average of the last 100 episodes: -1940.80
Episode 600 | Total point average of the last 100 episodes: -1942.60
Episode 700 | Total point average of the last 100 episodes: -1940.00
Episode 800 | Total point average of the last 100 episodes: -1938.40
Episode 900 | Total point average of the last 100 episodes: -1938.40
Episode 1000 | Total point average of the last 100 episodes: -1938.40

Total Runtime: 246.49 s (4.11 min)
