# Création du labyrinthe et des paramètres associés

In [1]:
import numpy as np
import random

# Free path = 0, Wall = 1

maze = [
    [0,1,0,0,0],
    [0,1,0,1,1],
    [0,0,0,1,0],
    [1,1,0,0,0],
    [0,0,0,1,0]
]

n_rows = len(maze)
n_cols = len(maze[0])

# Every row must have the same width, otherwise n_cols is meaningless.
assert all(len(row) == n_cols for row in maze), "maze must be rectangular"

# Reproducibility: exploration relies on the stdlib `random` module, so seed
# it once here so that Restart & Run All yields the same training run.
SEED = 42
random.seed(SEED)

# State definition
# Each state is a (row, column) position of a free cell
state_space = [(i, j) for i in range(n_rows) for j in range(n_cols) if maze[i][j] == 0]

# Action definition
actions = ["haut", "bas", "gauche", "droite"]
action_space = list(range(len(actions))) # 0 = up, 1 = down, 2 = left, 3 = right

# Q-Learning parameters
alpha = 0.8             # Learning rate
gamma = 0.95            # Discount factor
epsilon = 1.0           # Initial exploration rate
epsilon_min = 0.01      # Lower bound for the exploration rate
epsilon_decay = 0.995   # Multiplicative decay applied to epsilon after each episode
num_episodes = 500

# Backward-compatible aliases: the original notebook used these misspelled
# names, and other cells may still reference them.
epislon_min = epsilon_min
epislon_decay = epsilon_decay

# Q-table initialisation: one zero vector (one entry per action) per free cell
Q = {cell: np.zeros(len(actions)) for cell in state_space}

# Start and goal states
start_state=(0,0)
goal_state=(4,4)

# Both endpoints must be free cells, otherwise they have no Q-table entry.
assert start_state in Q, "start_state must be a free cell of the maze"
assert goal_state in Q, "goal_state must be a free cell of the maze"

### Définition des fonctions d'état

In [2]:
def choose_action(state):
    """Select an action for `state` using an epsilon-greedy rule.

    A uniform draw in [0, 1] is compared with the module-level `epsilon`:
    below it, a random element of `action_space` is returned (exploration);
    otherwise the index of the largest entry of `Q[state]` is returned
    (exploitation).
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice(action_space)
    return np.argmax(Q[state])
    
def is_valid(state):
    """Return True when `state` lies inside the maze and is a free cell.

    `state` is a (row, column) pair; the bounds come from the module-level
    `n_rows` / `n_cols`, and a cell is free when `maze[row][col] == 0`.
    """
    row, col = state
    # Bounds are checked first so the maze is never indexed out of range
    # (or with a negative index, which Python would silently wrap around).
    if not (0 <= row < n_rows):
        return False
    if not (0 <= col < n_cols):
        return False
    return maze[row][col] == 0

def next_state(state, action):
    """Return the state reached by taking `action` from `state`.

    Action codes: 0 = up (row - 1), 1 = down (row + 1),
    2 = left (column - 1), 3 = right (column + 1). Any other code leaves
    the position unchanged. If the resulting cell is rejected by
    `is_valid`, the agent stays where it is and `state` is returned.
    """
    row, col = state
    # (action code, row offset, column offset)
    moves = (
        (0, -1, 0),
        (1, 1, 0),
        (2, 0, -1),
        (3, 0, 1),
    )
    for code, d_row, d_col in moves:
        if action == code:
            row += d_row
            col += d_col
            break

    candidate = (row, col)
    return candidate if is_valid(candidate) else state

### Définition des fonctions dites "de récompense"

In [3]:
def get_reward(state):
    """Return the reward for arriving in `state`.

    Reaching the module-level `goal_state` yields 100; every other state
    yields -1.
    """
    return 100 if state == goal_state else -1

### Simulation

In [4]:
# Q-learning training loop.
# Each episode starts at start_state and runs until the goal is reached or
# the step cap is hit. Q and epsilon are module-level and updated in place.
max_steps_per_episode = 100  # cap so a single episode cannot run forever

for episode in range(num_episodes):
    state = start_state
    step = 0
    while state != goal_state and step < max_steps_per_episode:
        action = choose_action(state)
        new_state = next_state(state, action)
        reward = get_reward(new_state)

        # Q update (temporal-difference rule):
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # The discount factor must MULTIPLY the best next-state value;
        # adding it (as the previous version did) is not the Q-learning update.
        td_target = reward + gamma * np.max(Q[new_state])
        Q[state][action] += alpha * (td_target - Q[state][action])

        state = new_state
        step += 1

    # Epsilon decay: multiplicative, floored at epislon_min. The previous
    # `epsilon += epislon_decay` made epsilon grow, so the agent never
    # stopped exploring at random.
    epsilon = max(epislon_min, epsilon * epislon_decay)

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{num_episodes} complete")

print("Entrainement terminé.")

# Extraction of the greedy path: from start_state, repeatedly follow the
# action with the highest Q-value until the goal is reached.
max_path_length = 50  # guard against a greedy policy that loops forever

state = start_state
path = [state]
while state != goal_state and len(path) <= max_path_length:
    action = int(np.argmax(Q[state]))
    state = next_state(state, action)
    path.append(state)

# Report success only if the goal was actually reached; the previous version
# printed "Chemin trouvé" even after giving up on an over-long path.
if state == goal_state:
    print("Chemin trouvé")
else:
    print("Chemin trop long, arrêt de la recherche")
print(path)


Episode 100/500 complete
Episode 200/500 complete
Episode 300/500 complete
Episode 400/500 complete
Episode 500/500 complete
Entrainement terminé.
Chemin trouvé
[(0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (3, 2), (3, 3), (3, 4), (4, 4)]
