In [1]:
import numpy as np 
# Grid-world layout: each entry is the raw reward stored at that (row, column).
# 0 is a neutral cell, -10 a losing cell and 10 the winning cell; the training
# loop ends an episode as soon as the agent lands on a -10 or 10 cell.
env = [
    [0, 0, 0, 0, 0],
    [0, -10, 0, -10, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 10, 0, 0],
    [0, 0, 0, 0, 0]
]

In [2]:
# Grid dimensions; the column count is taken from the first row of the layout.
rows, cols = len(env), len(env[0])

In [3]:
# Learned value estimate for every (row, column) cell, all starting at 0.
rewards = {(r, c): 0 for r in range(rows) for c in range(cols)}

def get_reward(state):
    """Return the raw reward stored in the grid at ``state``.

    ``state`` is a ``(row, column)`` pair used to index ``env``.
    """
    row, col = state
    grid_row = env[row]
    return grid_row[col]
# Per-cell visit counter, initialised to zero for every (row, column) position.
visit_count = dict.fromkeys(((r, c) for r in range(rows) for c in range(cols)), 0)


In [4]:
def take_action(state, action):
    """Return the state reached by applying ``action`` from ``state``.

    Moves are clamped to the grid: stepping into a wall leaves the agent on
    the cell it already occupies.

    Args:
        state: ``(row, column)`` position of the agent.
        action: one of ``'UP'``, ``'DOWN'``, ``'LEFT'`` or ``'RIGHT'``.

    Returns:
        The new ``(row, column)`` position as a tuple.

    Raises:
        ValueError: if ``action`` is not one of the four known moves.
    """
    i, j = state
    if action == 'UP':
        return (max(i - 1, 0), j)
    if action == 'DOWN':
        return (min(i + 1, rows - 1), j)
    if action == 'LEFT':
        return (i, max(j - 1, 0))
    if action == 'RIGHT':
        return (i, min(j + 1, cols - 1))
    # An unknown action used to fall through every branch and fail with an
    # opaque UnboundLocalError; fail loudly with the offending value instead.
    raise ValueError(
        f"Unknown action {action!r}; expected 'UP', 'DOWN', 'LEFT' or 'RIGHT'."
    )

In [5]:
def choose_action(state, epsilon=0.1):
    """Pick an action with an epsilon-greedy rule.

    With probability ``epsilon`` a random action is returned (exploration);
    otherwise the action whose resulting state currently has the highest
    value in ``rewards`` is returned (exploitation). A large ``epsilon``
    favours exploration, a small one favours exploitation. Ties keep the
    earliest action in the order UP, DOWN, LEFT, RIGHT.
    """
    possible_actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
    explore = np.random.random() < epsilon
    if explore:
        # Exploration: pick any action at random.
        return np.random.choice(possible_actions)

    # Exploitation: keep the first action leading to the best-valued state.
    chosen, chosen_value = None, -float('inf')
    for candidate in possible_actions:
        value = rewards[take_action(state, candidate)]
        if value > chosen_value:
            chosen, chosen_value = candidate, value
    return chosen


In [6]:
def update_reward(state, reward, alpha=0.1):
    """Record a visit to ``state`` and move its value estimate toward ``reward``.

    The estimate in ``rewards`` is shifted by a fraction ``alpha`` of the gap
    between ``reward`` and the current estimate; ``visit_count[state]`` is
    incremented by one.
    """
    visit_count[state] = visit_count[state] + 1
    current = rewards[state]
    rewards[state] = current + alpha * (reward - current)

In [7]:
# Training hyper-parameters.
episodes = 100
max_steps_per_episode = 50
epsilon = 1.0  # Initial exploration rate
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each episode
min_epsilon = 0.01  # Lower bound on the exploration rate

# Training: run epsilon-greedy episodes and update the value of each cell reached.
for episode in range(episodes):
    state = (0, 0)  # Every episode starts in the top-left corner
    steps = 0

    while steps < max_steps_per_episode:
        action = choose_action(state, epsilon)
        new_state = take_action(state, action)
        reward = get_reward(new_state)

        if reward == 0:
            reward = -1  # Step penalty for landing on a neutral cell

        # update_reward already increments visit_count[new_state]; the extra
        # increment that used to follow this call counted every visit twice.
        update_reward(new_state, reward)

        state = new_state
        steps += 1

        # Landing on a -10 or 10 cell ends the episode.
        if env[state[0]][state[1]] in [-10, 10]:
            print(f"Episode {episode} finished with reward {reward}.")
            break
    else:
        # Step budget exhausted without reaching a terminal cell.
        print(f"Episode {episode} stopped: too many steps.")

    # Decay the exploration rate, never going below min_epsilon.
    epsilon = max(epsilon * epsilon_decay, min_epsilon)

Episode 0 finished with reward -10.
Episode 1 finished with reward -10.
Episode 2 finished with reward -10.
Episode 3 finished with reward -10.
Episode 4 finished with reward 10.
Episode 5 finished with reward -10.
Episode 6 finished with reward -10.
Episode 7 finished with reward -10.
Episode 8 finished with reward 10.
Episode 9 finished with reward 10.
Episode 10 finished with reward -10.
Episode 11 finished with reward -10.
Episode 12 finished with reward -10.
Episode 13 finished with reward 10.
Episode 14 finished with reward -10.
Episode 15 finished with reward -10.
Episode 16 finished with reward -10.
Episode 17 finished with reward -10.
Episode 18 finished with reward 10.
Episode 19 finished with reward -10.
Episode 20 finished with reward 10.
Episode 21 finished with reward -10.
Episode 22 finished with reward 10.
Episode 23 finished with reward -10.
Episode 24 finished with reward -10.
Episode 25 finished with reward -10.
Episode 26 finished with reward -10.
Episode 27 finishe

In [8]:
# Report the learned value of every cell, flagging cells whose visit count is zero.
print("\nRécompenses moyennes pour chaque état :")
for i in range(rows):
    for j in range(cols):
        val = rewards[(i, j)]
        never_seen = visit_count[(i, j)] == 0
        print(
            f"État ({i}, {j}) : Jamais visité"
            if never_seen
            else f"État ({i}, {j}) : {val:.2f}"
        )



Récompenses moyennes pour chaque état :
État (0, 0) : -1.00
État (0, 1) : -1.00
État (0, 2) : -0.99
État (0, 3) : -0.94
État (0, 4) : -0.94
État (1, 0) : -1.00
État (1, 1) : -9.94
État (1, 2) : -0.94
État (1, 3) : -7.71
État (1, 4) : -0.75
État (2, 0) : -1.00
État (2, 1) : -0.97
État (2, 2) : -0.92
État (2, 3) : -0.52
État (2, 4) : -0.65
État (3, 0) : -0.97
État (3, 1) : -0.90
État (3, 2) : 9.82
État (3, 3) : -0.57
État (3, 4) : -0.65
État (4, 0) : -0.89
État (4, 1) : -0.88
État (4, 2) : -0.69
État (4, 3) : -0.52
État (4, 4) : -0.57


In [None]:
# Final roll-out: follow the learned greedy policy from the start cell.
state = (0, 0)
steps = 0
visited_states = set()

print("\nExploration finale :")
while True:
    print(f"Step {steps}: {state}")

    # The greedy policy is deterministic, so coming back to a state already
    # seen means the agent would cycle forever.
    if state in visited_states:
        print("Boucle infini.")
        break
    visited_states.add(state)

    # epsilon=0 disables exploration (np.random.random() is never below 0), so
    # this reuses choose_action's greedy branch instead of duplicating it here.
    best_action = choose_action(state, epsilon=0)

    state = take_action(state, best_action)
    steps += 1

    cell_value = env[state[0]][state[1]]
    if cell_value == -10:
        print("Loser.")
        break
    elif cell_value == 10:
        print("Winner!")
        break

    # Safety net in case the roll-out runs long without revisiting a state.
    if steps > 50:
        print("Boucle infini.")
        break


Exploration finale :
Step 0: (0, 0)
Step 1: (1, 0)
Step 2: (2, 0)
Step 3: (3, 0)
Step 4: (4, 0)
Step 5: (4, 1)
Step 6: (4, 2)
Winner!
