<p style="color: yellow; font-size: 30px;">
 Exercice 1 : Initialisation de l’Environnement 🚖
</p>

In [1]:
import gymnasium as gym
import numpy as np

env = gym.make("Taxi-v3")

state_size = env.observation_space.n
action_size = env.action_space.n

policy_table = np.ones((state_size, action_size)) / action_size
value_table = np.zeros(state_size)

print("Premières lignes de policy_table:")
print(policy_table[:5])

print("\nPremières valeurs de value_table:")
print(value_table[:5])


Premières lignes de policy_table:
[[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]]

Premières valeurs de value_table:
[0. 0. 0. 0. 0.]


<p style="color: yellow; font-size: 30px;">
 Exercice 2 : Collecte d’épisodes 📂
</p>

In [2]:
import gymnasium as gym

env = gym.make("Taxi-v3")

for episode in range(20):
    state, _ = env.reset()
    done = False
    step = 0
    print(f"\nÉpisode {episode + 1}:")

    while not done:
        action = env.action_space.sample()  # Action aléatoire
        next_state, reward, done, _, _ = env.step(action)
        print(f"Étape {step}: Action = {action}, Récompense = {reward}")
        step += 1



Épisode 1:
Étape 0: Action = 3, Récompense = -1
Étape 1: Action = 4, Récompense = -10
Étape 2: Action = 2, Récompense = -1
Étape 3: Action = 2, Récompense = -1
Étape 4: Action = 0, Récompense = -1
Étape 5: Action = 0, Récompense = -1
Étape 6: Action = 0, Récompense = -1
Étape 7: Action = 3, Récompense = -1
Étape 8: Action = 0, Récompense = -1
Étape 9: Action = 2, Récompense = -1
Étape 10: Action = 0, Récompense = -1
Étape 11: Action = 3, Récompense = -1
Étape 12: Action = 3, Récompense = -1
Étape 13: Action = 5, Récompense = -10
Étape 14: Action = 0, Récompense = -1
Étape 15: Action = 5, Récompense = -10
Étape 16: Action = 0, Récompense = -1
Étape 17: Action = 1, Récompense = -1
Étape 18: Action = 1, Récompense = -1
Étape 19: Action = 3, Récompense = -1
Étape 20: Action = 4, Récompense = -10
Étape 21: Action = 3, Récompense = -1
Étape 22: Action = 0, Récompense = -1
Étape 23: Action = 2, Récompense = -1
Étape 24: Action = 0, Récompense = -1
Étape 25: Action = 0, Récompense = -1
Étape 

<p style="color: yellow; font-size: 30px;">
 Exercice 3 : Mise à jour PPO 🔄
</p>

In [None]:
import gymnasium as gym
import numpy as np

env = gym.make("Taxi-v3")

gamma = 0.99
lr_policy = 0.1
lr_value = 0.1
clip_epsilon = 0.2
n_episodes = 20

state_size = env.observation_space.n
action_size = env.action_space.n

policy_table = np.ones((state_size, action_size)) / action_size
value_table = np.zeros(state_size)

def compute_discounted_rewards(rewards):
    discounted_rewards = []
    running_add = 0
    for r in reversed(rewards):
        running_add = r + gamma * running_add
        discounted_rewards.insert(0, running_add)
    return discounted_rewards

def update_policy_ppo(states, actions, advantages, old_probs):
    for state, action, advantage, old_prob in zip(states, actions, advantages, old_probs):
        current_prob = policy_table[state, action]
        ratio = current_prob / old_prob
        clipped_ratio = np.clip(ratio, 1 - clip_epsilon, 1 + clip_epsilon)
        policy_loss = -np.minimum(ratio * advantage, clipped_ratio * advantage)
        policy_table[state, action] -= lr_policy * policy_loss
        policy_table[state] = np.clip(policy_table[state], 1e-8, 1)
        policy_table[state] /= policy_table[state].sum()

for episode in range(n_episodes):
    state, _ = env.reset()
    done = False
    episode_states = []
    episode_actions = []
    episode_rewards = []
    episode_old_probs = []
    
    while not done:
        action_probs = policy_table[state]
        action = np.random.choice(action_size, p=action_probs)
        old_prob = action_probs[action]
        episode_old_probs.append(old_prob)
        next_state, reward, done, _, _ = env.step(action)
        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)
        state = next_state
    
    discounted_rewards = compute_discounted_rewards(episode_rewards)
    
    advantages = []
    for state, reward in zip(episode_states, discounted_rewards):
        advantage = reward - value_table[state]
        advantages.append(advantage)
        value_table[state] += lr_value * advantage
    
    update_policy_ppo(episode_states, episode_actions, advantages, episode_old_probs)
    
    total_reward = sum(episode_rewards)
    print(f"Épisode {episode + 1}, Récompense totale: {total_reward}")

env.close()

Épisode 1, Récompense totale: -1974
Épisode 2, Récompense totale: -2849
Épisode 3, Récompense totale: -4008
Épisode 4, Récompense totale: -4136


<p style="color: yellow; font-size: 30px;">
 Exercice 4 : Évaluation de l’Agent 📊
</p>

In [None]:
import gymnasium as gym
import numpy as np

env = gym.make("Taxi-v3")
num_eval_episodes = 20

def evaluate_agent(policy_table, num_episodes):
    total_rewards = []
    for ep in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = np.argmax(policy_table[state])
            next_state, reward, done, _, _ = env.step(action)
            total_reward += reward
            state = next_state
        total_rewards.append(total_reward)
    return total_rewards

random_policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
random_rewards = evaluate_agent(random_policy, num_eval_episodes)
print("Avant entraînement:")
print(f"Récompenses: {random_rewards}")
print(f"Moyenne: {np.mean(random_rewards):.2f}")
print(f"Écart-type: {np.std(random_rewards):.2f}\n")

trained_rewards = evaluate_agent(policy_table, num_eval_episodes)
print("Après entraînement:")
print(f"Récompenses: {trained_rewards}")
print(f"Moyenne: {np.mean(trained_rewards):.2f}")
print(f"Écart-type: {np.std(trained_rewards):.2f}")

env.close()