In [3]:
import numpy as np

Implémentation de l'environnement LineWorld

In [6]:
class LineWorld:
    """One-dimensional corridor environment.

    States are the integers ``0 .. length - 1``. Every episode starts in
    state 0 and terminates when the agent reaches the last state, which is
    the only transition that pays a reward (1); every other step pays 0.

    Actions: 0 moves one cell to the left, 1 moves one cell to the right.
    Moves are clamped to the corridor, and any other action value leaves
    the position unchanged.
    """

    def __init__(self, length=5):
        """Create a corridor with ``length`` cells and place the agent at the start."""
        self.length = length
        self.start_state = 0
        self.end_state = length - 1
        self.reset()

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``info`` is always an empty dict.
        """
        # 0 = left, 1 = right
        if action == 0:
            self.state = max(self.state - 1, 0)
        elif action == 1:
            self.state = min(self.state + 1, self.length - 1)

        reached_goal = self.state == self.end_state
        reward = 1 if reached_goal else 0
        return self.state, reward, reached_goal, {}

    def get_valid_actions(self):
        """Return the list of action ids accepted by :meth:`step`."""
        return [0, 1]


Lancement des agents de test sur l'environnement LineWorld :

Agent Random

In [15]:
import random

def run_random_agent(env, num_episodes=1000):
    """Run a uniformly random policy on ``env`` and print summary statistics.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``get_valid_actions()`` and a
            ``length`` attribute (used only in the printed header).
        num_episodes: Number of episodes to play. With 0 (or a negative
            value) no episode is played and both averages are reported
            as 0.00 instead of raising ``ZeroDivisionError``.

    Returns:
        None. The average reward and average episode length are printed.
    """
    total_rewards = []
    total_steps = []

    for _ in range(num_episodes):
        env.reset()
        done = False
        episode_reward = 0
        steps = 0

        while not done:
            action = random.choice(env.get_valid_actions())
            # The random policy ignores the observation, so only reward/done are kept.
            _, reward, done, _ = env.step(action)
            episode_reward += reward
            steps += 1

        total_rewards.append(episode_reward)
        total_steps.append(steps)

    # Divide by the number of episodes actually played so an empty run is safe.
    episodes_played = len(total_rewards)
    avg_reward = sum(total_rewards) / episodes_played if episodes_played else 0.0
    avg_steps = sum(total_steps) / episodes_played if episodes_played else 0.0
    print(f"Random Agent on LineWorld ({env.length} states)")
    print(f"Average reward: {avg_reward:.2f}")
    print(f"Average episode length: {avg_steps:.2f} steps")

# Run the random baseline on a 5-state LineWorld.
# `env` is bound at module level here, so it stays available after this cell runs.
if __name__ == "__main__":
    env = LineWorld(length=5)
    run_random_agent(env)


Random Agent on LineWorld (5 states)
Average reward: 1.00
Average episode length: 20.09 steps


Agent TabularQLearning

In [None]:
import random
from collections import defaultdict

class TabularQLearningAgent:
    """Epsilon-greedy Q-learning agent backed by a lazily filled Q-table.

    The table maps each (hashable) state to a NumPy vector holding one
    Q-value per action; unseen states start at all zeros.

    Args:
        n_states: Accepted for interface symmetry; the table grows on
            demand, so this value is not used.
        n_actions: Number of discrete actions.
        alpha: Learning rate of the temporal-difference update.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.
    """

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = defaultdict(lambda: np.zeros(n_actions))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_actions = n_actions

    def select_action(self, state):
        """Return an action id (plain ``int``) chosen epsilon-greedily for ``state``."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        # np.argmax yields np.int64; cast so both branches return a Python int.
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward`` on terminal transitions, otherwise
        ``reward + gamma * max_a Q(next_state, a)``.
        """
        q_predict = self.q_table[state][action]
        if done:
            q_target = reward
        else:
            q_target = reward + self.gamma * np.max(self.q_table[next_state])
        self.q_table[state][action] += self.alpha * (q_target - q_predict)

Agent DeepQLearning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class DQNNetwork(nn.Module):
    """Fully connected Q-network: ``input_dim -> 64 -> 64 -> output_dim``.

    A ReLU follows each of the two hidden layers; the output layer is
    linear and produces one value per action.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values computed for the input ``x``."""
        q_values = self.net(x)
        return q_values

class DQNAgent:
    """Online deep Q-learning agent (single network, no replay buffer).

    Every call to :meth:`learn` performs one gradient step on the squared
    TD error of a single transition and then decays epsilon.

    Args:
        state_dim: Size of the flattened state vector fed to the network.
        n_actions: Number of discrete actions.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied after each update.
        epsilon_min: Lower bound that the decay never crosses.
        lr: Adam learning rate.
    """

    def __init__(self, state_dim, n_actions, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, lr=1e-3):
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.model = DQNNetwork(state_dim, n_actions)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    @staticmethod
    def _as_batch(state):
        """Convert a scalar or vector state into a float32 tensor of shape (1, features)."""
        return torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)

    def select_action(self, state):
        """Return an action id (``int``) chosen epsilon-greedily for ``state``."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        with torch.no_grad():
            q_values = self.model(self._as_batch(state))
            return torch.argmax(q_values).item()

    def learn(self, state, action, reward, next_state, done):
        """Run one TD(0) gradient step on the transition and decay epsilon.

        The target is ``reward`` for terminal transitions, otherwise
        ``reward + gamma * max_a Q(next_state, a)``; no gradient flows
        through the target.
        """
        q_value = self.model(self._as_batch(state))[0, action]

        with torch.no_grad():
            # Force float32 so integer rewards do not yield an int64 target,
            # which MSELoss cannot combine with the float prediction.
            target = torch.tensor(float(reward), dtype=torch.float32)
            if not done:
                next_q_values = self.model(self._as_batch(next_state))
                target = target + self.gamma * next_q_values.max()

        loss = self.criterion(q_value, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


Agent DoubleDeepQLearning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class DoubleDQNNetwork(nn.Module):
    """Multilayer perceptron producing one Q-value per action.

    Two hidden layers of 64 units, each followed by a ReLU, then a linear
    output layer of size ``output_dim``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (input_dim, 64, 64)
        modules = []
        # Hidden stack: Linear + ReLU for each consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        modules.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return the Q-values computed for the input ``x``."""
        return self.net(x)

class DoubleDQNAgent:
    """Online Double DQN agent (policy + target network, no replay buffer).

    The policy network selects the next action and the target network
    evaluates it. The target network is synchronised with the policy
    network every ``target_update_freq`` calls to :meth:`learn`.

    Args:
        state_dim: Size of the flattened state vector fed to the networks.
        n_actions: Number of discrete actions.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied after each update.
        epsilon_min: Lower bound that the decay never crosses.
        lr: Adam learning rate.
        target_update_freq: Number of updates between target-network syncs.

    Raises:
        ValueError: If ``target_update_freq`` is smaller than 1.
    """

    def __init__(self, state_dim, n_actions, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, lr=1e-3, target_update_freq=10):
        if target_update_freq < 1:
            raise ValueError("target_update_freq must be >= 1")

        self.state_dim = state_dim
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.target_update_freq = target_update_freq

        self.policy_net = DoubleDQNNetwork(state_dim, n_actions)
        self.target_net = DoubleDQNNetwork(state_dim, n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.update_counter = 0

    @staticmethod
    def _as_batch(state):
        """Convert a scalar or vector state into a float32 tensor of shape (1, features)."""
        return torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)

    def select_action(self, state):
        """Return an action id (``int``) chosen epsilon-greedily for ``state``."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        with torch.no_grad():
            q_values = self.policy_net(self._as_batch(state))
            return torch.argmax(q_values).item()

    def learn(self, state, action, reward, next_state, done):
        """Run one Double-DQN gradient step on the transition.

        Afterwards epsilon is decayed and, every ``target_update_freq``
        calls, the target network is overwritten with the policy weights.
        """
        current_q = self.policy_net(self._as_batch(state))[0, action]

        # Double DQN: the policy net picks the action, the target net scores it.
        with torch.no_grad():
            # float32 scalar so integer rewards do not produce an int64 target.
            target_q = torch.tensor(float(reward), dtype=torch.float32)
            if not done:
                next_state_tensor = self._as_batch(next_state)
                # .item() keeps the target 0-dimensional, matching current_q.
                next_action = self.policy_net(next_state_tensor).argmax(dim=1).item()
                next_q = self.target_net(next_state_tensor)[0, next_action]
                target_q = target_q + self.gamma * next_q

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        self.update_counter += 1
        if self.update_counter % self.target_update_freq == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())


Agent DoubleDeepQLearningWithExperienceReplay :

Buffer

In [16]:
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)`` tuples.

    Once ``capacity`` transitions are stored, adding a new one discards
    the oldest.
    """

    def __init__(self, capacity=10000):
        # A bounded deque drops items from the left when it overflows.
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen uniformly at random.

        Raises ``ValueError`` (from ``random.sample``) when fewer than
        ``batch_size`` transitions are stored.
        """
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = len(self.buffer)
        return stored


DDQL ER

In [17]:
class DoubleDQNWithReplayAgent:
    """Double DQN agent trained on mini-batches drawn from a replay buffer.

    Transitions are recorded with :meth:`store_transition`. :meth:`learn`
    does nothing until the buffer holds at least ``batch_size`` items, then
    performs one gradient step per call on a random mini-batch.

    Args:
        state_dim: Size of the flattened state vector fed to the networks.
        n_actions: Number of discrete actions.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied after each update.
        epsilon_min: Lower bound that the decay never crosses.
        lr: Adam learning rate.
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of transitions per gradient step.
        target_update_freq: Number of updates between target-network syncs.

    Raises:
        ValueError: If ``target_update_freq`` is smaller than 1.
    """

    def __init__(self, state_dim, n_actions, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, lr=1e-3, buffer_size=10000, batch_size=64, target_update_freq=10):
        if target_update_freq < 1:
            raise ValueError("target_update_freq must be >= 1")

        self.state_dim = state_dim
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq

        self.policy_net = DoubleDQNNetwork(state_dim, n_actions)
        self.target_net = DoubleDQNNetwork(state_dim, n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

        self.memory = ReplayBuffer(capacity=buffer_size)
        self.update_counter = 0

    @staticmethod
    def _as_batch(state):
        """Convert a scalar or vector state into a float32 tensor of shape (1, features)."""
        return torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)

    @staticmethod
    def _stack_states(states):
        """Stack a sequence of scalar or vector states into a (batch, features) float32 tensor."""
        rows = [torch.as_tensor(s, dtype=torch.float32).reshape(-1) for s in states]
        return torch.stack(rows)

    def select_action(self, state):
        """Return an action id (``int``) chosen epsilon-greedily for ``state``."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        with torch.no_grad():
            q_values = self.policy_net(self._as_batch(state))
            return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        """Record one transition in the replay buffer."""
        self.memory.add(state, action, reward, next_state, done)

    def learn(self):
        """Run one Double-DQN gradient step on a sampled mini-batch.

        Returns immediately, without touching epsilon or the update
        counter, while the buffer holds fewer than ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = self._stack_states(states)
        next_states = self._stack_states(next_states)
        actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.float32)

        # squeeze(1) rather than squeeze(): keeps the batch axis when batch_size == 1.
        current_q = self.policy_net(states).gather(1, actions).squeeze(1)

        # Double DQN: the policy net picks the action, the target net scores it.
        with torch.no_grad():
            next_actions = self.policy_net(next_states).argmax(dim=1, keepdim=True)
            next_q = self.target_net(next_states).gather(1, next_actions).squeeze(1)
            # (1 - done) removes the bootstrap term on terminal transitions.
            target_q = rewards + self.gamma * next_q * (1.0 - dones)

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        self.update_counter += 1
        if self.update_counter % self.target_update_freq == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())


Agent DoubleDeepQLearningWithPrioritizedExperienceReplay