In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import time
import random

Implémentation de l'environnement TicTacToe

In [None]:
class TicTacToe:
    """Tic-Tac-Toe environment in which the agent plays against a random 'O' opponent.

    The board is a flat list of 9 cells in row-major order; each cell holds
    ' ' (empty), 'X' or 'O'.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Empty the board, set the current player to 'X' and return a board copy."""
        self.current_player = 'X'
        self.board = [' ' for _ in range(9)]
        return self.board.copy()

    def step(self, action):
        """Play `action` for the current player, then let the opponent reply at random.

        Returns:
            Tuple `(board_copy, reward, done, info)` where
            - an occupied target cell gives reward -10, ends the episode and
              sets `info` to `{"invalid": True}`;
            - a winning move by the current player gives reward 1 and ends the episode;
            - a full board without a winner gives reward 0 and ends the episode;
            - a winning reply by 'O' gives reward -1 and ends the episode;
            - anything else gives reward 0 and the episode continues.
        """
        cell_taken = self.board[action] != ' '
        if cell_taken:
            return self.board.copy(), -10, True, {"invalid": True}

        mover = self.current_player
        self.board[action] = mover
        if self.check_win(mover):
            return self.board.copy(), 1, True, {}

        if self.board.count(' ') == 0:
            return self.board.copy(), 0, True, {}

        # Random opponent: picks uniformly among the remaining empty cells.
        free_cells = [idx for idx, mark in enumerate(self.board) if mark == ' ']
        reply = random.choice(free_cells)
        self.board[reply] = 'O'

        opponent_won = self.check_win('O')
        return self.board.copy(), (-1 if opponent_won else 0), opponent_won, {}

    def check_win(self, player):
        """Return True when `player` occupies a full row, column or diagonal."""
        rows = [(3 * r, 3 * r + 1, 3 * r + 2) for r in range(3)]
        columns = [(c, c + 3, c + 6) for c in range(3)]
        diagonals = [(0, 4, 8), (2, 4, 6)]
        for a, b, c in rows + columns + diagonals:
            if self.board[a] == player and self.board[b] == player and self.board[c] == player:
                return True
        return False

    def render(self):
        """Print the board as three lists, one per row."""
        for start in (0, 3, 6):
            print(self.board[start:start + 3])


Random

In [None]:
import random
class RandomAgent:
    """Baseline agent: plays a uniformly random empty cell and never learns."""

    def select_action(self, state):
        """Return the index of a randomly chosen empty (' ') cell of `state`."""
        empty_cells = []
        for idx, mark in enumerate(state):
            if mark == ' ':
                empty_cells.append(idx)
        return random.choice(empty_cells)

    def learn(self, *args, **kwargs):
        """Do nothing; accepts any arguments so it fits the shared training loop."""
        return None

In [None]:
def train_agent(agent, env, episodes=1000):
    """Train `agent` on `env` for `episodes` episodes.

    At every step the agent chooses an action, the environment applies it and
    the resulting transition is handed to `agent.learn`. Every 100 episodes
    the mean reward over the last (up to) 100 episodes is printed.

    Args:
        agent: object with `select_action(state)` and
            `learn(state, action, reward, next_state, done)`.
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(rewards, lengths)`: per-episode total reward and step count.
    """
    rewards, lengths = [], []
    for episode_number in range(1, episodes + 1):
        state = env.reset()
        ep_reward, steps = 0, 0
        # Every episode has at least one step, hence the do-while shape.
        while True:
            action = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            ep_reward += reward
            steps += 1
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        lengths.append(steps)
        if episode_number % 100 == 0:
            print(f"Episode {episode_number}/{episodes} - Moyenne des 100 derniers rewards : {np.mean(rewards[-100:]):.3f}")
    return rewards, lengths

def evaluate_agent(agent, env, episodes=100):
    """Play `episodes` episodes without learning and summarise the results.

    Args:
        agent: object with `select_action(state)`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.

    Returns:
        Dict with keys "score_moyen" (mean episode reward),
        "longueur_moyenne" (mean episode length in steps) and
        "temps_moyen_action" (mean wall-clock time of `select_action`, in ms).
    """
    episode_scores, episode_lengths, action_times = [], [], []
    for _ in range(episodes):
        state = env.reset()
        score, n_steps, finished = 0, 0, False
        while not finished:
            # Only the action selection is timed, not the environment step.
            t0 = time.time()
            action = agent.select_action(state)
            action_times.append(time.time() - t0)
            state, reward, finished, info = env.step(action)
            score += reward
            n_steps += 1
        episode_scores.append(score)
        episode_lengths.append(n_steps)

    summary = {}
    summary["score_moyen"] = np.mean(episode_scores)
    summary["longueur_moyenne"] = np.mean(episode_lengths)
    summary["temps_moyen_action"] = np.mean(action_times) * 1000  # in ms
    return summary

In [None]:
def plot_rewards(rewards, title="Récompenses par épisode", window=50):
    """Plot per-episode rewards together with their moving average.

    The previous version averaged over 100 episodes while the legend claimed
    a window of 50; the window is now a parameter and the legend is derived
    from it so the two cannot diverge.

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
        window: number of episodes in the moving average. The smoothed curve
            is skipped when fewer than `window` rewards are available, because
            `np.convolve(..., mode='valid')` would then swap its operands and
            return a meaningless curve.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label='Reward par épisode')
    if window > 0 and len(rewards) >= window:
        plt.plot(np.convolve(rewards, np.ones(window) / window, mode='valid'),
                 label=f'Reward moyenne (fenêtre={window})', color='orange')
    plt.xlabel('Épisodes')
    plt.ylabel('Reward')
    plt.title(title)
    plt.grid()
    plt.legend()
    plt.show()


# Baseline experiment: a purely random agent playing against the random opponent.
env = TicTacToe()
random_agent = RandomAgent()

# RandomAgent.learn is a no-op, so this "training" run only collects baseline rewards.
print(" Entraînement de l'agent Random...")
rewards, lengths = train_agent(random_agent, env, episodes=1000)

print("\n Graphique des récompenses")
plot_rewards(rewards, title="Random Agent - TicTacToe")

# Evaluation over 100 fresh episodes; results are reported from the returned dict.
print("\n Évaluation de la policy Random")
results_random = evaluate_agent(random_agent, env, episodes=100)
print(f"Score moyen (ε=0): {results_random['score_moyen']:.3f}")
print(f"Longueur moyenne : {results_random['longueur_moyenne']:.2f} steps")
print(f"Temps moyen par action : {results_random['temps_moyen_action']:.3f} ms")

TabularQLearning

In [None]:
from collections import defaultdict


class TabularQLearningAgent:
    """Epsilon-greedy tabular Q-learning agent for a 9-cell board.

    Q-values are stored per board (joined into a string key) as a length-9
    array, one entry per cell index. Unseen boards start at all zeros.

    Args:
        alpha: learning rate of the TD update.
        gamma: discount factor.
        epsilon: initial exploration probability.
        epsilon_decay: multiplicative decay applied to `epsilon` after every
            `learn` call while it is above `epsilon_min`.
        epsilon_min: threshold below which `epsilon` stops decaying.
    """

    def __init__(self, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1):
        self.q_table = defaultdict(lambda: np.zeros(9))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def state_to_key(self, state):
        """Turn a board (sequence of one-character strings) into a hashable key."""
        return ''.join(state)

    def select_action(self, state):
        """Pick a cell: random empty cell with probability epsilon, else greedy.

        The greedy choice only considers empty cells; occupied cells are
        masked with -inf before the argmax.
        """
        key = self.state_to_key(state)
        if random.random() < self.epsilon:
            available = [i for i, v in enumerate(state) if v == ' ']
            return random.choice(available)
        q_values = self.q_table[key]
        masked = [q if state[i] == ' ' else -np.inf for i, q in enumerate(q_values)]
        return int(np.argmax(masked))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition.

        Terminal transitions (`done` true) use the reward alone as target.
        Previously `done` was ignored, so the update bootstrapped from the
        terminal board and inserted a useless entry for it in the table.
        """
        key = self.state_to_key(state)

        if done:
            max_next_q = 0.0
        else:
            next_q = self.q_table[self.state_to_key(next_state)]
            # Only legal follow-up moves (empty cells) may be bootstrapped from.
            max_next_q = max((next_q[i] for i, v in enumerate(next_state) if v == ' '), default=0)

        td_target = reward + self.gamma * max_next_q
        td_error = td_target - self.q_table[key][action]
        self.q_table[key][action] += self.alpha * td_error

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
def train_agent(agent, env, episodes=1000):
    """Run the generic training loop for `agent` on `env`.

    Args:
        agent: object with `select_action(state)` and
            `learn(state, action, reward, next_state, done)`.
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(rewards, lengths)` holding, for every episode, the summed
        reward and the number of steps. A progress line with the mean reward
        of the latest (up to) 100 episodes is printed every 100 episodes.
    """

    def play_one_episode():
        """Play a single episode with learning and return (total reward, steps)."""
        state = env.reset()
        total, count, finished = 0, 0, False
        while not finished:
            chosen = agent.select_action(state)
            following, reward, finished, info = env.step(chosen)
            agent.learn(state, chosen, reward, following, finished)
            state = following
            total += reward
            count += 1
        return total, count

    rewards = []
    lengths = []
    for ep in range(episodes):
        total, count = play_one_episode()
        rewards.append(total)
        lengths.append(count)
        if (ep + 1) % 100 == 0:
            print(f"Episode {ep+1}/{episodes} - Moyenne des 100 derniers rewards : {np.mean(rewards[-100:]):.3f}")
    return rewards, lengths

def evaluate_agent(agent, env, episodes=100):
    """Evaluate the greedy policy of `agent` over `episodes` episodes.

    The notebook reports these results as "ε=0", but the previous version kept
    the agent's training epsilon, so an epsilon-greedy agent was still
    exploring during evaluation. If the agent has an `epsilon` attribute it is
    now forced to 0 for the evaluation and restored afterwards (also when an
    exception occurs). Agents without `epsilon` are evaluated unchanged.

    Args:
        agent: object with `select_action(state)`; may expose `epsilon`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.

    Returns:
        Dict with keys "score_moyen" (mean episode reward),
        "longueur_moyenne" (mean episode length in steps) and
        "temps_moyen_action" (mean duration of `select_action`, in ms).
    """
    has_epsilon = hasattr(agent, "epsilon")
    if has_epsilon:
        saved_epsilon = agent.epsilon
        agent.epsilon = 0.0

    rewards = []
    lengths = []
    action_times = []

    try:
        for _ in range(episodes):
            state = env.reset()
            done = False
            ep_reward = 0
            steps = 0
            while not done:
                # perf_counter is monotonic and has far better resolution than
                # time.time() for sub-millisecond intervals.
                start = time.perf_counter()
                action = agent.select_action(state)
                action_times.append(time.perf_counter() - start)
                state, reward, done, info = env.step(action)
                ep_reward += reward
                steps += 1
            rewards.append(ep_reward)
            lengths.append(steps)
    finally:
        if has_epsilon:
            agent.epsilon = saved_epsilon

    return {
        "score_moyen": np.mean(rewards),
        "longueur_moyenne": np.mean(lengths),
        "temps_moyen_action": np.mean(action_times) * 1000  # ms
    }


In [None]:
def plot_rewards(rewards, title="Récompenses par épisode"):
    """Plot raw per-episode rewards and their 50-episode moving average.

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label='Reward par épisode')

    # Uniform 50-tap kernel: convolving with it yields the moving average.
    averaging_kernel = np.ones(50) / 50
    smoothed = np.convolve(rewards, averaging_kernel, mode='valid')
    plt.plot(smoothed, label='Reward moyenne (fenêtre=50)', color='orange')

    plt.xlabel('Épisodes')
    plt.ylabel('Reward')
    plt.title(title)
    plt.grid()
    plt.legend()
    plt.show()


# Tabular Q-learning experiment: fresh environment and agent with default hyper-parameters.
env = TicTacToe()
tabular_agent = TabularQLearningAgent()

# Train for 1000 episodes; per-episode rewards and lengths are kept for plotting.
print(" Entraînement de l'agent Tabular Q-Learning...")
rewards_tabular, lengths_tabular = train_agent(tabular_agent, env, episodes=1000)

print("\n Graphique des récompenses")
plot_rewards(rewards_tabular, title="Tabular Q-Learning - TicTacToe")

# Evaluate the trained agent over 100 episodes and report the summary dict.
print("\n Évaluation de la policy Tabular Q-Learning")
results_tabular = evaluate_agent(tabular_agent, env, episodes=100)
print(f"Score moyen (ε=0): {results_tabular['score_moyen']:.3f}")
print(f"Longueur moyenne : {results_tabular['longueur_moyenne']:.2f} steps")
print(f"Temps moyen par action : {results_tabular['temps_moyen_action']:.3f} ms")



DeepQLearning

In [None]:
class DQNNetwork(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers of 64 units.

    Args:
        input_dim: size of the input vector (default 9, one value per cell).
        output_dim: number of outputs (default 9, one Q-value per cell).
    """

    def __init__(self, input_dim=9, output_dim=9):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input `x`."""
        q_values = self.net(x)
        return q_values
    
def encode_board(board):
    """Encode a board as a float32 array: ' ' -> 0, 'X' -> 1, 'O' -> -1.

    Raises:
        KeyError: if a cell holds anything other than ' ', 'X' or 'O'.
    """
    cell_values = {' ': 0, 'X': 1, 'O': -1}
    numeric = list(map(cell_values.__getitem__, board))
    return np.array(numeric, dtype=np.float32)

In [None]:




class DQNAgent:
    """Online Deep Q-learning agent (no replay buffer, no target network).

    Args:
        lr: Adam learning rate.
        gamma: discount factor.
        epsilon: initial exploration probability.
        epsilon_decay: multiplicative decay applied to `epsilon` after every
            `learn` call while it is above `epsilon_min`.
        epsilon_min: threshold below which `epsilon` stops decaying.

    NOTE(review): the default `epsilon` (0.1) equals `epsilon_min`, so with the
    defaults epsilon never decays, unlike the other agents in this notebook
    which start at 1.0. Left unchanged to preserve the public defaults.
    """

    def __init__(self, lr=1e-3, gamma=0.99, epsilon=0.1, epsilon_decay=0.995, epsilon_min=0.1):
        self.model = DQNNetwork()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def select_action(self, state):
        """Epsilon-greedy choice among the empty cells of `state`."""
        available = [i for i, v in enumerate(state) if v == ' ']
        if random.random() < self.epsilon:
            return random.choice(available)
        with torch.no_grad():
            state_encoded = torch.tensor(encode_board(state)).unsqueeze(0)
            q_values = self.model(state_encoded)[0].numpy()
        # Occupied cells are excluded from the argmax.
        masked = [q if state[i] == ' ' else -np.inf for i, q in enumerate(q_values)]
        return int(np.argmax(masked))

    def learn(self, state, action, reward, next_state, done):
        """Perform one gradient step on the TD error of a single transition.

        Fixes over the previous version:
        - the bootstrap value is computed under `torch.no_grad()`; before, the
          graph-attached `max(next_q_values)` was written into the target, so
          gradients also flowed through the target;
        - the bootstrap maximum only ranges over legal (empty) cells of
          `next_state`, matching how actions are actually selected;
        - no forward pass on `next_state` for terminal transitions.
        """
        state_tensor = torch.tensor(encode_board(state)).unsqueeze(0)
        q_values = self.model(state_tensor)

        if done:
            td_target = float(reward)
        else:
            with torch.no_grad():
                next_state_tensor = torch.tensor(encode_board(next_state)).unsqueeze(0)
                next_q_values = self.model(next_state_tensor)[0]
            legal = [i for i, v in enumerate(next_state) if v == ' ']
            best_next = next_q_values[legal].max().item() if legal else 0.0
            td_target = float(reward) + self.gamma * best_next

        # Target equals the current prediction everywhere except for the taken
        # action, so only that output contributes to the loss.
        target = q_values.detach().clone()
        target[0, action] = td_target

        loss = self.criterion(q_values, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
def trainDeepQLearningAgent(env, agent, episodes=1000):
    """Train a DQN-style agent on `env`.

    Args:
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        agent: object with `select_action(state)` and
            `learn(state, action, reward, next_state, done)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(total_rewards, total_steps)` with one entry per episode.
        The mean reward of the latest (up to) 100 episodes is printed every
        100 episodes.
    """
    total_rewards, total_steps = [], []

    for episode in range(episodes):
        state = env.reset()
        transitions_reward = []  # rewards collected during this episode
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            transitions_reward.append(reward)
            state = next_state

        episode_reward = 0
        for value in transitions_reward:
            episode_reward += value
        total_rewards.append(episode_reward)
        total_steps.append(len(transitions_reward))

        if (episode + 1) % 100 == 0:
            print(f"Épisode {episode+1}/{episodes} - Moyenne reward (100 derniers) : {np.mean(total_rewards[-100:]):.3f}")

    return total_rewards, total_steps

def evaluateDeepQLearningAgent(agent, env, episodes=100):
    """Evaluate the greedy DQN policy and print summary statistics.

    `agent.epsilon` is set to 0 for the duration of the evaluation and put
    back to its previous value afterwards. Prints the mean score, the mean
    episode length and the mean `select_action` time in ms; returns nothing.

    Args:
        agent: object with an `epsilon` attribute and `select_action(state)`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.
    """
    epsilon_before = agent.epsilon
    agent.epsilon = 0.0

    total_rewards, total_steps, action_times = [], [], []

    for _ in range(episodes):
        state = env.reset()
        reward_sum, steps = 0, 0
        finished = False

        while not finished:
            tick = time.time()
            action = agent.select_action(state)
            tock = time.time()
            action_times.append(tock - tick)
            state, reward, finished, _ = env.step(action)
            reward_sum += reward
            steps += 1

        total_rewards.append(reward_sum)
        total_steps.append(steps)

    agent.epsilon = epsilon_before

    print("\n Évaluation de la policy DQN (ε = 0) :")
    print(f"  - Score moyen : {np.mean(total_rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(total_steps):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(action_times) * 1000:.3f} ms")


In [None]:
def plot_rewards(rewards, title="Deep Q-Learning - Récompenses par épisode"):
    """Plot raw rewards and, given at least 100 episodes, their 100-episode moving average.

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)

    enough_history = len(rewards) >= 100
    if enough_history:
        kernel = np.ones(100) / 100
        rolling_mean = np.convolve(rewards, kernel, mode="valid")
        plt.plot(rolling_mean, label="Moyenne glissante (100)", color="orange")

    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    # Fixed y-range so that plots of different runs are directly comparable.
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Deep Q-learning experiment: fresh environment and agent with default hyper-parameters.
env = TicTacToe()
dqn_agent = DQNAgent()

# Train for 1000 episodes and keep the per-episode rewards / step counts.
print(" Entraînement de l'agent Deep Q-Learning...")
rewards_dqn, steps_dqn = trainDeepQLearningAgent(env, dqn_agent, episodes=1000)

print("\n Affichage du graphique des récompenses")
plot_rewards(rewards_dqn, title="Deep Q-Learning - TicTacToe")

# The evaluation helper prints its own summary and returns nothing.
print("\n Résultats d'évaluation de la policy DQN")
evaluateDeepQLearningAgent(dqn_agent, env, episodes=100)


DoubleDeepQLearningWithExperienceReplay

In [None]:
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        capacity: maximum number of stored transitions; once reached, the
            oldest transition is dropped for every new one.
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store one transition; both boards are copied so later edits don't alter it."""
        transition = (state.copy(), action, reward, next_state.copy(), done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns:
            Five lists: states, actions, rewards, next_states, dones.
        """
        drawn = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = (list(column) for column in zip(*drawn))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

class DoubleDeepQLearningWithExperienceReplay:
    """Double DQN agent trained on mini-batches from a uniform replay buffer.

    The online network selects the next action, the target network evaluates
    it; the target network is re-synchronised every 10 gradient updates.

    Args:
        gamma: discount factor.
        epsilon: initial exploration probability.
        epsilon_decay: multiplicative decay applied to `epsilon` after every
            gradient update while it is above `epsilon_min`.
        epsilon_min: threshold below which `epsilon` stops decaying.
        lr: Adam learning rate.
        batch_size: number of transitions per gradient update; no update is
            made until the buffer holds at least this many transitions.
    """

    def __init__(self, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1, lr=1e-3, batch_size=32):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size

        self.model = DQNNetwork()
        self.target_model = DQNNetwork()
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.update_counter = 0

    def board_to_tensor(self, board):
        """Encode a board as a float tensor: 'X' -> 1, 'O' -> -1, anything else -> 0."""
        return torch.FloatTensor([(1 if x == 'X' else -1 if x == 'O' else 0) for x in board])

    def select_action(self, board):
        """Epsilon-greedy choice among the empty cells of `board`."""
        valid_moves = [i for i, v in enumerate(board) if v == ' ']
        if random.random() < self.epsilon:
            return random.choice(valid_moves)
        with torch.no_grad():
            q_values = self.model(self.board_to_tensor(board).unsqueeze(0)).squeeze(0)
            # Mask occupied cells directly on the tensor instead of rebuilding
            # it from a Python list of 0-d tensors.
            masked = torch.full_like(q_values, -1e9)
            if valid_moves:
                masked[valid_moves] = q_values[valid_moves]
            return int(torch.argmax(masked).item())

    def learn(self, state, action, reward, next_state, done):
        """Store the transition and, once enough data is buffered, do one Double-DQN update."""
        self.replay_buffer.push(state, action, reward, next_state, done)
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        state_tensors = torch.stack([self.board_to_tensor(s) for s in states])
        next_state_tensors = torch.stack([self.board_to_tensor(ns) for ns in next_states])
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        dones_tensor = torch.tensor(dones, dtype=torch.float32)

        # squeeze(1), not squeeze(): a bare squeeze also removes the batch axis
        # when batch_size == 1, leaving a 0-d prediction compared against a
        # 1-d target.
        q_values = self.model(state_tensors).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Double DQN: action chosen by the online net, valued by the target net.
            next_actions = torch.argmax(self.model(next_state_tensors), dim=1)
            next_q_values = self.target_model(next_state_tensors).gather(1, next_actions.unsqueeze(1)).squeeze(1)
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

        loss = self.criterion(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter % 10 == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
def trainDoubleDQNReplayAgent(env, agent, episodes=1000):
    """Train a replay-based Double DQN agent on `env`.

    Args:
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        agent: object with `select_action(state)` and
            `learn(state, action, reward, next_state, done)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(rewards, steps)` with one entry per episode. Every 100
        episodes the mean reward of the latest (up to) 100 episodes is printed.
    """
    rewards, steps = [], []

    ep = 0
    while ep < episodes:
        state, done = env.reset(), False
        ep_reward = ep_steps = 0

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            ep_reward, ep_steps = ep_reward + reward, ep_steps + 1
            state = next_state

        rewards.append(ep_reward)
        steps.append(ep_steps)
        ep += 1

        if ep % 100 == 0:
            print(f"Épisode {ep}/{episodes} - Moyenne des 100 derniers : {np.mean(rewards[-100:]):.3f}")

    return rewards, steps

def evaluateDoubleDQNReplayAgent(agent, env, episodes=100):
    """Evaluate the greedy Double DQN policy and print summary statistics.

    Exploration is switched off (`agent.epsilon = 0`) while the episodes are
    played and the previous epsilon is restored afterwards. Prints the mean
    score, mean episode length and mean `select_action` time (ms); returns
    nothing.

    Args:
        agent: object with an `epsilon` attribute and `select_action(state)`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.
    """
    previous_epsilon, agent.epsilon = agent.epsilon, 0.0

    rewards = []
    lengths = []
    action_times = []

    for _ in range(episodes):
        state = env.reset()
        ep_reward = 0
        ep_steps = 0

        while True:
            began = time.time()
            action = agent.select_action(state)
            action_times.append(time.time() - began)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            ep_steps += 1
            if done:
                break

        rewards.append(ep_reward)
        lengths.append(ep_steps)

    agent.epsilon = previous_epsilon

    print("\n Évaluation de la policy Double DQN + Replay (ε = 0) :")
    print(f"  - Score moyen : {np.mean(rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(lengths):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(action_times) * 1000:.3f} ms")


In [None]:
def plot_rewards(rewards, title="Double DQN + Replay - Récompenses par épisode"):
    """Draw the reward curve and, from 100 episodes on, its 100-episode rolling mean.

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
    """
    span = 100

    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if not len(rewards) < span:
        weights = np.ones(100)/100
        plt.plot(np.convolve(rewards, weights, mode="valid"),
                 label="Moyenne glissante (100)", color="orange")
    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Double DQN + experience replay experiment with default hyper-parameters.
env = TicTacToe()
ddqn_replay_agent = DoubleDeepQLearningWithExperienceReplay()

# Train for 1000 episodes and keep the per-episode rewards / step counts.
print(" Entraînement de l'agent Double DQN avec Experience Replay...")
rewards_ddqn, steps_ddqn = trainDoubleDQNReplayAgent(env, ddqn_replay_agent, episodes=1000)

print("\n Affichage du graphique")
plot_rewards(rewards_ddqn, title="Double DQN avec Experience Replay - TicTacToe")

# The evaluation helper prints its own summary and returns nothing.
print("\n Évaluation finale")
evaluateDoubleDQNReplayAgent(ddqn_replay_agent, env, episodes=100)


DoubleDeepQLearningWithPrioritizedExperienceReplay


Buffer

In [None]:
class PrioritizedReplayBuffer:
    """Replay buffer that samples transitions proportionally to priority ** alpha.

    Args:
        capacity: maximum number of stored transitions; the oldest one is
            dropped when the capacity is exceeded.
        alpha: exponent applied to priorities when building the sampling
            distribution (0 would give uniform sampling).
    """

    def __init__(self, capacity=10000, alpha=0.6):
        self.capacity = capacity
        self.buffer = []
        self.priorities = []
        self.alpha = alpha

    def push(self, state, action, reward, next_state, done):
        """Store a transition with the current maximum priority (1.0 if empty).

        Both boards are copied so that later edits by the caller cannot alter
        the stored transition.
        """
        max_priority = max(self.priorities, default=1.0)
        self.buffer.append((state.copy(), action, reward, next_state.copy(), done))
        self.priorities.append(max_priority)

        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)
            self.priorities.pop(0)

    def sample(self, batch_size, beta=0.4):
        """Sample `batch_size` transitions (with replacement) by priority.

        Args:
            batch_size: number of transitions to draw.
            beta: exponent of the importance-sampling correction.

        Returns:
            Seven values: states, actions, rewards, next_states, dones,
            weights (importance-sampling weights scaled so the largest is 1)
            and indices (positions in the buffer, for `update_priorities`).
            An empty buffer yields seven empty containers; it used to yield
            only six, which broke callers unpacking seven values.
        """
        if len(self.buffer) == 0:
            return [], [], [], [], [], [], np.array([], dtype=np.int64)

        # float64 keeps the probabilities well within np.random.choice's
        # sum-to-one tolerance.
        priorities = np.asarray(self.priorities, dtype=np.float64)
        probs = priorities ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[i] for i in indices]
        total = len(self.buffer)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()

        states, actions, rewards, next_states, dones = zip(*samples)
        return list(states), list(actions), list(rewards), list(next_states), list(dones), list(weights), indices

    def update_priorities(self, indices, errors, eps=1e-6):
        """Set the priority of each sampled index to |error| + eps (eps keeps it non-zero)."""
        for i, err in zip(indices, errors):
            self.priorities[i] = abs(err) + eps

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [None]:
class DoubleDQNWithPERAgent:
    """Double DQN agent trained from a prioritized experience-replay buffer.

    The online network selects the next action, the target network evaluates
    it; the target network is re-synchronised every 10 gradient updates.
    Per-sample losses are scaled by the buffer's importance-sampling weights
    and the absolute TD errors are written back as new priorities.

    Args:
        gamma: discount factor.
        epsilon: initial exploration probability.
        epsilon_decay: multiplicative decay applied to `epsilon` after every
            gradient update while it is above `epsilon_min`.
        epsilon_min: threshold below which `epsilon` stops decaying.
        lr: Adam learning rate.
        batch_size: number of transitions per gradient update; no update is
            made until the buffer holds at least this many transitions.
    """

    def __init__(self, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1, lr=1e-3, batch_size=32):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size

        self.model = DQNNetwork()
        self.target_model = DQNNetwork()
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        # reduction='none' keeps one loss per sample so it can be weighted.
        self.criterion = nn.MSELoss(reduction='none')
        self.replay_buffer = PrioritizedReplayBuffer()
        self.update_counter = 0

    def board_to_tensor(self, board):
        """Encode a board as a float tensor: 'X' -> 1, 'O' -> -1, anything else -> 0."""
        return torch.FloatTensor([(1 if x == 'X' else -1 if x == 'O' else 0) for x in board])

    def select_action(self, board):
        """Epsilon-greedy choice among the empty cells of `board`."""
        valid_moves = [i for i, v in enumerate(board) if v == ' ']
        if random.random() < self.epsilon:
            return random.choice(valid_moves)
        with torch.no_grad():
            q_values = self.model(self.board_to_tensor(board).unsqueeze(0)).squeeze(0)
            # Mask occupied cells directly on the tensor instead of rebuilding
            # it from a Python list of 0-d tensors.
            masked = torch.full_like(q_values, -1e9)
            if valid_moves:
                masked[valid_moves] = q_values[valid_moves]
            return int(torch.argmax(masked).item())

    def learn(self, state, action, reward, next_state, done):
        """Store the transition and, once enough data is buffered, do one weighted update."""
        self.replay_buffer.push(state, action, reward, next_state, done)
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones, weights, indices = self.replay_buffer.sample(self.batch_size)

        state_tensors = torch.stack([self.board_to_tensor(s) for s in states])
        next_state_tensors = torch.stack([self.board_to_tensor(ns) for ns in next_states])
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        dones_tensor = torch.tensor(dones, dtype=torch.float32)
        weights_tensor = torch.tensor(weights, dtype=torch.float32)

        # squeeze(1), not squeeze(): a bare squeeze also removes the batch axis
        # when batch_size == 1, leaving 0-d predictions next to 1-d targets.
        q_values = self.model(state_tensors).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Double DQN: action chosen by the online net, valued by the target net.
            next_actions = torch.argmax(self.model(next_state_tensors), dim=1)
            next_q_values = self.target_model(next_state_tensors).gather(1, next_actions.unsqueeze(1)).squeeze(1)
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

        loss_elements = self.criterion(q_values, target_q_values)
        loss = (loss_elements * weights_tensor).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Priorities use the TD errors computed before this optimisation step.
        errors = (q_values - target_q_values).detach().abs().tolist()
        self.replay_buffer.update_priorities(indices, errors)

        self.update_counter += 1
        if self.update_counter % 10 == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
def trainPERAgent(env, agent, episodes=1000):
    """Train a prioritized-replay agent on `env`.

    Args:
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        agent: object with `select_action(state)` and
            `learn(state, action, reward, next_state, done)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(rewards, steps)` with one entry per episode. Every 100
        episodes the mean reward of the latest (up to) 100 episodes is printed.
    """
    rewards = []
    steps = []

    for ep in range(episodes):
        current = env.reset()
        collected = 0
        moves = 0
        episode_over = False

        while not episode_over:
            move = agent.select_action(current)
            successor, gain, episode_over, _ = env.step(move)
            agent.learn(current, move, gain, successor, episode_over)
            current = successor
            collected += gain
            moves += 1

        rewards.append(collected)
        steps.append(moves)

        report_due = (ep + 1) % 100 == 0
        if report_due:
            print(f"Épisode {ep+1}/{episodes} - Moyenne reward (100 derniers) : {np.mean(rewards[-100:]):.3f}")

    return rewards, steps

def evaluatePERAgent(agent, env, episodes=100):
    """Evaluate the greedy policy of a prioritized-replay agent and print statistics.

    `agent.epsilon` is forced to 0 during the evaluation and restored at the
    end. Prints the mean score, mean episode length and mean `select_action`
    time (ms); returns nothing.

    Args:
        agent: object with an `epsilon` attribute and `select_action(state)`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.
    """
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0

    rewards, steps, times = [], [], []

    for _ in range(episodes):
        state = env.reset()
        episode_rewards = []  # one entry per step of the current episode
        done = False

        while not done:
            clock_start = time.time()
            action = agent.select_action(state)
            elapsed = time.time() - clock_start
            times.append(elapsed)
            state, reward, done, _ = env.step(action)
            episode_rewards.append(reward)

        rsum = 0
        for value in episode_rewards:
            rsum += value
        rewards.append(rsum)
        steps.append(len(episode_rewards))

    agent.epsilon = original_epsilon

    print("\n Évaluation Double DQN avec Prioritized Replay (ε = 0)")
    print(f"  - Score moyen : {np.mean(rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(steps):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(times)*1000:.3f} ms")

In [None]:
def plot_rewards(rewards, title="Double DQN avec Prioritized Replay - Récompenses par épisode"):
    """Show raw rewards plus a 100-episode moving average (when at least 100 rewards exist).

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
    """

    def moving_average(values):
        """100-point moving average of `values` (valid part only)."""
        return np.convolve(values, np.ones(100)/100, mode="valid")

    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if len(rewards) >= 100:
        plt.plot(moving_average(rewards), label="Moyenne glissante (100)", color="orange")
    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Double DQN + prioritized replay experiment with default hyper-parameters.
env = TicTacToe()
ddqn_per_agent = DoubleDQNWithPERAgent()

# Note: this run uses 10000 episodes, ten times more than the earlier DQN cells.
print("Entraînement de Double DQN avec Prioritized Experience Replay...")
rewards_per, steps_per = trainPERAgent(env, ddqn_per_agent, episodes=10000)

print("\n Graphe des récompenses")
plot_rewards(rewards_per, title="Double DQN avec Prioritized Replay - TicTacToe")

# The evaluation helper prints its own summary and returns nothing.
print("\n Évaluation finale")
evaluatePERAgent(ddqn_per_agent, env, episodes=100)


REINFORCE


In [None]:
class PolicyNetwork(nn.Module):
    """Policy network: one hidden ReLU layer of 128 units followed by a softmax.

    The output is therefore a probability vector over `output_dim` actions,
    not raw logits.

    Args:
        input_dim: size of the input vector (default 9).
        output_dim: number of actions (default 9).
    """

    def __init__(self, input_dim=9, output_dim=9):
        super().__init__()
        hidden_units = 128
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for input `x`."""
        action_probs = self.net(x)
        return action_probs

In [None]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent with per-episode return normalisation.

    `select_action` records the log-probability of every sampled action in
    `self.trajectory`; `update_policy` consumes that trajectory at the end of
    the episode.

    Args:
        lr: Adam learning rate.
        gamma: discount factor used to compute returns.
    """

    def __init__(self, lr=1e-3, gamma=0.99):
        self.policy = PolicyNetwork()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.trajectory = []

    def board_to_tensor(self, board):
        """Encode a board as a float tensor: 'X' -> 1, 'O' -> -1, anything else -> 0."""
        return torch.FloatTensor([(1 if x == 'X' else -1 if x == 'O' else 0) for x in board])

    def select_action(self, board):
        """Sample an empty cell from the policy and record its log-probability.

        The policy already outputs probabilities. They used to be passed
        through a second softmax as if they were logits, which squeezes every
        distribution towards uniform (inputs in [0, 1] allow at most a factor
        e between any two actions). Illegal cells are now zeroed out and the
        remaining probabilities renormalised.
        """
        state_tensor = self.board_to_tensor(board).unsqueeze(0)
        probs = self.policy(state_tensor).squeeze(0)
        legal = torch.tensor([1.0 if board[i] == ' ' else 0.0 for i in range(9)])
        # The small epsilon keeps the distribution valid even if the policy
        # assigns (numerically) zero mass to every legal move.
        masked_probs = (probs + 1e-8) * legal
        masked_probs = masked_probs / masked_probs.sum()
        dist = torch.distributions.Categorical(masked_probs)
        action = dist.sample()
        self.trajectory.append((state_tensor, action, dist.log_prob(action)))
        return int(action.item())

    def learn(self, state, action, reward, next_state, done):
        """No-op: learning happens once per episode in `update_policy`."""
        pass

    def update_policy(self, rewards):
        """Run one REINFORCE update from the recorded trajectory.

        Args:
            rewards: rewards of the finished episode, one per recorded action.

        Discounted returns are standardised within the episode. A one-step
        episode is left unstandardised, because the standard deviation of a
        single value is NaN and would corrupt every weight. An empty
        trajectory is ignored.
        """
        if not self.trajectory:
            return

        G = 0.0
        returns = []
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        loss = -torch.stack(
            [log_prob * Gt for (_, _, log_prob), Gt in zip(self.trajectory, returns)]
        ).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.trajectory = []


In [None]:
def trainREINFORCEAgent(env, agent, episodes=1000):
    """Train a REINFORCE agent: play a full episode, then update the policy once.

    Args:
        env: object with `reset()` and `step(action)` returning
            `(next_state, reward, done, info)`.
        agent: object with `select_action(state)` and
            `update_policy(episode_rewards)`.
        episodes: number of episodes to play.

    Returns:
        Tuple `(rewards, lengths)` with one entry per episode. Every 100
        episodes the mean reward of the latest (up to) 100 episodes is printed.
    """
    rewards, lengths = [], []

    for ep in range(episodes):
        state = env.reset()
        episode_rewards = []
        ep_reward, steps = 0, 0
        done = False

        while not done:
            action = agent.select_action(state)
            state, reward, done, _ = env.step(action)
            episode_rewards.append(reward)
            ep_reward += reward
            steps += 1

        # The policy is updated only once the whole episode's rewards are known.
        agent.update_policy(episode_rewards)
        rewards.append(ep_reward)
        lengths.append(steps)

        if (ep + 1) % 100 == 0:
            print(f"Épisode {ep+1}/{episodes} - Moyenne reward (100 derniers) : {np.mean(rewards[-100:]):.3f}")

    return rewards, lengths

def evaluateREINFORCEAgent(agent, env, episodes=100):
    """Evaluate a REINFORCE agent and print summary statistics.

    The agent's `select_action` appends to `agent.trajectory` on every call.
    Evaluation therefore used to leave hundreds of stale, graph-holding
    entries behind, which would be mis-paired with rewards by the next policy
    update. Evaluation now runs under `torch.no_grad()`, discards what it
    records after each episode, and restores the trajectory that existed
    before the call.

    Prints the mean score, mean episode length and mean `select_action` time
    (ms); returns nothing.

    Args:
        agent: object with `select_action(state)`; may expose `trajectory`.
        env: object with `reset()` and `step(action)` returning
            `(state, reward, done, info)`.
        episodes: number of evaluation episodes.
    """
    tracks_trajectory = hasattr(agent, "trajectory")
    pending = list(agent.trajectory) if tracks_trajectory else None

    rewards = []
    steps = []
    times = []

    try:
        with torch.no_grad():
            for _ in range(episodes):
                state = env.reset()
                done = False
                total_r = 0
                count = 0

                while not done:
                    start = time.time()
                    action = agent.select_action(state)
                    times.append(time.time() - start)
                    state, reward, done, _ = env.step(action)
                    total_r += reward
                    count += 1

                rewards.append(total_r)
                steps.append(count)
                if tracks_trajectory:
                    # Drop evaluation-only records so memory stays bounded.
                    agent.trajectory = []
    finally:
        if tracks_trajectory:
            agent.trajectory = pending

    print("\n Évaluation de REINFORCE (ε = 0)")
    print(f"  - Score moyen : {np.mean(rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(steps):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(times)*1000:.3f} ms")

In [None]:
def plot_rewards(rewards, title="REINFORCE - Récompenses par épisode"):
    """Plot per-episode rewards and, if at least 100 are given, a 100-episode moving average.

    Args:
        rewards: sequence of per-episode rewards.
        title: figure title.
    """
    plt.figure(figsize=(10, 4))

    # Faint raw curve first, smoothed curve on top.
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if len(rewards) >= 100:
        uniform_weights = np.ones(100)/100
        averaged = np.convolve(rewards, uniform_weights, mode="valid")
        plt.plot(averaged,
                 label="Moyenne glissante (100)", color="orange")

    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# REINFORCE experiment: fresh environment and agent with default hyper-parameters.
env = TicTacToe()
reinforce_agent = REINFORCEAgent()

# This run uses 10000 episodes; the policy is updated once per episode.
print(" Entraînement de l'agent REINFORCE...")
rewards_reinforce, steps_reinforce = trainREINFORCEAgent(env, reinforce_agent, episodes=10000)

print("\n Graphe des récompenses")
plot_rewards(rewards_reinforce, title="REINFORCE - TicTacToe")

# The evaluation helper prints its own summary and returns nothing.
print("\n Évaluation finale")
evaluateREINFORCEAgent(reinforce_agent, env, episodes=100)


REINFORCE with mean baseline

In [None]:
class REINFORCEWithMeanBaseline:
    """REINFORCE agent whose baseline is the mean of all returns seen so far.

    `select_action` records the log-probability of every sampled action in
    `self.trajectory`; `learn` consumes that trajectory at the end of the
    episode and appends the episode's returns to `self.return_history`.

    Args:
        lr: Adam learning rate.
        gamma: discount factor used to compute returns.
    """

    def __init__(self, lr=1e-3, gamma=0.99):
        self.policy = PolicyNetwork()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.trajectory = []
        self.return_history = []

    def board_to_tensor(self, board):
        """Encode a board as a float tensor: 'X' -> 1, 'O' -> -1, anything else -> 0."""
        return torch.FloatTensor([(1 if x == 'X' else -1 if x == 'O' else 0) for x in board])

    def select_action(self, board):
        """Sample an empty cell from the policy and record its log-probability.

        The policy already outputs probabilities. They used to be passed
        through a second softmax as if they were logits, which squeezes every
        distribution towards uniform. Illegal cells are now zeroed out and the
        remaining probabilities renormalised.
        """
        state_tensor = self.board_to_tensor(board).unsqueeze(0)
        probs = self.policy(state_tensor).squeeze(0)
        legal = torch.tensor([1.0 if board[i] == ' ' else 0.0 for i in range(9)])
        # The small epsilon keeps the distribution valid even if the policy
        # assigns (numerically) zero mass to every legal move.
        masked_probs = (probs + 1e-8) * legal
        masked_probs = masked_probs / masked_probs.sum()
        dist = torch.distributions.Categorical(masked_probs)
        action = dist.sample()
        self.trajectory.append((state_tensor, action, dist.log_prob(action)))
        return int(action.item())

    def learn(self, rewards):
        """Run one policy-gradient update with a running-mean baseline.

        Args:
            rewards: rewards of the finished episode, one per recorded action.

        The baseline is the mean over every return observed so far, including
        the current episode's. An empty trajectory is ignored.
        """
        if not self.trajectory:
            return

        G = 0.0
        returns = []
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        self.return_history.extend(returns.tolist())
        baseline = float(np.mean(self.return_history))

        loss = -torch.stack(
            [log_prob * (Gt - baseline) for (_, _, log_prob), Gt in zip(self.trajectory, returns)]
        ).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.trajectory = []


In [None]:
def trainForREINFORCEAgent(env, agent, episodes=1000):
    """Train an episodic policy-gradient agent and collect per-episode statistics.

    Each episode is played to completion; the list of step rewards is then
    handed to ``agent.learn``. A progress line is printed every 100 episodes.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        agent: object exposing ``select_action(state)`` and ``learn(rewards)``.
        episodes: number of episodes to play.

    Returns:
        Tuple ``(rewards, steps)``: total reward and number of steps per episode.
    """
    episode_returns, episode_lengths = [], []

    for episode_idx in range(episodes):
        board = env.reset()
        reward_trace = []
        finished = False

        while not finished:
            move = agent.select_action(board)
            board, step_reward, finished, _ = env.step(move)
            reward_trace.append(step_reward)

        agent.learn(reward_trace)
        episode_returns.append(sum(reward_trace))
        episode_lengths.append(len(reward_trace))

        # Progress report over the last (up to) 100 episodes.
        if (episode_idx + 1) % 100 == 0:
            print(f"Épisode {episode_idx+1}/{episodes} - Moyenne (100) : {np.mean(episode_returns[-100:]):.3f}")

    return episode_returns, episode_lengths

def evaluateForREINFORCEAgent(agent, env, episodes=100):
    """Play evaluation episodes (no learning) and print summary statistics.

    Prints the mean total reward, the mean episode length and the mean
    wall-clock time spent in ``agent.select_action``.

    Args:
        agent: object exposing ``select_action(state)``.
        env: environment exposing ``reset()`` and ``step(action)``.
        episodes: number of evaluation episodes.
    """
    rewards = []
    steps = []
    action_time = []

    for _ in range(episodes):
        state = env.reset()
        done = False
        rsum = 0
        n = 0
        while not done:
            # perf_counter is a monotonic high-resolution clock; time.time() can be
            # too coarse to measure a sub-millisecond action selection.
            start = time.perf_counter()
            action = agent.select_action(state)
            action_time.append(time.perf_counter() - start)
            state, reward, done, _ = env.step(action)
            rsum += reward
            n += 1
        rewards.append(rsum)
        steps.append(n)
        # Policy-gradient agents record every selected action for a later learn()
        # call. No learning happens here, so drop those records: otherwise they
        # accumulate and get mixed into the next training update.
        if hasattr(agent, "trajectory"):
            agent.trajectory = []

    print("\n Évaluation :")
    print(f"  - Score moyen : {np.mean(rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(steps):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(action_time)*1000:.3f} ms")


In [None]:
def plot_rewards(rewards, title="REINFORCE - Récompenses par épisode", window=100):
    """Plot raw per-episode rewards together with their trailing moving average.

    Args:
        rewards: sequence of per-episode total rewards.
        title: figure title.
        window: number of episodes averaged by the smoothed curve. The curve
            is drawn only when at least ``window`` episodes are available.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if window >= 1 and len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode="valid")
        # smoothed[k] is the mean of rewards[k : k + window]. Anchor each point at
        # the last episode of its window so the curve lines up with the raw rewards
        # instead of being shifted window-1 episodes to the left.
        episode_index = np.arange(window - 1, len(rewards))
        plt.plot(episode_index, smoothed,
                 label=f"Moyenne glissante ({window})", color="orange")
    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Run the mean-baseline REINFORCE experiment on a fresh environment and agent:
# 1,000 training episodes, the reward curve, then a 100-episode evaluation.
env = TicTacToe()
agent_mb = REINFORCEWithMeanBaseline()

rewards_mb, steps_mb = trainForREINFORCEAgent(env, agent_mb, episodes=1000)
plot_rewards(rewards_mb, title="REINFORCE avec baseline moyenne - TicTacToe")
evaluateForREINFORCEAgent(agent_mb, env, episodes=100)


REINFORCE with Baseline Learned by a Critic

In [None]:
class ValueNetwork(nn.Module):
    """Critic network: a one-hidden-layer MLP that outputs one scalar per input row."""

    def __init__(self, input_dim=9):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the value estimate for ``x`` (last dimension of size 1)."""
        value = self.net(x)
        return value

class REINFORCEWithCritic:
    """REINFORCE agent whose baseline is a learned state-value critic.

    The agent always reasons as the 'X' player: boards are encoded with
    X = +1, O = -1 and empty = 0.

    Attributes:
        policy: a PolicyNetwork instance (defined elsewhere in the notebook;
            assumed to map a (1, 9) board tensor to 9 action scores).
        value_net: ValueNetwork critic estimating the return of a board.
        policy_optimizer / value_optimizer: Adam optimizers, one per network.
        gamma: discount factor used to compute returns.
        trajectory: (state_tensor, action, log_prob) tuples recorded by
            ``select_action`` since the last ``learn`` call.
    """

    def __init__(self, lr_policy=1e-3, lr_value=1e-3, gamma=0.99):
        self.policy = PolicyNetwork()
        self.value_net = ValueNetwork()
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=lr_policy)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr_value)
        self.gamma = gamma
        self.trajectory = []

    def board_to_tensor(self, board):
        """Encode a board (sequence of 'X', 'O', ' ') as a float tensor of +1 / -1 / 0."""
        return torch.tensor(
            [(1 if x == 'X' else -1 if x == 'O' else 0) for x in board],
            dtype=torch.float32,
        )

    def select_action(self, board):
        """Sample a legal move from the policy and record it for the next update.

        Args:
            board: sequence of 9 marks ('X', 'O' or ' ').

        Returns:
            Index (int) of an empty cell.

        Raises:
            ValueError: if the board has no empty cell.
        """
        if ' ' not in board:
            raise ValueError("select_action called on a board with no empty cell")
        state_tensor = self.board_to_tensor(board).unsqueeze(0)
        scores = self.policy(state_tensor).squeeze()
        # Occupied cells get -inf so that the softmax assigns them probability 0.
        # NOTE(review): if PolicyNetwork already ends with a softmax, this applies
        # softmax twice and flattens the distribution -- confirm against its definition.
        mask = torch.tensor([0.0 if cell == ' ' else -float('inf') for cell in board])
        masked_probs = torch.softmax(scores + mask, dim=-1)
        dist = torch.distributions.Categorical(masked_probs)
        action = dist.sample()
        self.trajectory.append((state_tensor, action, dist.log_prob(action)))
        return int(action.item())

    def learn(self, rewards):
        """Update critic and policy from the rewards of a finished episode.

        The advantage ``G_t - V(s_t)`` is computed from the critic's predictions
        *before* the critic is updated, and the same value is used for both
        updates.

        Args:
            rewards: per-step rewards of the episode, in chronological order.
                An empty sequence is a no-op that only discards any recorded
                transitions.
        """
        if len(rewards) == 0:
            self.trajectory = []
            return

        # Discounted returns, built back-to-front then reversed (linear time).
        G = 0.0
        discounted = []
        for r in reversed(rewards):
            G = r + self.gamma * G
            discounted.append(G)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Only the most recent len(rewards) transitions belong to this episode;
        # older ones are leftovers from select_action calls without a learn().
        recent = self.trajectory[-len(rewards):]
        self.trajectory = []
        paired = list(zip(recent, returns))
        if not paired:
            return

        states = torch.cat([state for (state, _, _), _ in paired], dim=0)
        log_probs = torch.stack([log_prob for (_, _, log_prob), _ in paired])
        targets = torch.stack([Gt for _, Gt in paired])

        values = self.value_net(states).squeeze(-1)
        # Detached: the policy loss must not back-propagate into the critic.
        advantages = (targets - values).detach()

        # Critic: regress the predicted values onto the observed returns.
        value_loss = (targets - values).pow(2).mean()
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Policy: REINFORCE step weighted by the advantage.
        policy_loss = -(log_probs * advantages).sum()
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()


In [None]:
def trainForREINFORCEAgent(env, agent, episodes=1000):
    """Run ``episodes`` training episodes, calling ``agent.learn`` after each one.

    NOTE(review): this cell redefines a function that an earlier cell of the
    notebook already defines with the same behaviour; consider keeping one copy.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        agent: object exposing ``select_action(state)`` and ``learn(rewards)``.
        episodes: number of episodes to play.

    Returns:
        Tuple ``(rewards, steps)``: total reward and number of steps per episode.
    """
    def play_one_episode():
        # Play until the environment reports the end; return the step rewards.
        observation = env.reset()
        collected = []
        while True:
            chosen = agent.select_action(observation)
            observation, gained, terminal, _ = env.step(chosen)
            collected.append(gained)
            if terminal:
                return collected

    totals = []
    lengths = []
    for index in range(episodes):
        trace = play_one_episode()
        agent.learn(trace)
        totals.append(sum(trace))
        lengths.append(len(trace))

        played = index + 1
        if played % 100 == 0:
            print(f"Épisode {played}/{episodes} - Moyenne (100) : {np.mean(totals[-100:]):.3f}")

    return totals, lengths

def evaluateForREINFORCEAgent(agent, env, episodes=100):
    """Play evaluation episodes (no learning) and print summary statistics.

    Prints the mean total reward, the mean episode length and the mean
    wall-clock time spent in ``agent.select_action``.

    NOTE(review): this cell redefines a function that an earlier cell of the
    notebook already defines (only the printed header differs); consider
    keeping one copy.

    Args:
        agent: object exposing ``select_action(state)``.
        env: environment exposing ``reset()`` and ``step(action)``.
        episodes: number of evaluation episodes.
    """
    rewards = []
    steps = []
    action_time = []

    for _ in range(episodes):
        state = env.reset()
        done = False
        rsum = 0
        n = 0
        while not done:
            # perf_counter is a monotonic high-resolution clock; time.time() can be
            # too coarse to measure a sub-millisecond action selection.
            start = time.perf_counter()
            action = agent.select_action(state)
            action_time.append(time.perf_counter() - start)
            state, reward, done, _ = env.step(action)
            rsum += reward
            n += 1
        rewards.append(rsum)
        steps.append(n)
        # Policy-gradient agents record every selected action for a later learn()
        # call. No learning happens here, so drop those records: otherwise they
        # accumulate and get mixed into the next training update.
        if hasattr(agent, "trajectory"):
            agent.trajectory = []

    print("\n🎯 Évaluation :")
    print(f"  - Score moyen : {np.mean(rewards):.3f}")
    print(f"  - Longueur moyenne : {np.mean(steps):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(action_time)*1000:.3f} ms")


In [None]:
def plot_rewards(rewards, title="REINFORCE - Récompenses par épisode", window=100):
    """Plot raw per-episode rewards together with their trailing moving average.

    NOTE(review): earlier cells of the notebook already define ``plot_rewards``;
    consider keeping a single definition.

    Args:
        rewards: sequence of per-episode total rewards.
        title: figure title.
        window: number of episodes averaged by the smoothed curve. The curve
            is drawn only when at least ``window`` episodes are available.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if window >= 1 and len(rewards) >= window:
        kernel = np.ones(window) / window
        smoothed = np.convolve(rewards, kernel, mode="valid")
        # smoothed[k] is the mean of rewards[k : k + window]. Anchor each point at
        # the last episode of its window so the curve lines up with the raw rewards
        # instead of being shifted window-1 episodes to the left.
        episode_index = np.arange(window - 1, len(rewards))
        plt.plot(episode_index, smoothed,
                 label=f"Moyenne glissante ({window})", color="orange")
    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Run the critic-baseline REINFORCE experiment: 1,000 training episodes, the
# reward curve, then a 100-episode evaluation.
# This cell used to instantiate REINFORCEWithMeanBaseline again (copy-paste from
# the previous section), so the REINFORCEWithCritic agent was never trained.
env = TicTacToe()
agent_critic = REINFORCEWithCritic()

rewards_critic, steps_critic = trainForREINFORCEAgent(env, agent_critic, episodes=1000)
plot_rewards(rewards_critic, title="REINFORCE avec baseline apprise (critique) - TicTacToe")
evaluateForREINFORCEAgent(agent_critic, env, episodes=100)


PPO A2C style

In [None]:
class ActorCriticNetwork(nn.Module):
    """Two-headed network: a shared hidden layer feeding a policy head and a value head.

    The policy head ends with a softmax, so its output is already a
    probability distribution over the ``output_dim`` actions.
    """

    def __init__(self, input_dim=9, output_dim=9):
        super().__init__()
        hidden_units = 128
        self.shared = nn.Sequential(
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
        )
        self.policy_head = nn.Sequential(
            nn.Linear(hidden_units, output_dim),
            nn.Softmax(dim=-1),
        )
        self.value_head = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for the input ``x``."""
        features = self.shared(x)
        action_probs = self.policy_head(features)
        state_value = self.value_head(features)
        return action_probs, state_value

class A2CStyleAgent:
    """Actor-critic agent trained once per episode on Monte-Carlo returns.

    A single ActorCriticNetwork provides both the action probabilities and the
    state value; both heads are trained with one optimizer. The agent always
    reasons as the 'X' player (X = +1, O = -1, empty = 0).

    Attributes:
        model: ActorCriticNetwork returning ``(probabilities, value)``.
        optimizer: Adam optimizer over all model parameters.
        gamma: discount factor used to compute returns.
        trajectory: (state_tensor, action, log_prob) tuples recorded by
            ``select_action`` since the last ``learn`` call.
    """

    def __init__(self, gamma=0.99, lr=1e-3):
        self.model = ActorCriticNetwork()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.trajectory = []

    def board_to_tensor(self, board):
        """Encode a board (sequence of 'X', 'O', ' ') as a float tensor of +1 / -1 / 0."""
        return torch.tensor(
            [(1 if x == 'X' else -1 if x == 'O' else 0) for x in board],
            dtype=torch.float32,
        )

    def select_action(self, board):
        """Sample a legal move from the policy head and record it for the next update.

        Args:
            board: sequence of 9 marks ('X', 'O' or ' ').

        Returns:
            Index (int) of an empty cell.

        Raises:
            ValueError: if the board has no empty cell.
        """
        if ' ' not in board:
            raise ValueError("select_action called on a board with no empty cell")
        state_tensor = self.board_to_tensor(board).unsqueeze(0)
        probs, _ = self.model(state_tensor)
        mask = torch.tensor([0.0 if cell == ' ' else -float('inf') for cell in board])
        # The policy head already outputs probabilities. Go back to log-space before
        # masking, so that the softmax below simply renormalises the network's
        # distribution over the legal cells. Feeding the probabilities themselves to
        # softmax (a second softmax) squashes them towards uniform.
        log_probs = torch.log(probs.squeeze().clamp_min(1e-8))
        masked_probs = torch.softmax(log_probs + mask, dim=-1)
        dist = torch.distributions.Categorical(masked_probs)
        action = dist.sample()
        self.trajectory.append((state_tensor, action, dist.log_prob(action)))
        return int(action.item())

    def learn(self, rewards):
        """Run one actor-critic update from the rewards of a finished episode.

        The loss sums, over the steps of the episode, the policy-gradient term
        ``-log_prob * advantage`` (advantage detached) and the squared
        advantage that trains the value head.

        Args:
            rewards: per-step rewards of the episode, in chronological order.
                An empty sequence is a no-op that only discards any recorded
                transitions.
        """
        if len(rewards) == 0:
            self.trajectory = []
            return

        # Discounted returns, built back-to-front then reversed (linear time).
        G = 0.0
        discounted = []
        for r in reversed(rewards):
            G = r + self.gamma * G
            discounted.append(G)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)

        # Only the most recent len(rewards) transitions belong to this episode;
        # older ones are leftovers from select_action calls without a learn().
        recent = self.trajectory[-len(rewards):]
        self.trajectory = []

        terms = []
        for (state_tensor, _, log_prob), Gt in zip(recent, returns):
            _, value = self.model(state_tensor)
            advantage = Gt - value.squeeze()
            terms.append(-log_prob * advantage.detach() + advantage.pow(2))
        if not terms:
            return
        loss = torch.stack(terms).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


Entraînement, évaluation et tracé pour RandomRollout, UCT et ExpertApprentice

In [None]:
def train_basic_agent(env, agent, episodes=10000):
    """Play ``episodes`` games with a non-learning agent and collect statistics.

    ``agent.learn`` is never called here: the function only runs the agent
    against the environment. A progress line is printed every 100 episodes.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        agent: object exposing ``select_action(state)``.
        episodes: number of episodes to play.

    Returns:
        Tuple ``(rewards, steps)``: total reward and number of steps per episode.
    """
    episode_totals, episode_lengths = [], []
    for episode_idx in range(episodes):
        board = env.reset()
        score, moves_played = 0, 0
        finished = False
        while not finished:
            board, gained, finished, _ = env.step(agent.select_action(board))
            score += gained
            moves_played += 1
        episode_totals.append(score)
        episode_lengths.append(moves_played)
        if (episode_idx + 1) % 100 == 0:
            print(f"Épisode {episode_idx+1}/ {episodes}- Moyenne reward (100 derniers) : {np.mean(episode_totals[-100:]):.3f}")
    return episode_totals, episode_lengths

def evaluate_basic_agent(agent, env, episodes=100):
    """Play ``episodes`` games and print the mean score, length and decision time.

    Args:
        agent: object exposing ``select_action(state)``.
        env: environment exposing ``reset()`` and ``step(action)``.
        episodes: number of evaluation episodes.
    """
    scores, lengths, decision_times = [], [], []
    for _ in range(episodes):
        board = env.reset()
        finished = False
        score = 0
        moves_played = 0
        while not finished:
            # Time only the agent's decision, not the environment step.
            started = time.time()
            chosen = agent.select_action(board)
            decision_times.append(time.time() - started)
            board, gained, finished, _ = env.step(chosen)
            score += gained
            moves_played += 1
        scores.append(score)
        lengths.append(moves_played)

    print("\n🧪 Évaluation")
    print(f"  - Score moyen : {np.mean(scores):.3f}")
    print(f"  - Longueur moyenne : {np.mean(lengths):.2f} steps")
    print(f"  - Temps moyen par action : {np.mean(decision_times)*1000:.3f} ms")

def plot_basic_rewards(rewards, title="Agent - Récompenses par épisode", window=100):
    """Plot raw per-episode rewards together with their trailing moving average.

    Args:
        rewards: sequence of per-episode total rewards.
        title: figure title.
        window: number of episodes averaged by the smoothed curve. The curve
            is drawn only when at least ``window`` episodes are available.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(rewards, label="Reward brut", alpha=0.3)
    if window >= 1 and len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode="valid")
        # smoothed[k] is the mean of rewards[k : k + window]. Anchor each point at
        # the last episode of its window so the curve lines up with the raw rewards
        # instead of being shifted window-1 episodes to the left.
        episode_index = np.arange(window - 1, len(rewards))
        plt.plot(episode_index, smoothed,
                 label=f"Moyenne glissante ({window})", color="orange")
    plt.xlabel("Épisodes")
    plt.ylabel("Récompense")
    plt.ylim(-1.5, 1.5)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()


RandomRollout

In [None]:
class RandomRolloutAgent:
    """Flat Monte-Carlo agent: scores each legal move by random playouts.

    For every empty cell the agent plays 'X' there, finishes the game
    ``simulations`` times with uniformly random moves for both sides, and
    picks the move with the most 'X' wins. It always plays the 'X' mark.
    """

    # The eight winning lines of the 3x3 board (rows, columns, diagonals).
    _WIN_LINES = ((0, 1, 2), (3, 4, 5), (6, 7, 8),
                  (0, 3, 6), (1, 4, 7), (2, 5, 8),
                  (0, 4, 8), (2, 4, 6))

    def __init__(self, simulations=20):
        self.simulations = simulations

    @classmethod
    def _has_won(cls, board, player):
        """Return True if ``player`` owns a complete line on ``board``."""
        return any(all(board[i] == player for i in line) for line in cls._WIN_LINES)

    def simulate(self, board, move):
        """Count how many random playouts 'X' wins after playing ``move``.

        Args:
            board: current position (list of 9 marks); it is not modified.
            move: index of the empty cell 'X' plays first.

        Returns:
            Number of playouts (0 .. ``self.simulations``) won by 'X'.
        """
        wins = 0
        for _ in range(self.simulations):
            sim_board = board[:]
            sim_board[move] = 'X'
            # The candidate move itself may already complete a line.
            if self._has_won(sim_board, 'X'):
                wins += 1
                continue
            while True:
                empty = [i for i, v in enumerate(sim_board) if v == ' ']
                if not empty:
                    break
                sim_board[random.choice(empty)] = 'O'
                # Wins are checked on the simulated board. A freshly constructed
                # game object has an empty board and would never report a win.
                if self._has_won(sim_board, 'O'):
                    break
                empty = [i for i, v in enumerate(sim_board) if v == ' ']
                if not empty:
                    break
                sim_board[random.choice(empty)] = 'X'
                if self._has_won(sim_board, 'X'):
                    wins += 1
                    break
        return wins

    def select_action(self, board):
        """Return the empty cell with the highest playout win count.

        Ties are broken in favour of the lowest cell index.

        Raises:
            ValueError: if the board has no empty cell.
        """
        moves = [i for i, v in enumerate(board) if v == ' ']
        if not moves:
            raise ValueError("select_action called on a board with no empty cell")
        scores = {m: self.simulate(board, m) for m in moves}
        return max(scores, key=scores.get)

    def learn(self, *args, **kwargs):
        """No-op: this agent has nothing to learn."""
        pass


In [None]:
# Run the Random Rollout experiment on a fresh environment.
env = TicTacToe()
agent_rollout = RandomRolloutAgent()

print("Entraînement de l'agent Random Rollout...")
# train_basic_agent only plays episodes and records rewards; it never calls agent.learn,
# and RandomRolloutAgent.learn is a no-op anyway.
rewards_rollout, steps_rollout = train_basic_agent(env, agent_rollout, episodes=1000)

print("\n Graphique des récompenses")
plot_basic_rewards(rewards_rollout, title="Random Rollout - TicTacToe")

print("\n Évaluation")
# Uses the default of 100 evaluation episodes.
evaluate_basic_agent(agent_rollout, env)


Monte Carlo Tree Search (UCT)

In [None]:
class MCTSNode:
    """One position in the UCT search tree.

    Attributes:
        board: list of 9 marks ('X', 'O' or ' ') describing the position.
        parent: node this one was expanded from (None for the root).
        move: cell index played to reach this position from the parent.
        player: mark that played ``move``; ``wins`` is scored from this
            player's point of view.
        children: one child per legal reply (empty until the node is expanded).
        wins: sum of simulation results credited to ``player``.
        visits: number of simulations that went through this node.
    """

    def __init__(self, board, parent=None, move=None, player='X'):
        self.board = board
        self.parent = parent
        self.move = move
        self.player = player
        self.children = []
        self.wins = 0
        self.visits = 0

    def ucb1(self, total_simulations, exploration=1.41):
        """Return the UCB1 score of this node; unvisited nodes score +infinity."""
        if self.visits == 0:
            return float('inf')
        # Clamp to 1 so that log() is never negative (which would give sqrt(NaN)).
        total = max(total_simulations, 1)
        return (self.wins / self.visits) + exploration * np.sqrt(np.log(total) / self.visits)


class MCTSAgent:
    """Monte Carlo Tree Search (UCT) agent playing the 'X' mark.

    The tree alternates between 'X' and 'O' moves. Each simulation selects a
    leaf with UCB1, expands it, plays one random playout from one of the new
    children and back-propagates the result.
    """

    # The eight winning lines of the 3x3 board (rows, columns, diagonals).
    _WIN_LINES = ((0, 1, 2), (3, 4, 5), (6, 7, 8),
                  (0, 3, 6), (1, 4, 7), (2, 5, 8),
                  (0, 4, 8), (2, 4, 6))

    def __init__(self, simulations=100):
        self.simulations = simulations

    @classmethod
    def _winner(cls, board):
        """Return 'X' or 'O' if that mark owns a complete line, else None."""
        for a, b, c in cls._WIN_LINES:
            if board[a] != ' ' and board[a] == board[b] == board[c]:
                return board[a]
        return None

    @staticmethod
    def _other(player):
        """Return the opponent's mark."""
        return 'O' if player == 'X' else 'X'

    def select_action(self, board):
        """Return the move for 'X' that received the most simulations.

        Args:
            board: current position (list of 9 marks) with 'X' to move; it is
                not modified.

        Raises:
            ValueError: if the board has no empty cell.
        """
        legal = [i for i, v in enumerate(board) if v == ' ']
        if not legal:
            raise ValueError("select_action called on a board with no empty cell")

        # The root is tagged with 'O' as the last mover so that its children are X moves.
        root = MCTSNode(board[:], player='O')
        for _ in range(self.simulations):
            node = root
            # Selection: follow the best UCB1 child down to a leaf.
            while node.children:
                parent_visits = node.visits
                node = max(node.children, key=lambda c: c.ucb1(parent_visits))
            # Expansion: add every legal reply, unless the game is already decided.
            if self._winner(node.board) is None:
                mover = self._other(node.player)
                for move in [i for i, v in enumerate(node.board) if v == ' ']:
                    new_board = node.board[:]
                    new_board[move] = mover
                    node.children.append(MCTSNode(new_board, parent=node, move=move, player=mover))
                if node.children:
                    node = random.choice(node.children)
            # Simulation + backpropagation from the single node reached.
            result = self.rollout(node.board, to_move=self._other(node.player))
            self.backpropagate(node, result)

        if not root.children:
            # No simulation was run, or the position is already decided.
            return legal[0]
        return max(root.children, key=lambda c: c.visits).move

    def rollout(self, board, to_move='O'):
        """Finish the game with uniformly random moves.

        Args:
            board: starting position; it is not modified.
            to_move: mark that plays first in the playout.

        Returns:
            1 if 'X' wins, -1 if 'O' wins, 0 for a draw.
        """
        sim_board = board[:]
        decided = self._winner(sim_board)
        if decided is not None:
            return 1 if decided == 'X' else -1
        player = to_move
        while True:
            moves = [i for i, v in enumerate(sim_board) if v == ' ']
            if not moves:
                return 0
            sim_board[random.choice(moves)] = player
            if self._winner(sim_board) == player:
                return 1 if player == 'X' else -1
            player = self._other(player)

    def backpropagate(self, node, result):
        """Add one visit and the result to ``node`` and all of its ancestors.

        ``result`` is from the point of view of 'X'; it is negated for nodes
        reached by an 'O' move so that each node's ``wins`` reflects how good
        the move was for the player who made it.
        """
        while node is not None:
            node.visits += 1
            node.wins += result if node.player == 'X' else -result
            node = node.parent

    def learn(self, *args, **kwargs):
        """No-op: this agent has nothing to learn."""
        pass


In [None]:
# Run the MCTS (UCT) experiment on a fresh environment, with 50 simulations per move
# (the class default is 100).
env = TicTacToe()
agent_mcts = MCTSAgent(simulations=50)

print("🔧 Entraînement de l'agent MCTS...")
# train_basic_agent only plays episodes and records rewards; it never calls agent.learn,
# and MCTSAgent.learn is a no-op anyway.
rewards_mcts, steps_mcts = train_basic_agent(env, agent_mcts, episodes=1000)

print("\n📈 Graphique des récompenses")
plot_basic_rewards(rewards_mcts, title="MCTS (UCT) - TicTacToe")

print("\n🧪 Évaluation")
# Uses the default of 100 evaluation episodes.
evaluate_basic_agent(agent_mcts, env)


Expert Apprentice

In [None]:
class ExpertApprenticeAgent:
    """Agent that delegates every decision to a fixed expert policy.

    Attributes:
        expert_policy: callable taking a board and returning the move to play.
    """

    def __init__(self, expert_policy):
        self.expert_policy = expert_policy

    def select_action(self, board):
        """Return whatever move the expert policy chooses for ``board``."""
        policy = self.expert_policy
        chosen = policy(board)
        return chosen

    def learn(self, *args, **kwargs):
        """No-op: accepts any arguments and learns nothing."""
        return None

# Example of a very simple expert policy
def heuristic_expert(board):
    """Return the index of the first empty cell among cells 0..8.

    Returns None when none of the nine cells is empty.
    """
    return next((cell for cell in range(9) if board[cell] == ' '), None)


In [None]:
# Run the "expert" experiment on a fresh environment. The expert here is the
# first-empty-cell heuristic, wrapped in ExpertApprenticeAgent.
env = TicTacToe()
expert = ExpertApprenticeAgent(expert_policy=heuristic_expert)

print("🔧 Entraînement de l'agent ExpertApprentice...")
# train_basic_agent only plays episodes and records rewards; it never calls agent.learn,
# and ExpertApprenticeAgent.learn is a no-op anyway.
rewards_expert, steps_expert = train_basic_agent(env, expert, episodes=1000)

print("\n📈 Graphique des récompenses")
plot_basic_rewards(rewards_expert, title="Expert Apprentice - TicTacToe")

print("\n🧪 Évaluation")
# Uses the default of 100 evaluation episodes.
evaluate_basic_agent(expert, env)


Joueur VS Bot

In [None]:
def play_against_agent(agent, env):
    """Interactive console game: the human plays 'X', the agent plays 'O'.

    The human moves first. Moves are written directly to ``env.board``;
    ``env.step`` is not used, because it would play a random opponent move.

    Args:
        agent: object exposing ``select_action(board)``. The agents in this
            notebook assume their own mark is 'X', so the board is handed to
            the agent with the two marks swapped.
        env: game object exposing ``reset()``, ``render()``, ``check_win(mark)``
            and a mutable ``board`` list.

    Raises:
        ValueError: if the agent returns a cell that is not empty.
    """
    print("Bienvenue dans TicTacToe contre l'agent entraîné !")
    print("Vous êtes 'X' (humain), l'agent est 'O'")
    env.reset()
    human_turn = True
    swap_marks = {'X': 'O', 'O': 'X'}

    while True:
        env.render()
        if human_turn:
            try:
                move = int(input("Entrez votre coup (0-8) : "))
            except ValueError:
                print("Entrée invalide. Entrez un nombre entre 0 et 8.")
                continue
            # Explicit range check: a negative index is valid Python and would
            # otherwise silently play a cell counted from the end of the board.
            if not 0 <= move < len(env.board):
                print("Entrée invalide. Entrez un nombre entre 0 et 8.")
                continue
            if env.board[move] != ' ':
                print("Case occupée. Réessayez.")
                continue
            env.board[move] = 'X'

            if env.check_win('X'):
                env.render()
                print("Vous avez gagné !")
                break
            if ' ' not in env.board:
                env.render()
                print("Match nul.")
                break

            human_turn = False

        else:
            # Show the agent a board on which its own marks are 'X' and the human's
            # are 'O'. Without the swap it would play to complete the human's lines.
            state = [swap_marks.get(cell, cell) for cell in env.board]
            action = agent.select_action(state)
            if not 0 <= action < len(env.board) or env.board[action] != ' ':
                raise ValueError(f"agent returned an illegal move: {action!r}")
            print(f"L'agent joue en case {action}.")
            env.board[action] = 'O'

            if env.check_win('O'):
                env.render()
                print(" L'agent a gagné.")
                break
            if ' ' not in env.board:
                env.render()
                print(" Match nul.")
                break

            human_turn = True


In [None]:
# Start an interactive game (it reads moves with input()) against the MCTS agent
# created in an earlier cell, on a fresh game object.
play_against_agent(agent_mcts, TicTacToe())