In [16]:
import sys
sys.path.append('./src')  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time
import random
from tqdm.notebook import tqdm
import torch
import pygame
import os
import gymnasium as gym
from Agents.alpha_zero.model import SimpleAlphaZeroNet
from Agents.alpha_zero.mcts_alpha_zero import MCTSAgent
from Env.env import OthelloEnv
from Agents.alpha_zero.train import train_loop



# Seed every random number generator used in this notebook for reproducibility.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)


<torch._C.Generator at 0x7f263241f9f0>

In [17]:
# Build the Othello environment, the network, and the MCTS agent that wraps it.
NUM_SIMULATIONS = 50  # value passed to MCTSAgent as num_simulations
env = OthelloEnv()
network = SimpleAlphaZeroNet()
agent = MCTSAgent(network=network, num_simulations=NUM_SIMULATIONS)


In [18]:
# Sanity check: print the path of the self_play module that actually gets
# imported, to confirm it is resolved from the expected source tree.
import Agents.alpha_zero.self_play as sp
print(sp.__file__)


/home/euler03/projects/rl_gym/src/Agents/alpha_zero/self_play.py


In [19]:
# Run training. The two returned sequences are plotted against the iteration
# index in the plotting cell further down.
training_losses, training_rewards = train_loop()



Iteration 1/10, Loss: 5.1582, Final Reward: 164.00


KeyboardInterrupt: 

In [None]:


# Save the model weights to a file. The path is kept in one variable so the
# confirmation message always names the file that was actually written.
# NOTE(review): train_loop() is called without arguments above, so confirm
# that it really trains this `network` instance before relying on this file.
MODEL_PATH = 'alpha_zero_model1.pt'
torch.save(network.state_dict(), MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")


In [None]:
class RandomAgent:
    """Baseline player that picks uniformly among the currently valid moves."""

    def __init__(self):
        self.name = "Random"

    def choose_action(self, env):
        """Return a random valid action index for the environment's current state.

        The valid-move flags are read from ``env._get_observation()["valid_moves"]``;
        an index counts as valid when its flag equals 1. When no index is valid,
        the default action 0 is returned.
        """
        flags = env._get_observation()["valid_moves"]
        candidates = [index for index, flag in enumerate(flags) if flag == 1]
        return random.choice(candidates) if candidates else 0

In [None]:
def evaluate_agent(agent, opponent, num_games=100, render=False):
    """Evaluate an agent against an opponent over several games.

    ``agent`` moves whenever the observation's ``current_player`` is 0 (BLACK)
    and ``opponent`` moves otherwise, so the evaluated agent plays BLACK in
    every game.

    Args:
        agent: Evaluated player; must expose ``name`` and ``choose_action(env)``.
        opponent: Opposing player with the same interface.
        num_games: Number of games to play.
        render: Accepted for interface compatibility; currently unused.

    Returns:
        dict with ``wins``, ``losses`` and ``draws`` (counted from the final
        piece counts, from BLACK's point of view), ``win_rate`` and
        ``avg_reward`` (mean per-game sum of the rewards returned by
        ``env.step`` for the evaluated agent's own moves). Both averages are
        0.0 when ``num_games`` is not positive.
    """
    env = OthelloEnv()
    wins = 0
    losses = 0
    draws = 0
    total_rewards = 0

    for _game in tqdm(range(num_games), desc=f"{agent.name} vs {opponent.name}"):
        # Start a fresh game.
        obs, _ = env.reset()
        done = False
        game_reward = 0

        while not done:
            # Pick whose turn it is: 0 is BLACK (evaluated agent), otherwise WHITE.
            current_player = obs["current_player"]
            current_agent = agent if current_player == 0 else opponent

            action = current_agent.choose_action(env)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Only rewards earned by the evaluated agent's own moves are counted.
            if current_player == 0:
                game_reward += reward

        # Decide the outcome from the final piece counts.
        black_count, white_count = env._get_score()
        if black_count > white_count:
            wins += 1
        elif white_count > black_count:
            losses += 1
        else:
            draws += 1

        total_rewards += game_reward

    # Guard the averages so that an empty evaluation does not divide by zero.
    if num_games > 0:
        win_rate = wins / num_games
        avg_reward = total_rewards / num_games
    else:
        win_rate = 0.0
        avg_reward = 0.0

    return {
        "wins": wins,
        "losses": losses,
        "draws": draws,
        "win_rate": win_rate,
        "avg_reward": avg_reward
    }

In [None]:
def visualize_game(agent1, agent2, delay=0.5):
    """Play one game between two agents in a pygame window.

    ``agent1`` moves when the observation's ``current_player`` is 0 (shown as
    "Noir") and ``agent2`` moves otherwise (shown as "Blanc").

    Controls: SPACE plays a single move, A toggles automatic play (one move
    every ``delay`` seconds), Q or closing the window quits.

    Args:
        agent1: Player exposing ``name`` and ``choose_action(env)``.
        agent2: Opposing player with the same interface.
        delay: Minimum number of seconds between moves in automatic mode.

    NOTE(review): the drawing code reads ``BOARD_SIZE``, ``BLACK`` and ``WHITE``
    as module-level globals. ``BOARD_SIZE`` is assigned in the cell that calls
    this function; ``BLACK`` and ``WHITE`` are not defined anywhere in the
    visible notebook - confirm they are in scope before running.
    """
    pygame.init()
    try:
        _run_visualization(agent1, agent2, delay)
    finally:
        # Always shut pygame down, so an exception raised by an agent, the
        # environment or the drawing code does not leave the window open.
        pygame.quit()


def _run_visualization(agent1, agent2, delay):
    """Create the window and run the event/game/render loop until the user quits."""
    # Layout and colour constants for the GUI.
    SQUARE_SIZE = 60
    BOARD_WIDTH = BOARD_SIZE * SQUARE_SIZE
    INFO_PANEL_WIDTH = 300
    WINDOW_WIDTH = BOARD_WIDTH + INFO_PANEL_WIDTH
    WINDOW_HEIGHT = BOARD_WIDTH
    BACKGROUND_COLOR = (0, 120, 0)  # dark green
    LINE_COLOR = (0, 0, 0)  # black
    BLACK_COLOR = (0, 0, 0)  # black
    WHITE_COLOR = (255, 255, 255)  # white
    INFO_PANEL_COLOR = (50, 50, 50)  # dark grey
    TEXT_COLOR = (255, 255, 255)  # white
    VALID_MOVE_COLOR = (0, 255, 0, 100)  # semi-transparent green

    # Create the window.
    screen = pygame.display.set_mode((WINDOW_WIDTH, WINDOW_HEIGHT))
    pygame.display.set_caption(f"{agent1.name} vs {agent2.name}")

    # Fonts.
    title_font = pygame.font.SysFont("Arial", 30, bold=True)
    info_font = pygame.font.SysFont("Arial", 20)
    score_font = pygame.font.SysFont("Arial", 24, bold=True)

    # Overlay surface used to highlight valid moves.
    valid_move_surface = pygame.Surface((SQUARE_SIZE, SQUARE_SIZE), pygame.SRCALPHA)
    valid_move_surface.fill(VALID_MOVE_COLOR)

    # Clock used to cap the frame rate.
    clock = pygame.time.Clock()

    # Game state.
    env = OthelloEnv()
    obs, _ = env.reset()
    done = False
    black_reward = 0
    white_reward = 0
    actions_history = []

    def draw_board():
        """Draw the grid, the valid-move highlights and the pieces."""
        # Board background.
        screen.fill(BACKGROUND_COLOR, (0, 0, BOARD_WIDTH, WINDOW_HEIGHT))

        # Grid lines.
        for i in range(BOARD_SIZE + 1):
            pygame.draw.line(screen, LINE_COLOR, (i * SQUARE_SIZE, 0),
                             (i * SQUARE_SIZE, BOARD_WIDTH), 2)
            pygame.draw.line(screen, LINE_COLOR, (0, i * SQUARE_SIZE),
                             (BOARD_WIDTH, i * SQUARE_SIZE), 2)

        # Current state.
        board = obs["board"]
        valid_moves_array = obs["valid_moves"]

        # Convert the flat valid-move flags into (row, col) pairs.
        valid_moves = []
        for i in range(len(valid_moves_array)):
            if valid_moves_array[i] == 1:
                row, col = i // BOARD_SIZE, i % BOARD_SIZE
                valid_moves.append((row, col))

        # Highlight the valid moves.
        for row, col in valid_moves:
            screen.blit(valid_move_surface, (col * SQUARE_SIZE, row * SQUARE_SIZE))

        # Draw the pieces.
        for row in range(BOARD_SIZE):
            for col in range(BOARD_SIZE):
                center_x = col * SQUARE_SIZE + SQUARE_SIZE // 2
                center_y = row * SQUARE_SIZE + SQUARE_SIZE // 2

                if board[row][col] == BLACK:
                    pygame.draw.circle(screen, BLACK_COLOR, (center_x, center_y),
                                       SQUARE_SIZE // 2 - 5)
                elif board[row][col] == WHITE:
                    pygame.draw.circle(screen, WHITE_COLOR, (center_x, center_y),
                                       SQUARE_SIZE // 2 - 5)

    def draw_info_panel():
        """Draw the side panel: scores, turn, cumulative rewards, recent moves, help."""
        # Panel background.
        pygame.draw.rect(screen, INFO_PANEL_COLOR,
                         (BOARD_WIDTH, 0, INFO_PANEL_WIDTH, WINDOW_HEIGHT))

        # Title.
        title = title_font.render("OTHELLO", True, TEXT_COLOR)
        screen.blit(title, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - title.get_width() // 2, 20))

        # Agents.
        agents_title = info_font.render(f"{agent1.name} (Noir) vs {agent2.name} (Blanc)", True, TEXT_COLOR)
        screen.blit(agents_title, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - agents_title.get_width() // 2, 60))

        # Score.
        black_count, white_count = env._get_score()

        score_title = info_font.render("SCORE", True, TEXT_COLOR)
        screen.blit(score_title, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - score_title.get_width() // 2, 100))

        score_black = score_font.render(f"Noir: {black_count}", True, TEXT_COLOR)
        screen.blit(score_black, (BOARD_WIDTH + 20, 130))

        score_white = score_font.render(f"Blanc: {white_count}", True, TEXT_COLOR)
        screen.blit(score_white, (BOARD_WIDTH + 20, 160))

        # Whose turn it is.
        current_player = "Noir" if obs["current_player"] == 0 else "Blanc"
        current_agent = agent1.name if obs["current_player"] == 0 else agent2.name
        player_text = info_font.render(f"Tour: {current_player} ({current_agent})", True, TEXT_COLOR)
        screen.blit(player_text, (BOARD_WIDTH + 20, 200))

        # Cumulative rewards.
        reward_title = info_font.render("RÉCOMPENSES", True, TEXT_COLOR)
        screen.blit(reward_title, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - reward_title.get_width() // 2, 240))

        reward_black = info_font.render(f"Noir: {black_reward:.1f}", True, TEXT_COLOR)
        screen.blit(reward_black, (BOARD_WIDTH + 20, 270))

        reward_white = info_font.render(f"Blanc: {white_reward:.1f}", True, TEXT_COLOR)
        screen.blit(reward_white, (BOARD_WIDTH + 20, 300))

        # Action history.
        history_title = info_font.render("DERNIÈRES ACTIONS", True, TEXT_COLOR)
        screen.blit(history_title, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - history_title.get_width() // 2, 340))

        # Show the last 5 actions.
        for i, (action, player) in enumerate(actions_history[-5:]):
            row, col = action // BOARD_SIZE, action % BOARD_SIZE
            player_name = "Noir" if player == 0 else "Blanc"
            action_text = info_font.render(f"{player_name}: ({row}, {col})", True, TEXT_COLOR)
            screen.blit(action_text, (BOARD_WIDTH + 20, 370 + i * 25))

        # Instructions.
        instructions = info_font.render("Appuyez sur ESPACE pour faire avancer", True, TEXT_COLOR)
        screen.blit(instructions, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - instructions.get_width() // 2, 490))

        quit_instr = info_font.render("ou Q pour quitter", True, TEXT_COLOR)
        screen.blit(quit_instr, (BOARD_WIDTH + INFO_PANEL_WIDTH // 2 - quit_instr.get_width() // 2, 520))

    # Main loop. last_action_time == 0 doubles as the "play one move now" flag
    # in manual mode, so the very first move is played immediately.
    running = True
    auto_play = False
    last_action_time = 0

    while running:
        current_time = time.time()

        # Handle events.
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_q:  # quit
                    running = False
                elif event.key == pygame.K_SPACE:  # step manually
                    if not done:
                        auto_play = False
                        last_action_time = 0  # force the next move
                elif event.key == pygame.K_a:  # toggle automatic mode
                    auto_play = not auto_play

        # Automatic mode (delay elapsed) or a pending manual step.
        if (auto_play and current_time - last_action_time > delay) or (not auto_play and last_action_time == 0):
            if not done:
                # Pick the agent whose turn it is.
                current_player = obs["current_player"]
                current_agent = agent1 if current_player == 0 else agent2

                # Choose and play the move.
                action = current_agent.choose_action(env)
                next_obs, reward, terminated, truncated, info = env.step(action)

                # Record the move.
                actions_history.append((action, current_player))

                # Credit the reward to the player who moved.
                if current_player == 0:  # BLACK
                    black_reward += reward
                else:  # WHITE
                    white_reward += reward

                # Advance the state.
                obs = next_obs
                done = terminated or truncated

                # Remember when the last move was played.
                last_action_time = current_time
            else:
                # Game over: report the result.
                black_count, white_count = env._get_score()
                if black_count > white_count:
                    print(f"Noir ({agent1.name}) a gagné! {black_count}-{white_count}")
                elif white_count > black_count:
                    print(f"Blanc ({agent2.name}) a gagné! {white_count}-{black_count}")
                else:
                    print(f"Match nul! {black_count}-{white_count}")

                # Pause briefly before closing.
                if auto_play:
                    time.sleep(3)
                    running = False

        # Draw the game.
        draw_board()
        draw_info_panel()

        # Refresh the display.
        pygame.display.flip()

        # Cap the frame rate.
        clock.tick(60)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Assumes train_loop() returned the two lists training_losses and
# training_rewards.


def _plot_training_curve(values, raw_label, y_label, title):
    """Plot one per-iteration series together with its 3-iteration rolling mean."""
    plt.figure(figsize=(12, 5))
    plt.plot(values, marker='o', label=raw_label)
    rolling_mean = pd.Series(values).rolling(3).mean()
    plt.plot(rolling_mean, label='Moyenne mobile (3 itérations)', color='red')
    plt.xlabel("Itération")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


# Loss curve.
_plot_training_curve(
    training_losses,
    'Perte par itération',
    "Perte",
    "Évolution de la perte pendant l'entraînement",
)

# Final-reward curve.
_plot_training_curve(
    training_rewards,
    'Récompense finale par partie',
    "Récompense finale",
    "Évolution des récompenses pendant l'entraînement",
)


In [None]:
def choose_action(self, env):
    """Greedily pick the valid action with the highest policy logit.

    The board from ``env._get_observation()`` is fed to ``self.network`` as a
    float tensor with two leading singleton dimensions, and the returned
    object's ``policy_logits`` are read. Moves whose flag in
    ``obs["valid_moves"]`` is not 1 are excluded before the argmax, so an
    illegal move is never returned while at least one valid move exists.

    Masking is applied only when the valid-move flags line up one-to-one with
    the logits; otherwise (or when no move is valid) the plain argmax over all
    logits is returned.

    Returns:
        int: index of the selected action.
    """
    obs = env._get_observation()
    board = obs["board"]
    state_input = torch.tensor(board, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(self.device)
    with torch.no_grad():
        network_output = self.network(state_input)
    policy_logits = network_output.policy_logits

    # Exclude illegal moves. Softmax is monotonic, so taking the argmax
    # directly on the (masked) logits selects the same action as on the
    # probabilities.
    legal_mask = (torch.as_tensor(obs["valid_moves"]) == 1).reshape(1, -1).to(policy_logits.device)
    if legal_mask.shape == policy_logits.shape and legal_mask.any():
        policy_logits = policy_logits.masked_fill(~legal_mask, float("-inf"))

    return torch.argmax(policy_logits, dim=1).item()

random_agent = RandomAgent()

# Load the saved weights into a fresh network, on CPU, in eval mode.
network = SimpleAlphaZeroNet()
network.load_state_dict(torch.load("alpha_zero_model1.pt", map_location=torch.device("cpu")))
network.eval()

# Rebuild the agent around the freshly loaded network. The previously created
# `agent` still wraps the network instance built earlier, so without this the
# loaded weights would never be used during evaluation.
agent = MCTSAgent(network=network, num_simulations=50)

agent.name = "AlphaGo"
random_agent.name = "Random"

# Evaluate the agent against the random baseline.
results = evaluate_agent(agent, random_agent, num_games=10, render=False)
print("Résultats:", results)

In [None]:
# Board edge length; visualize_game() reads this as a module-level global.
BOARD_SIZE = 8

# Announce the match and the keyboard controls, then open the viewer.
for _message in (
    "Visualisation d'une partie entre l'agent AlphaGo et l'agent aléatoire...",
    "Utilisez ESPACE pour faire avancer manuellement, A pour activer le mode automatique, Q pour quitter.",
):
    print(_message)
visualize_game(agent, random_agent, delay=1.0)

