In [5]:
# Imports and full configuration of ALL the RL algorithms

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Any
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Matplotlib / seaborn defaults used by every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams['figure.figsize'] = (16, 12)
plt.rcParams['font.size'] = 11
sns.set_palette("husl")

# Path configuration: make the project root, its `src` directory and the
# `game/secret_env` directory importable. '../../' is resolved against the
# current working directory, so the notebook must be launched from its own folder.
project_root = os.path.abspath('../../')
sys.path.insert(0, project_root)
sys.path.insert(0, os.path.join(project_root, 'src'))
sys.path.insert(0, os.path.join(project_root, 'game', 'secret_env'))

# Banner announcing the start of the analysis.
print("üöÄ INITIALISATION DE L'ANALYSE COMPL√àTE DE TOUS LES ALGORITHMES RL")
print("=" * 80)

# Import the secret environments and probe each one.
# Result: `env_info` maps an environment name to
# {'states': ..., 'actions': ..., 'class': ...}, or to None when the environment
# could not be instantiated/queried; `available_envs` counts the non-None entries.
try:
    from secret_envs_wrapper import SecretEnv0, SecretEnv1, SecretEnv2, SecretEnv3
    print("‚úÖ Environnements secrets import√©s avec succ√®s")

    # (display name, class) pairs, probed in this order.
    env_configs = [
        ("SecretEnv0", SecretEnv0),
        ("SecretEnv1", SecretEnv1), 
        ("SecretEnv2", SecretEnv2),
        ("SecretEnv3", SecretEnv3)
    ]

    env_info = {}
    for env_name, env_class in env_configs:
        try:
            # Instantiate once just to read the sizes of the state/action spaces.
            env = env_class()
            states = env.num_states()
            actions = env.num_actions()
            env_info[env_name] = {'states': states, 'actions': actions, 'class': env_class}
            print(f"üìä {env_name} - √âtats: {states}, Actions: {actions}")
        except Exception as e:
            # A failing environment is recorded as None so later cells can skip it.
            print(f"‚ùå Erreur avec {env_name}: {e}")
            env_info[env_name] = None

    available_envs = len([k for k,v in env_info.items() if v is not None])
    print(f"\nüéâ {available_envs}/4 environnements secrets fonctionnels !")

except Exception as e:
    # Import failure (or any unexpected error above): continue with no environment.
    print(f"‚ùå Erreur d'import des environnements secrets: {e}")
    env_info = {}
    available_envs = 0

# Import every RL algorithm family of the project.
# `algorithms_status` maps a family key ('DP', 'TD', 'MC', 'DYNA') to True when
# its import succeeded and False otherwise; a failed import is reported but
# does not stop the cell.
algorithms_status = {}

# 1) Dynamic Programming (described in the original notes as the least costly family)
try:
    from dp import PolicyIteration, ValueIteration
    algorithms_status['DP'] = True
    print("‚úÖ Algorithmes Dynamic Programming import√©s depuis src/dp.py")
    print("   ‚Ä¢ PolicyIteration - It√©ration de politique avec √©valuation")
    print("   ‚Ä¢ ValueIteration - It√©ration directe des valeurs")
except Exception as e:
    print(f"‚ùå Erreur import DP: {e}")
    algorithms_status['DP'] = False

# 2) Temporal Difference (described as medium cost)
try:
    from td import Sarsa, QLearning, ExpectedSarsa
    algorithms_status['TD'] = True
    print("‚úÖ Algorithmes Temporal Difference import√©s depuis src/td.py")
    print("   ‚Ä¢ Sarsa - On-policy TD control")
    print("   ‚Ä¢ QLearning - Off-policy TD control")
    print("   ‚Ä¢ ExpectedSarsa - Version expectation de Sarsa")
except Exception as e:
    print(f"‚ùå Erreur import TD: {e}")
    algorithms_status['TD'] = False

# 3) Monte Carlo (described as medium-to-high cost)
try:
    from monte_carlo import MonteCarloES, OnPolicyMC, OffPolicyMC
    algorithms_status['MC'] = True
    print("‚úÖ Algorithmes Monte Carlo import√©s depuis src/monte_carlo.py")
    print("   ‚Ä¢ MonteCarloES - Exploring Starts")
    print("   ‚Ä¢ OnPolicyMC - On-policy First-Visit")
    print("   ‚Ä¢ OffPolicyMC - Importance Sampling")
except Exception as e:
    print(f"‚ùå Erreur import MC: {e}")
    algorithms_status['MC'] = False

# 4) Dyna planning (described as the most costly family)
try:
    from dyna import DynaQ, DynaQPlus
    algorithms_status['DYNA'] = True
    print("‚úÖ Algorithmes Dyna Planning import√©s depuis src/dyna.py")
    print("   ‚Ä¢ DynaQ - Q-Learning + Planning")
    print("   ‚Ä¢ DynaQPlus - Extension avec bonus exploration")
except Exception as e:
    print(f"‚ùå Erreur import DYNA: {e}")
    algorithms_status['DYNA'] = False

# Configuration summary.
# Number of algorithm classes imported per family by the import statements of
# this cell (dp: 2, td: 3, monte_carlo: 3, dyna: 2). Counting them explicitly
# replaces the former "2 algorithms per family" approximation, which
# under-reported the number of environment/algorithm combinations.
algorithms_per_family = {'DP': 2, 'TD': 3, 'MC': 3, 'DYNA': 2}
total_algorithms = sum(
    algorithms_per_family.get(family, 0)
    for family, is_available in algorithms_status.items()
    if is_available
)
total_combinations = available_envs * total_algorithms

print(f"\nüìà CONFIGURATION FINALE :")
print(f"   üéÆ Environnements disponibles: {available_envs}/4")
print(f"   üß† Familles d'algorithmes: {sum(algorithms_status.values())}/4")
print(f"   üîÑ Combinaisons totales: ~{total_combinations}")
# Rough estimate: 2 to 5 minutes per (environment, algorithm) combination.
print(f"   ‚è±Ô∏è  Temps estim√©: {total_combinations * 2}-{total_combinations * 5} minutes")

print("\n" + "=" * 80)
print("üéØ Pr√™t √† lancer l'analyse compl√®te de tous les algorithmes RL !")
print("=" * 80)


üöÄ INITIALISATION DE L'ANALYSE COMPL√àTE DE TOUS LES ALGORITHMES RL
‚úÖ Environnements secrets import√©s avec succ√®s
üìä SecretEnv0 - √âtats: 8192, Actions: 3
üìä SecretEnv1 - √âtats: 65536, Actions: 3
üìä SecretEnv2 - √âtats: 2097152, Actions: 3
üìä SecretEnv3 - √âtats: 65536, Actions: 3

üéâ 4/4 environnements secrets fonctionnels !
‚úÖ Algorithmes Dynamic Programming import√©s depuis src/dp.py
   ‚Ä¢ PolicyIteration - It√©ration de politique avec √©valuation
   ‚Ä¢ ValueIteration - It√©ration directe des valeurs
‚úÖ Algorithmes Temporal Difference import√©s depuis src/td.py
   ‚Ä¢ Sarsa - On-policy TD control
   ‚Ä¢ QLearning - Off-policy TD control
   ‚Ä¢ ExpectedSarsa - Version expectation de Sarsa
‚úÖ Algorithmes Monte Carlo import√©s depuis src/monte_carlo.py
   ‚Ä¢ MonteCarloES - Exploring Starts
   ‚Ä¢ OnPolicyMC - On-policy First-Visit
   ‚Ä¢ OffPolicyMC - Importance Sampling
‚úÖ Algorithmes Dyna Planning import√©s depuis src/dyna.py
   ‚Ä¢ DynaQ - Q-Learning + Planni

In [6]:
# Universal adapter for ALL the RL algorithms

# defaultdict is required by the adapter's transition sampling
from collections import defaultdict

class UniversalSecretEnvAdapter:
    """
    Gym-style adapter around a "secret" environment class.

    The wrapped class must expose ``num_states()``, ``num_actions()``,
    ``reset()``, ``step(action)``, ``state_id()``, ``score()``,
    ``available_actions()`` and ``is_game_over()``; ``set_state(s)`` is used
    when the environment provides it.

    Two interfaces are offered:

    * episodic (``reset`` / ``step``) for TD, Monte Carlo and Dyna agents;
    * tabular (``get_mdp_info``) for Dynamic Programming, backed by a sampled
      transition model that is only built when the state space has at most
      ``LARGE_MDP_THRESHOLD`` states.

    Attributes:
        nS, nA: number of states / actions reported by the environment.
        is_large_mdp: True when ``nS`` exceeds ``LARGE_MDP_THRESHOLD``.
        dp_compatible: True when the sampled model (``P``, ``R``) is available.
        P: float32 array of shape (nS, nA, nS) with estimated transition
            probabilities, or None.
        R: float32 array of shape (nS, nA) with mean sampled rewards, or None.
        terminals: state ids reached by a sampled transition that ended the game.
    """

    # Tunable limits; the defaults are the values that used to be hard-coded.
    LARGE_MDP_THRESHOLD = 10000      # above this many states no DP model is built
    MAX_SAMPLED_STATES = 100         # number of state indices visited while sampling
    SAMPLES_PER_STATE_ACTION = 3     # sampling attempts per (state, action) pair
    MAX_EPISODE_STEPS = 500          # safety cap on the length of an episode
    INVALID_ACTION_REWARD = -0.02    # reward returned for an unavailable action
    TIMEOUT_PENALTY = 0.3            # subtracted from the reward when the cap is hit
    ERROR_REWARD = -1.0              # reward returned when the environment raises

    def __init__(self, secret_env_class, env_name="SecretEnv"):
        """
        Args:
            secret_env_class: zero-argument callable returning a fresh environment.
            env_name: label used in log messages and in ``get_mdp_info()['name']``.
        """
        self.secret_env_class = secret_env_class
        self.env_name = env_name

        # Read the MDP sizes from a throw-away instance.
        temp_env = secret_env_class()
        self.nS = temp_env.num_states()
        self.nA = temp_env.num_actions()

        # Episode tracking.
        self.current_env = None
        self.current_state = None
        self.last_score = 0.0
        self.episode_steps = 0

        # Large state spaces cannot hold a dense (nS, nA, nS) matrix.
        self.is_large_mdp = self.nS > self.LARGE_MDP_THRESHOLD
        self.dp_compatible = not self.is_large_mdp

        # Build the sampled model only when feasible.
        self._build_mdp_dynamics_smart()

        status = "DP-Compatible" if self.dp_compatible else "TD/MC/Dyna-Only"
        print(f"üèóÔ∏è  {env_name} Universal Adapter - √âtats: {self.nS}, Actions: {self.nA} ({status})")

    def _build_mdp_dynamics_smart(self):
        """
        Estimate ``P`` and ``R`` by sampling one-step transitions.

        For large MDPs nothing is built (``P`` and ``R`` are None). Otherwise the
        environment is reset, moved to each sampled state index when it offers
        ``set_state``, and stepped with every available action. Each observed
        transition is recorded under the state id the environment actually
        reported before the step, so an environment that cannot be moved away
        from its reset state no longer produces made-up rows for states it
        never visited. Any failure while building disables DP for this adapter.
        """
        if not self.dp_compatible:
            print(f"üíæ {self.env_name}: Environnement massif ({self.nS} √©tats) - Skip dynamiques DP")
            self.P = None  # no transition matrix
            self.R = None  # no reward matrix
            self.terminals = []
            return

        try:
            print(f"üîß {self.env_name}: Construction des dynamiques DP ({self.nS} √©tats)...")

            # float32 halves the memory footprint compared with numpy's default float64.
            self.P = np.zeros((self.nS, self.nA, self.nS), dtype=np.float32)
            self.R = np.zeros((self.nS, self.nA), dtype=np.float32)
            self.terminals = []

            sample_env = self.secret_env_class()
            states_to_sample = min(self.nS, self.MAX_SAMPLED_STATES)
            can_set_state = hasattr(sample_env, 'set_state')

            # (source_state, action) -> {next_state: count} / [rewards]
            transition_counts = defaultdict(lambda: defaultdict(int))
            observed_rewards = defaultdict(list)

            for s in range(states_to_sample):
                for a in range(self.nA):
                    for _ in range(self.SAMPLES_PER_STATE_ACTION):
                        try:
                            sample_env.reset()
                            if can_set_state:
                                sample_env.set_state(s)

                            if a not in sample_env.available_actions():
                                continue

                            # Use the state the environment really is in as the row index.
                            source_state = sample_env.state_id()
                            old_score = sample_env.score()
                            sample_env.step(a)
                            new_state = sample_env.state_id()
                            new_score = sample_env.score()

                            transition_counts[(source_state, a)][new_state] += 1
                            observed_rewards[(source_state, a)].append(new_score - old_score)

                            if sample_env.is_game_over() and new_state not in self.terminals:
                                self.terminals.append(new_state)
                        except Exception:
                            # Best effort: a failing sample is skipped, but
                            # KeyboardInterrupt/SystemExit still propagate.
                            continue

            # Turn counts into probabilities and rewards into means.
            for (source_state, a), successors in transition_counts.items():
                if not 0 <= source_state < self.nS:
                    continue
                total_samples = sum(successors.values())
                for next_s, count in successors.items():
                    # Reject ids outside the table (negative ids would wrap around).
                    if 0 <= next_s < self.nS:
                        self.P[source_state, a, next_s] = count / total_samples
                self.R[source_state, a] = np.mean(observed_rewards[(source_state, a)])

            print(f"‚úÖ {self.env_name}: Dynamiques DP construites ({states_to_sample} √©tats √©chantillonn√©s)")

        except Exception as e:
            # On any error, fall back to the episodic interface only.
            print(f"‚ùå {self.env_name}: Erreur construction dynamiques - {e}")
            print(f"üîÑ {self.env_name}: Basculement mode TD/MC/Dyna uniquement")
            self.dp_compatible = False
            self.P = None
            self.R = None
            self.terminals = []

    # Standard Gym-like interface
    def reset(self):
        """
        Start a new episode on a fresh environment instance.

        Returns:
            The initial state id, or 0 when the environment could not be
            created/reset (the failure is reported on stdout).
        """
        try:
            self.current_env = self.secret_env_class()
            self.current_env.reset()
            self.current_state = self.current_env.state_id()
            self.last_score = self.current_env.score()
            self.episode_steps = 0
            return self.current_state
        except Exception as e:
            print(f"Reset error {self.env_name}: {e}")
            return 0

    def _apply_step_limit(self, reward, done, info):
        """End the episode (with a penalty) once the step cap has been exceeded."""
        if self.episode_steps > self.MAX_EPISODE_STEPS:
            done = True
            reward -= self.TIMEOUT_PENALTY
            info['timeout'] = True
        return reward, done

    def step(self, action):
        """
        Play ``action`` in the current episode.

        Returns:
            ``(next_state, reward, done, info)`` where ``reward`` is the change
            of the environment score caused by the step.

            * An unavailable action leaves the environment untouched and yields
              ``INVALID_ACTION_REWARD`` with ``info['invalid_action']``. It
              still counts as a step, so an agent that keeps choosing invalid
              actions is stopped by the step cap instead of looping forever.
            * After more than ``MAX_EPISODE_STEPS`` steps the episode is ended,
              ``TIMEOUT_PENALTY`` is subtracted and ``info['timeout']`` is set.
            * If the environment raises, the episode ends with ``ERROR_REWARD``
              and the message is placed in ``info['error']``.
        """
        try:
            if self.current_env is None:
                self.reset()

            if action not in self._get_available_actions():
                self.episode_steps += 1
                info = {'invalid_action': True}
                reward, done = self._apply_step_limit(self.INVALID_ACTION_REWARD, False, info)
                return self.current_state, reward, done, info

            old_score = self.current_env.score()
            self.current_env.step(action)
            self.episode_steps += 1

            next_state = self.current_env.state_id()
            new_score = self.current_env.score()
            reward = new_score - old_score
            done = self.current_env.is_game_over()

            self.current_state = next_state
            self.last_score = new_score

            info = {
                'available_actions': self._get_available_actions(),
                'cumulative_score': new_score,
                'episode_steps': self.episode_steps
            }

            reward, done = self._apply_step_limit(reward, done, info)
            return next_state, reward, done, info

        except Exception as e:
            return self.current_state, self.ERROR_REWARD, True, {'error': str(e)}

    def _get_available_actions(self):
        """
        Return the actions playable in the current state as a list.

        Falls back to every action index when no episode is running, when the
        environment reports no action, or when querying it fails.
        """
        try:
            if self.current_env is None:
                return list(range(self.nA))
            actions = self.current_env.available_actions()
            return list(actions) if len(actions) > 0 else list(range(self.nA))
        except Exception:
            return list(range(self.nA))

    # Interface for DP (MDP dynamics)
    def get_mdp_info(self):
        """
        Describe the MDP for Dynamic Programming algorithms.

        Returns:
            A dict with the state/action index lists, the sampled
            ``transition_matrix`` and ``reward_matrix`` (both None when the
            adapter is not DP-compatible), the terminal state ids, the sizes
            and the environment name.
        """
        return {
            'states': list(range(self.nS)),
            'actions': list(range(self.nA)),
            'transition_matrix': self.P,
            'reward_matrix': self.R,
            'terminals': self.terminals,
            'n_states': self.nS,
            'n_actions': self.nA,
            'name': self.env_name
        }

# Create one universal adapter per available secret environment.
# `adapters` maps an environment name to its adapter, or to None when the
# environment is unavailable or the adapter failed its smoke test.
print("\nüèóÔ∏è CR√âATION DES ADAPTATEURS UNIVERSELS")
print("-" * 60)

adapters = {}
successful_adapters = 0

if env_info:
    for env_name, env_data in env_info.items():
        if env_data is not None:
            try:
                print(f"   üî® Cr√©ation de l'adaptateur {env_name}...")
                adapter = UniversalSecretEnvAdapter(env_data['class'], env_name)

                # Smoke test: one reset followed by one step with action 0.
                test_state = adapter.reset()
                test_next_state, test_reward, test_done, test_info = adapter.step(0)

                adapters[env_name] = adapter
                successful_adapters += 1

                print(f"   ‚úÖ {env_name}: Cr√©√© et test√© (√©tat initial: {test_state})")

            except Exception as e:
                print(f"   ‚ùå {env_name}: Erreur - {e}")
                adapters[env_name] = None
        else:
            print(f"   ‚ùå {env_name}: Environnement non disponible")
            adapters[env_name] = None

    print(f"\nüéâ {successful_adapters}/{len(env_info)} adaptateurs universels cr√©√©s !")
else:
    print("‚ùå Aucun environnement secret disponible")

# NOTE(review): this import repeats the one at the top of the cell and runs
# after the adapters have already been built; it has no effect here.
from collections import defaultdict

print("-" * 60)



üèóÔ∏è CR√âATION DES ADAPTATEURS UNIVERSELS
------------------------------------------------------------
   üî® Cr√©ation de l'adaptateur SecretEnv0...
üîß SecretEnv0: Construction des dynamiques DP (8192 √©tats)...
‚úÖ SecretEnv0: Dynamiques DP construites (100 √©tats √©chantillonn√©s)
üèóÔ∏è  SecretEnv0 Universal Adapter - √âtats: 8192, Actions: 3 (DP-Compatible)
   ‚úÖ SecretEnv0: Cr√©√© et test√© (√©tat initial: 0)
   üî® Cr√©ation de l'adaptateur SecretEnv1...
üíæ SecretEnv1: Environnement massif (65536 √©tats) - Skip dynamiques DP
üèóÔ∏è  SecretEnv1 Universal Adapter - √âtats: 65536, Actions: 3 (TD/MC/Dyna-Only)
   ‚úÖ SecretEnv1: Cr√©√© et test√© (√©tat initial: 0)
   üî® Cr√©ation de l'adaptateur SecretEnv2...
üíæ SecretEnv2: Environnement massif (2097152 √©tats) - Skip dynamiques DP
üèóÔ∏è  SecretEnv2 Universal Adapter - √âtats: 2097152, Actions: 3 (TD/MC/Dyna-Only)
   ‚úÖ SecretEnv2: Cr√©√© et test√© (√©tat initial: 0)
   üî® Cr√©ation de l'adaptateur SecretEnv3..

In [7]:
# CORRECTED UNIVERSAL ADAPTER - smart handling of large environments

# defaultdict is required by the adapter's transition sampling (bug fix)
from collections import defaultdict

class OptimizedSecretEnvAdapter:
    """
    Memory-aware Gym-style adapter around a "secret" environment class.

    The wrapped class must expose ``num_states()``, ``num_actions()``,
    ``reset()``, ``step(action)``, ``state_id()``, ``score()``,
    ``available_actions()`` and ``is_game_over()``; ``set_state(s)`` is used
    when the environment provides it.

    * The episodic interface (``reset`` / ``step``) is always available and is
      meant for TD, Monte Carlo and Dyna agents.
    * The tabular interface (``get_mdp_info``) is only available when the state
      space has at most ``LARGE_MDP_THRESHOLD`` states, because the dense
      (nS, nA, nS) transition matrix would not fit in memory otherwise.

    Attributes:
        nS, nA: number of states / actions reported by the environment.
        is_large_mdp: True when ``nS`` exceeds ``LARGE_MDP_THRESHOLD``.
        dp_compatible: True when the sampled model (``P``, ``R``) is available.
        P: float32 array of shape (nS, nA, nS) with estimated transition
            probabilities, or None.
        R: float32 array of shape (nS, nA) with mean sampled rewards, or None.
        terminals: state ids reached by a sampled transition that ended the game.
    """

    # Tunable limits; the defaults are the values that used to be hard-coded.
    LARGE_MDP_THRESHOLD = 10000      # above this many states no DP model is built
    MAX_SAMPLED_STATES = 200         # number of state indices visited while sampling
    SAMPLES_PER_STATE_ACTION = 3     # sampling attempts per (state, action) pair
    MAX_EPISODE_STEPS = 500          # safety cap on the length of an episode
    INVALID_ACTION_REWARD = -0.02    # reward returned for an unavailable action
    TIMEOUT_PENALTY = 0.3            # subtracted from the reward when the cap is hit
    ERROR_REWARD = -1.0              # reward returned when the environment raises

    def __init__(self, secret_env_class, env_name="SecretEnv"):
        """
        Args:
            secret_env_class: zero-argument callable returning a fresh environment.
            env_name: label used in log messages and in ``get_mdp_info()['name']``.
        """
        self.secret_env_class = secret_env_class
        self.env_name = env_name

        # Read the MDP sizes from a throw-away instance.
        temp_env = secret_env_class()
        self.nS = temp_env.num_states()
        self.nA = temp_env.num_actions()

        # Episode tracking.
        self.current_env = None
        self.current_state = None
        self.last_score = 0.0
        self.episode_steps = 0

        # Large state spaces cannot hold a dense (nS, nA, nS) matrix.
        self.is_large_mdp = self.nS > self.LARGE_MDP_THRESHOLD
        self.dp_compatible = not self.is_large_mdp

        # Build the sampled model only when feasible.
        self._build_mdp_dynamics_smart()

        status = "‚úÖ DP-Compatible" if self.dp_compatible else "‚ö° TD/MC/Dyna-Only"
        print(f"üèóÔ∏è  {env_name} Optimized Adapter - √âtats: {self.nS:,}, Actions: {self.nA} ({status})")

    def _build_mdp_dynamics_smart(self):
        """
        Estimate ``P`` and ``R`` by sampling one-step transitions.

        For large MDPs nothing is built (``P`` and ``R`` are None). Otherwise the
        environment is reset, moved to each sampled state index when it offers
        ``set_state``, and stepped with every available action. Each observed
        transition is recorded under the state id the environment actually
        reported before the step, so an environment that cannot be moved away
        from its reset state no longer produces made-up rows for states it
        never visited. Any failure while building disables DP for this adapter.
        """
        if not self.dp_compatible:
            print(f"üíæ {self.env_name}: Environnement massif ({self.nS:,} √©tats) - Skip dynamiques DP")
            # The size shown assumes 8 bytes per matrix entry.
            print(f"   üí° Matrice P serait de {self.nS * self.nA * self.nS * 8 / (1024**3):.1f} GB !")
            self.P = None  # no transition matrix
            self.R = None  # no reward matrix
            self.terminals = []
            return

        try:
            print(f"üîß {self.env_name}: Construction des dynamiques DP ({self.nS:,} √©tats)...")

            # float32 halves the memory footprint compared with numpy's default float64.
            self.P = np.zeros((self.nS, self.nA, self.nS), dtype=np.float32)
            self.R = np.zeros((self.nS, self.nA), dtype=np.float32)
            self.terminals = []

            sample_env = self.secret_env_class()
            states_to_sample = min(self.nS, self.MAX_SAMPLED_STATES)
            can_set_state = hasattr(sample_env, 'set_state')

            # (source_state, action) -> {next_state: count} / [rewards]
            transition_counts = defaultdict(lambda: defaultdict(int))
            observed_rewards = defaultdict(list)

            # The context manager closes the bar even if sampling is interrupted.
            with tqdm(total=states_to_sample * self.nA, desc=f"Construction DP {self.env_name}", leave=False) as progress_bar:
                for s in range(states_to_sample):
                    for a in range(self.nA):
                        for _ in range(self.SAMPLES_PER_STATE_ACTION):
                            try:
                                sample_env.reset()
                                if can_set_state:
                                    sample_env.set_state(s)

                                if a not in sample_env.available_actions():
                                    continue

                                # Use the state the environment really is in as the row index.
                                source_state = sample_env.state_id()
                                old_score = sample_env.score()
                                sample_env.step(a)
                                new_state = sample_env.state_id()
                                new_score = sample_env.score()

                                transition_counts[(source_state, a)][new_state] += 1
                                observed_rewards[(source_state, a)].append(new_score - old_score)

                                if sample_env.is_game_over() and new_state not in self.terminals:
                                    self.terminals.append(new_state)
                            except Exception:
                                # Best effort: a failing sample is skipped, but
                                # KeyboardInterrupt/SystemExit still propagate.
                                continue

                        progress_bar.update(1)

            # Turn counts into probabilities and rewards into means.
            for (source_state, a), successors in transition_counts.items():
                if not 0 <= source_state < self.nS:
                    continue
                total_samples = sum(successors.values())
                for next_s, count in successors.items():
                    # Reject ids outside the table (negative ids would wrap around).
                    if 0 <= next_s < self.nS:
                        self.P[source_state, a, next_s] = count / total_samples
                self.R[source_state, a] = np.mean(observed_rewards[(source_state, a)])

            print(f"‚úÖ {self.env_name}: Dynamiques DP construites ({states_to_sample} √©tats, {len(self.terminals)} terminaux)")

        except Exception as e:
            # On any error, fall back to the episodic interface only.
            print(f"‚ùå {self.env_name}: Erreur construction dynamiques - {e}")
            print(f"üîÑ {self.env_name}: Basculement mode TD/MC/Dyna uniquement")
            self.dp_compatible = False
            self.P = None
            self.R = None
            self.terminals = []

    # Standard Gym-like interface for TD/MC/Dyna
    def reset(self):
        """
        Start a new episode on a fresh environment instance.

        Returns:
            The initial state id, or 0 when the environment could not be
            created/reset (the failure is reported on stdout).
        """
        try:
            self.current_env = self.secret_env_class()
            self.current_env.reset()
            self.current_state = self.current_env.state_id()
            self.last_score = self.current_env.score()
            self.episode_steps = 0
            return self.current_state
        except Exception as e:
            print(f"‚ö†Ô∏è Reset error {self.env_name}: {e}")
            return 0

    def _apply_step_limit(self, reward, done, info):
        """End the episode (with a penalty) once the step cap has been exceeded."""
        if self.episode_steps > self.MAX_EPISODE_STEPS:
            done = True
            reward -= self.TIMEOUT_PENALTY
            info['timeout'] = True
        return reward, done

    def step(self, action):
        """
        Play ``action`` in the current episode.

        Returns:
            ``(next_state, reward, done, info)`` where ``reward`` is the change
            of the environment score caused by the step.

            * An unavailable action leaves the environment untouched and yields
              ``INVALID_ACTION_REWARD`` with ``info['invalid_action']``. It
              still counts as a step, so an agent that keeps choosing invalid
              actions is stopped by the step cap instead of looping forever.
            * After more than ``MAX_EPISODE_STEPS`` steps the episode is ended,
              ``TIMEOUT_PENALTY`` is subtracted and ``info['timeout']`` is set.
            * If the environment raises, the episode ends with ``ERROR_REWARD``
              and the message is placed in ``info['error']``.
        """
        try:
            if self.current_env is None:
                self.reset()

            if action not in self._get_available_actions():
                self.episode_steps += 1
                info = {'invalid_action': True}
                reward, done = self._apply_step_limit(self.INVALID_ACTION_REWARD, False, info)
                return self.current_state, reward, done, info

            old_score = self.current_env.score()
            self.current_env.step(action)
            self.episode_steps += 1

            next_state = self.current_env.state_id()
            new_score = self.current_env.score()
            reward = new_score - old_score  # differential reward
            done = self.current_env.is_game_over()

            self.current_state = next_state
            self.last_score = new_score

            info = {
                'available_actions': self._get_available_actions(),
                'cumulative_score': new_score,
                'episode_steps': self.episode_steps
            }

            reward, done = self._apply_step_limit(reward, done, info)
            return next_state, reward, done, info

        except Exception as e:
            # The environment raised: end the episode.
            return self.current_state, self.ERROR_REWARD, True, {'error': str(e)}

    def _get_available_actions(self):
        """
        Return the actions playable in the current state as a list.

        Falls back to every action index when no episode is running, when the
        environment reports no action, or when querying it fails.
        """
        try:
            if self.current_env is None:
                return list(range(self.nA))
            actions = self.current_env.available_actions()
            return list(actions) if len(actions) > 0 else list(range(self.nA))
        except Exception:
            return list(range(self.nA))

    # Interface for DP (MDP dynamics)
    def get_mdp_info(self):
        """
        Describe the MDP for Dynamic Programming algorithms.

        Returns:
            None when the adapter is not DP-compatible; otherwise a dict with
            the state/action index lists, the sampled ``transition_matrix`` and
            ``reward_matrix``, the terminal state ids, the sizes and the
            environment name.
        """
        if not self.dp_compatible:
            return None

        return {
            'states': list(range(self.nS)),
            'actions': list(range(self.nA)),
            'transition_matrix': self.P,
            'reward_matrix': self.R,
            'terminals': self.terminals,
            'n_states': self.nS,
            'n_actions': self.nA,
            'name': self.env_name
        }

# Create one optimized adapter per available secret environment.
# `adapters` maps an environment name to its adapter, or to None when the
# environment is unavailable or the adapter failed its smoke test.
# Fix: the headings used "\\n", which printed a literal backslash-n instead of
# a blank line; they now use a real newline escape.
print("\nüèóÔ∏è CR√âATION DES ADAPTATEURS OPTIMIS√âS (VERSION CORRIG√âE)")
print("=" * 70)

adapters = {}
successful_adapters = 0
dp_compatible_adapters = 0

if env_info:
    for env_name, env_data in env_info.items():
        if env_data is not None:
            try:
                print(f"\nüî® Cr√©ation de l'adaptateur {env_name}...")
                adapter = OptimizedSecretEnvAdapter(env_data['class'], env_name)

                # Smoke test: one reset followed by one step with action 0.
                test_state = adapter.reset()
                test_next_state, test_reward, test_done, test_info = adapter.step(0)

                adapters[env_name] = adapter
                successful_adapters += 1

                if adapter.dp_compatible:
                    dp_compatible_adapters += 1
                    print(f"‚úÖ {env_name}: Cr√©√© et test√© - COMPATIBLE DP (√©tat initial: {test_state})")
                else:
                    print(f"‚úÖ {env_name}: Cr√©√© et test√© - TD/MC/Dyna seulement (√©tat initial: {test_state})")

            except Exception as e:
                print(f"‚ùå {env_name}: Erreur - {e}")
                adapters[env_name] = None
        else:
            print(f"‚ùå {env_name}: Environnement non disponible")
            adapters[env_name] = None

    print(f"\nüéâ R√âSULTAT FINAL:")
    print(f"   ‚úÖ {successful_adapters}/{len(env_info)} adaptateurs cr√©√©s avec succ√®s")
    print(f"   üèóÔ∏è  {dp_compatible_adapters} adaptateurs compatibles Dynamic Programming")
    print(f"   ‚ö° {successful_adapters - dp_compatible_adapters} adaptateurs TD/MC/Dyna uniquement")

    if dp_compatible_adapters == 0:
        print("\n‚ö†Ô∏è  AUCUN adaptateur compatible DP d√©tect√©")
        print("   üîÑ Les algorithmes Dynamic Programming seront automatiquement skipp√©s")
        # No environment can run DP: switch the whole family off for later cells.
        algorithms_status['DP'] = False

else:
    print("‚ùå Aucun environnement secret disponible")

print("=" * 70)


\nüèóÔ∏è CR√âATION DES ADAPTATEURS OPTIMIS√âS (VERSION CORRIG√âE)
\nüî® Cr√©ation de l'adaptateur SecretEnv0...
üîß SecretEnv0: Construction des dynamiques DP (8,192 √©tats)...


Construction DP SecretEnv0:   0%|          | 0/600 [00:00<?, ?it/s]

‚úÖ SecretEnv0: Dynamiques DP construites (200 √©tats, 0 terminaux)
üèóÔ∏è  SecretEnv0 Optimized Adapter - √âtats: 8,192, Actions: 3 (‚úÖ DP-Compatible)
‚úÖ SecretEnv0: Cr√©√© et test√© - COMPATIBLE DP (√©tat initial: 0)
\nüî® Cr√©ation de l'adaptateur SecretEnv1...
üíæ SecretEnv1: Environnement massif (65,536 √©tats) - Skip dynamiques DP
   üí° Matrice P serait de 96.0 GB !
üèóÔ∏è  SecretEnv1 Optimized Adapter - √âtats: 65,536, Actions: 3 (‚ö° TD/MC/Dyna-Only)
‚úÖ SecretEnv1: Cr√©√© et test√© - TD/MC/Dyna seulement (√©tat initial: 0)
\nüî® Cr√©ation de l'adaptateur SecretEnv2...
üíæ SecretEnv2: Environnement massif (2,097,152 √©tats) - Skip dynamiques DP
   üí° Matrice P serait de 98304.0 GB !
üèóÔ∏è  SecretEnv2 Optimized Adapter - √âtats: 2,097,152, Actions: 3 (‚ö° TD/MC/Dyna-Only)
‚úÖ SecretEnv2: Cr√©√© et test√© - TD/MC/Dyna seulement (√©tat initial: 0)
\nüî® Cr√©ation de l'adaptateur SecretEnv3...
üíæ SecretEnv3: Environnement massif (65,536 √©tats) - Skip dynamiques DP

In [8]:
# Universal analysis and visualisation functions

def _curve_episode_axis(history):
    """Return the x axis of a history: each record's 'episode', else its index."""
    return [record.get('episode', position) for position, record in enumerate(history)]


def _curve_values(history, candidate_keys):
    """Return the per-record values of the first candidate key found in the first record.

    Records that lack the chosen key contribute NaN, so the series always has
    one value per record. Returns None when no candidate key is present.
    """
    for key in candidate_keys:
        if key in history[0]:
            return [record.get(key, float('nan')) for record in history]
    return None


def _curve_legend(ax):
    """Draw a legend only when the axes hold at least one labelled artist."""
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()


def plot_learning_curves_universal(results_dict, title_prefix="", algorithm_type=""):
    """
    Plot a 2x2 learning dashboard for several algorithms.

    Panels: smoothed return per episode, episode length, rolling standard
    deviation of the returns (window of 50 episodes) and, when recorded, the
    exploration parameter (epsilon, otherwise alpha).

    Args:
        results_dict: maps an algorithm name to a result dict whose 'history'
            entry is a list of per-episode dicts. Recognised keys are
            'episode', 'reward' or 'return', 'length' / 'episode_length' /
            'steps', 'epsilon' and 'alpha'. Entries that are not dicts or have
            no history are ignored.
        title_prefix: text placed at the start of every panel title.
        algorithm_type: family label shown in the first panel title.

    Returns:
        None. The figure is shown with ``plt.show()``; when no usable history
        exists a message is printed and nothing is drawn.
    """
    # Keep the original position of each algorithm so colours stay stable
    # even when some entries are skipped.
    runs = [
        (index, alg_name, result['history'])
        for index, (alg_name, result) in enumerate((results_dict or {}).items())
        if isinstance(result, dict) and result.get('history')
    ]
    if not runs:
        print(f"‚ö†Ô∏è Aucune donn√©e d'apprentissage pour {title_prefix}")
        return

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()
    colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#e67e22']

    # 1. Smoothed return per episode (main metric).
    ax1 = axes[0]
    for index, alg_name, history in runs:
        returns = _curve_values(history, ('reward', 'return'))
        if returns is None:
            continue
        episodes = _curve_episode_axis(history)
        color = colors[index % len(colors)]

        # Moving average; the window never exceeds the number of episodes.
        window_size = min(30, len(returns) // 10 + 1)
        returns_smooth = pd.Series(returns).rolling(window=window_size, min_periods=1).mean()
        ax1.plot(episodes, returns_smooth, label=alg_name, color=color, linewidth=2.5)
        ax1.plot(episodes, returns, alpha=0.2, color=color, linewidth=0.8)

    ax1.set_title(f'{title_prefix} - {algorithm_type}\nRetour Cumulatif Moyen par √âpisode', fontsize=14, fontweight='bold')
    ax1.set_xlabel('√âpisode')
    ax1.set_ylabel('Retour Cumulatif Moyen')
    _curve_legend(ax1)
    ax1.grid(True, alpha=0.3)

    # 2. Episode length.
    ax2 = axes[1]
    for index, alg_name, history in runs:
        lengths = _curve_values(history, ('length', 'episode_length', 'steps'))
        if lengths is None:
            continue
        ax2.plot(_curve_episode_axis(history), lengths, label=alg_name,
                 color=colors[index % len(colors)], linewidth=2, alpha=0.7)

    ax2.set_title(f'{title_prefix} - Longueur des √âpisodes', fontsize=14, fontweight='bold')
    ax2.set_xlabel('√âpisode')
    ax2.set_ylabel('Nombre de Steps')
    _curve_legend(ax2)
    ax2.grid(True, alpha=0.3)

    # 3. Convergence: standard deviation of the previous 50 returns.
    ax3 = axes[2]
    window_size = 50
    for index, alg_name, history in runs:
        returns = [h.get('reward', h.get('return', 0)) for h in history]
        episode_axis = _curve_episode_axis(history)

        positions = range(window_size, len(history))
        episodes = [episode_axis[j] for j in positions]
        stds = [np.std(returns[j - window_size:j]) for j in positions]

        if len(episodes) > 0:
            ax3.plot(episodes, stds, label=alg_name, color=colors[index % len(colors)], linewidth=2)

    ax3.set_title(f'{title_prefix} - Stabilit√© (√âcart-type returns)', fontsize=14, fontweight='bold')
    ax3.set_xlabel('√âpisode')
    ax3.set_ylabel('√âcart-type des Returns')
    _curve_legend(ax3)
    ax3.grid(True, alpha=0.3)

    # 4. Exploration parameter, when the history records one.
    ax4 = axes[3]
    has_exploration = False
    for index, alg_name, history in runs:
        episodes = _curve_episode_axis(history)
        color = colors[index % len(colors)]

        if 'epsilon' in history[0]:
            epsilon_values = _curve_values(history, ('epsilon',))
            ax4.plot(episodes, epsilon_values, label=f"{alg_name} Œµ", color=color, linewidth=2)
            has_exploration = True
        elif 'alpha' in history[0]:
            alpha_values = _curve_values(history, ('alpha',))
            ax4.plot(episodes, alpha_values, label=f"{alg_name} Œ±", color=color, linewidth=2)
            has_exploration = True

    if has_exploration:
        ax4.set_title(f'{title_prefix} - Param√®tres d\'Exploration', fontsize=14, fontweight='bold')
        ax4.set_xlabel('√âpisode')
        ax4.set_ylabel('Valeur du Param√®tre')
        ax4.legend()
        ax4.grid(True, alpha=0.3)
    else:
        ax4.text(0.5, 0.5, 'Pas de param√®tres\nd\'exploration disponibles',
                 ha='center', va='center', transform=ax4.transAxes, fontsize=12)
        ax4.set_title(f'{title_prefix} - Param√®tres d\'Exploration', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

def plot_boxplots_final_returns(results_by_env, algorithm_type=""):
    """Show grouped boxplots of the end-of-training returns per environment.

    Args:
        results_by_env: Mapping ``{env_name: {alg_name: result}}``. Only results
            holding a non-empty ``'history'`` list contribute. Each history
            entry is read with ``entry.get('reward', entry.get('return', 0))``.
        algorithm_type: Label prepended to the figure title.

    Returns:
        None. A warning is printed and nothing is drawn when no usable
        history is found.
    """
    if not results_by_env:
        print("‚ö†Ô∏è Aucune donn√©e pour les boxplots")
        return

    # One row per episode, restricted to the last 20 history entries of each
    # (environment, algorithm) run.
    rows = [
        {
            'Environment': env_name,
            'Algorithm': alg_name,
            'Final_Return': entry.get('reward', entry.get('return', 0)),
        }
        for env_name, env_results in results_by_env.items()
        for alg_name, result in env_results.items()
        if 'history' in result and result['history']
        for entry in result['history'][-20:]
    ]

    if not rows:
        print("‚ö†Ô∏è Aucune donn√©e valide pour les boxplots")
        return

    returns_frame = pd.DataFrame(rows)

    plt.figure(figsize=(15, 8))
    sns.boxplot(
        data=returns_frame,
        x='Environment',
        y='Final_Return',
        hue='Algorithm',
        palette='husl',
    )
    plt.title(
        f'{algorithm_type} - Distribution des Returns Finaux par Environnement',
        fontsize=16,
        fontweight='bold',
    )
    plt.ylabel('Return Final (20 derniers √©pisodes)')
    plt.xlabel('Environnement')
    plt.xticks(rotation=45)
    # Legend is anchored outside the axes, on the right.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_policy_heatmap(algorithm_result, env_name, alg_name):
    """Display the learned policy as a heatmap, when one is available.

    A 1-D policy (one action per state) is laid out on a grid: a square grid
    when the number of states is a perfect square, otherwise a
    ``floor(sqrt(n)) x ceil(n / rows)`` grid whose trailing cells are padded
    with ``-1``. Any other array is handed to ``imshow`` unchanged.

    Args:
        algorithm_result: Result dict; its ``'policy'`` entry is plotted.
        env_name: Environment name, used in the title and messages.
        alg_name: Algorithm name, used in the title and messages.

    Returns:
        None. Problems while plotting are reported with a printed message
        instead of being raised.
    """
    if 'policy' not in algorithm_result:
        print(f"‚ö†Ô∏è Pas de politique disponible pour {alg_name} sur {env_name}")
        return

    fig = None
    try:
        # Coerce first: a policy stored as a plain list/tuple has no `.shape`,
        # which previously skipped the grid layout and then failed on
        # `policy_grid.shape` after the figure had been created.
        policy = np.asarray(algorithm_result['policy'])

        if policy.ndim == 1:
            nS = len(policy)
            rows = int(np.sqrt(nS))
            if rows * rows == nS:
                policy_grid = policy.reshape((rows, rows))
            else:
                # Rectangular grid; -1 marks cells that map to no real state.
                cols = int(np.ceil(nS / rows))
                policy_padded = np.pad(policy, (0, rows * cols - nS), constant_values=-1)
                policy_grid = policy_padded.reshape((rows, cols))
        else:
            policy_grid = policy

        fig = plt.figure(figsize=(10, 8))
        heatmap = plt.imshow(policy_grid, cmap='viridis', aspect='auto')
        plt.colorbar(heatmap, label='Action Choisie')
        plt.title(f'Heatmap de la Politique - {alg_name} sur {env_name}',
                  fontsize=14, fontweight='bold')
        plt.xlabel('√âtat (dimension 2)')
        plt.ylabel('√âtat (dimension 1)')

        # Write the values inside the cells, for small 2-D grids only.
        if (policy_grid.ndim == 2
                and policy_grid.shape[0] <= 10
                and policy_grid.shape[1] <= 10):
            for i in range(policy_grid.shape[0]):
                for j in range(policy_grid.shape[1]):
                    if policy_grid[i, j] >= 0:  # skip padded cells
                        plt.text(j, i, int(policy_grid[i, j]), ha='center', va='center',
                                 color='white', fontweight='bold')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Do not leave a half-built figure open when plotting fails.
        if fig is not None:
            plt.close(fig)
        print(f"‚ö†Ô∏è Erreur lors de la cr√©ation de la heatmap pour {alg_name}: {e}")

def _first_present(record, keys):
    """Return ``record[key]`` for the first of ``keys`` found in ``record``, else None."""
    for key in keys:
        if key in record:
            return record[key]
    return None


def generate_comparison_table(results_by_env, algorithm_type=""):
    """Build and print a comparison table of returns, step counts and win rates.

    One row is produced per (environment, algorithm) result that has either a
    non-empty ``'history'`` or an ``'error'`` entry. Every metric is stored as
    a formatted string. A metric for which no data was recorded is reported as
    ``"N/A"`` rather than as a zero that would look like a real measurement.

    Args:
        results_by_env: Mapping ``{env_name: {alg_name: result}}``.
        algorithm_type: Label shown in the printed table header.

    Returns:
        pandas.DataFrame with one row per reported result (empty when
        ``results_by_env`` is empty).
    """
    if not results_by_env:
        print("‚ö†Ô∏è Aucune donn√©e pour le tableau comparatif")
        return pd.DataFrame()

    comparison_data = []

    for env_name, env_results in results_by_env.items():
        for alg_name, result in env_results.items():
            history = result.get('history')

            if history:
                all_returns = [h.get('reward', h.get('return', 0)) for h in history]
                final_returns = all_returns[-20:]  # last 20 episodes

                # Training win rate: share of episodes with a positive return.
                win_rate = sum(1 for r in all_returns if r > 0) / len(all_returns)

                # Keep only episodes that actually recorded a length. Filling
                # the gaps with 0 made the "N/A" branch unreachable and
                # reported "0.0" steps for runs without any step data.
                lengths = [
                    length
                    for length in (
                        _first_present(h, ('length', 'episode_length', 'steps'))
                        for h in history
                    )
                    if length is not None
                ]

                # Evaluation metrics, when an evaluation mapping is present.
                evaluation = result.get('evaluation')
                if not hasattr(evaluation, 'get'):
                    evaluation = {}
                eval_return = _first_present(evaluation, ('avg_reward', 'average_reward'))
                eval_win_rate = evaluation.get('success_rate')
                eval_steps = _first_present(evaluation, ('avg_length', 'average_steps'))

                comparison_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Return_Final_Moyen': f"{np.mean(final_returns):.3f}",
                    'Return_Final_Std': f"{np.std(final_returns):.3f}",
                    'Steps_Moyens': f"{np.mean(lengths):.1f}" if lengths else "N/A",
                    'Taux_Victoire_Train': f"{win_rate:.1%}",
                    'Return_Eval': f"{eval_return:.3f}" if eval_return is not None else "N/A",
                    'Taux_Victoire_Eval': (
                        f"{eval_win_rate:.1%}" if eval_win_rate is not None else "N/A"
                    ),
                    'Steps_Eval': (
                        f"{eval_steps:.1f}"
                        if eval_steps is not None and eval_steps > 0
                        else "N/A"
                    ),
                })

            elif 'error' in result:
                comparison_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Return_Final_Moyen': "ERREUR",
                    'Return_Final_Std': "N/A",
                    'Steps_Moyens': "N/A",
                    'Taux_Victoire_Train': "N/A",
                    'Return_Eval': "N/A",
                    'Taux_Victoire_Eval': "N/A",
                    'Steps_Eval': "N/A"
                })

    df = pd.DataFrame(comparison_data)

    print(f"\nüìã TABLEAU COMPARATIF - {algorithm_type}")
    print("=" * 100)
    print(df.to_string(index=False))

    return df

# Cell-level confirmation printed once the analysis/plotting helpers above are defined.
print("üìä Fonctions d'analyse universelles d√©finies !")


üìä Fonctions d'analyse universelles d√©finies !


In [None]:
# 1️⃣ DYNAMIC PROGRAMMING - Algorithmes les Moins Coûteux ⚡

def _run_dp_algorithm(algo_class, algo_key, display_name, adapter, env_name,
                      gamma, theta, max_iterations):
    """Run one Dynamic Programming solver on one environment.

    Args:
        algo_class: Solver class, instantiated as
            ``algo_class(adapter, gamma=..., theta=...)``.
        algo_key: Identifier stored in the result and used in log messages.
        display_name: Human-readable name used for the progress bar label.
        adapter: Environment adapter; ``adapter.nS`` is read for the fallback.
        env_name: Environment name stored in the result.
        gamma: Discount factor passed to the solver.
        theta: Convergence threshold passed to the solver.
        max_iterations: Iteration cap passed to ``train`` when it exists.

    Returns:
        The solver's result dict completed with bookkeeping keys, or
        ``{'error': ..., 'algorithm': algo_key}`` when anything raised.
    """
    try:
        with tqdm(total=1, desc=f"{display_name} {env_name}", unit="algo") as pbar:
            solver = algo_class(adapter, gamma=gamma, theta=theta)
            print(f"   üèóÔ∏è  {algo_key} initialis√©")

            # DP solvers have no episodes, only sweeps/iterations; the solver
            # interface is not standardised, so probe for a usable entry point.
            start_time = time.perf_counter()
            if hasattr(solver, 'train'):
                result = solver.train(max_iterations=max_iterations)
            elif hasattr(solver, 'solve'):
                result = solver.solve()
            else:
                # Placeholder all-zero solution: nothing was solved, so it is
                # explicitly flagged as not converged.
                result = {
                    'V': np.zeros(adapter.nS),
                    'policy': np.zeros(adapter.nS, dtype=int),
                    'converged': False,
                }
            training_time = time.perf_counter() - start_time

            result['algorithm'] = algo_key
            result['env_name'] = env_name
            result['training_time'] = training_time
            # Keep the solver's own convergence flag when it reports one
            # instead of overwriting it with True.
            result.setdefault('converged', True)

            # Single synthetic history entry so the shared visualisation
            # helpers (which expect a per-episode history) accept DP results.
            result['history'] = [{
                'episode': 1,
                'reward': 0.0,  # DP has no per-episode reward
                'iteration': 1,
                'converged': True
            }]

            print(f"   ‚úÖ {algo_key} termin√© en {training_time:.2f}s")
            pbar.update(1)
            return result

    except Exception as e:
        print(f"   ‚ùå Erreur {algo_key}: {e}")
        return {'error': str(e), 'algorithm': algo_key}


def train_dp_algorithms():
    """Run Policy Iteration and Value Iteration on every available environment.

    Reads the notebook-level ``algorithms_status``, ``successful_adapters`` and
    ``adapters`` variables. Environments whose adapter is ``None`` are skipped.

    Returns:
        dict: ``{env_name: {'PolicyIteration': result, 'ValueIteration': result}}``
        where a failed run is stored as ``{'error': ..., 'algorithm': ...}``.
        Empty when DP algorithms or adapters are unavailable.
    """
    print("üöÄ ENTRA√éNEMENT DES ALGORITHMES DYNAMIC PROGRAMMING")
    print("=" * 80)

    if not algorithms_status.get('DP', False):
        print("‚ùå Algorithmes DP non disponibles - Skip")
        return {}

    if successful_adapters == 0:
        print("‚ùå Aucun adaptateur disponible - Skip")
        return {}

    # DP configuration
    DP_GAMMA = 0.99
    DP_THETA = 1e-6  # convergence threshold
    MAX_ITERATIONS = 1000

    # (result key, display name, solver class, header icon incl. its spacing)
    dp_algorithms = [
        ('PolicyIteration', 'Policy Iteration', PolicyIteration, "‚öôÔ∏è "),
        ('ValueIteration', 'Value Iteration', ValueIteration, "üîÑ"),
    ]

    dp_results = {}
    active_adapters = sum(1 for a in adapters.values() if a is not None)
    total_combinations = active_adapters * len(dp_algorithms)
    current_combination = 0

    print(f"‚öôÔ∏è  Param√®tres DP: Œ≥={DP_GAMMA}, Œ∏={DP_THETA}, max_iter={MAX_ITERATIONS}")
    print(f"üîÑ {total_combinations} combinaisons √† traiter")
    print("=" * 80)

    for env_name, adapter in adapters.items():
        if adapter is None:
            print(f"‚è≠Ô∏è  Skip {env_name} - Adaptateur non disponible")
            continue

        # NOTE: "\n" (a real newline) is intended in the messages below; the
        # previous "\\n" printed a literal backslash-n.
        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"   √âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 60)

        env_dp_results = {}
        for position, (algo_key, display_name, algo_class, icon) in enumerate(dp_algorithms, start=1):
            print(f"\n{icon} [{position}/{len(dp_algorithms)}] {display_name}...")
            current_combination += 1
            env_dp_results[algo_key] = _run_dp_algorithm(
                algo_class, algo_key, display_name, adapter, env_name,
                gamma=DP_GAMMA, theta=DP_THETA, max_iterations=MAX_ITERATIONS,
            )

        # Per-environment summary
        print(f"\nüìä R√âSUM√â DP {env_name}:")
        for alg_name, result in env_dp_results.items():
            if 'error' not in result:
                training_time = result.get('training_time', 0)
                converged = result.get('converged', False)
                print(f"   ‚Ä¢ {alg_name}: ‚úÖ Converged={converged}, Time={training_time:.2f}s")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå Erreur")

        dp_results[env_name] = env_dp_results

        print(f"\nüìà Progression globale DP: {current_combination}/{total_combinations}")

    success_count = sum(
        1
        for env_r in dp_results.values()
        for r in env_r.values()
        if 'error' not in r
    )
    print(f"\nüéâ DYNAMIC PROGRAMMING TERMIN√â !")
    print(f"üìä {success_count} combinaisons r√©ussies")
    print("=" * 80)

    return dp_results

# Import time pour mesurer les performances
import time

# üöÄ Lancer l'entra√Ænement DP
if algorithms_status.get('DP', False) and successful_adapters > 0:
    print("‚è≥ Lancement de l'entra√Ænement Dynamic Programming...")
    print("üí° Les algorithmes DP convergent g√©n√©ralement tr√®s rapidement!")

    dp_results = train_dp_algorithms()

    # Analyse the DP results right away.
    if dp_results:
        # "\n" (a real newline) is intended; "\\n" printed a literal backslash-n.
        print("\nüìà G√âN√âRATION DES ANALYSES DP...")

        try:
            # Comparison table for the DP solvers.
            comparison_df = generate_comparison_table(dp_results, "DYNAMIC PROGRAMMING")

            # No learning curves for DP (there is no per-episode history),
            # but the resulting policies can be shown as heatmaps.
            for env_name, env_results in dp_results.items():
                for alg_name, result in env_results.items():
                    if 'policy' in result:
                        plot_policy_heatmap(result, env_name, alg_name)

            print("‚úÖ Analyses DP g√©n√©r√©es avec succ√®s")

        except Exception as e:
            print(f"‚ùå Erreur g√©n√©ration analyses DP: {e}")

else:
    # Explain why the DP section is skipped and keep `dp_results` defined
    # for the cells that follow.
    print("‚ùå Skip Dynamic Programming:")
    if not algorithms_status.get('DP', False):
        print("   - Algorithmes DP non import√©s")
    if successful_adapters == 0:
        print("   - Aucun adaptateur fonctionnel")

    dp_results = {}


‚è≥ Lancement de l'entra√Ænement Dynamic Programming...
üí° Les algorithmes DP convergent g√©n√©ralement tr√®s rapidement!
üöÄ ENTRA√éNEMENT DES ALGORITHMES DYNAMIC PROGRAMMING
‚öôÔ∏è  Param√®tres DP: Œ≥=0.99, Œ∏=1e-06, max_iter=1000
üîÑ 8 combinaisons √† traiter
\nüéÆ ENVIRONNEMENT: SecretEnv0
   √âtats: 8192, Actions: 3
------------------------------------------------------------
\n‚öôÔ∏è  [1/2] Policy Iteration...


Policy Iteration SecretEnv0:   0%|          | 0/1 [00:00<?, ?algo/s]

   üèóÔ∏è  PolicyIteration initialis√©


In [None]:
# 2️⃣ TEMPORAL DIFFERENCE - Algorithmes de Coût Moyen 🔄

def _run_td_algorithm(algo_class, algo_key, display_name, adapter, env_name,
                      num_episodes, alpha, gamma, epsilon, eval_episodes):
    """Train (and, when supported, evaluate) one TD algorithm on one environment.

    Args:
        algo_class: Agent class, instantiated as
            ``algo_class(adapter, alpha=..., gamma=..., epsilon=...)``.
        algo_key: Identifier stored in the result under ``'algorithm'``.
        display_name: Human-readable name used in the progress bar and logs.
        adapter: Environment adapter handed to the agent.
        env_name: Environment name stored in the result.
        num_episodes: Number of training episodes passed to ``train``.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration rate.
        eval_episodes: Episodes passed to ``evaluate`` when the agent has one.

    Returns:
        The agent's training result dict completed with bookkeeping keys (and
        ``'evaluation'`` when available), or
        ``{'error': ..., 'algorithm': algo_key}`` when anything raised.
    """
    try:
        with tqdm(total=num_episodes, desc=f"{display_name} {env_name}", unit="ep") as pbar:
            agent = algo_class(adapter, alpha=alpha, gamma=gamma, epsilon=epsilon)
            print(f"   üèóÔ∏è  {display_name} initialis√© (Œ±={alpha}, Œµ={epsilon})")

            start_time = time.perf_counter()
            result = agent.train(num_episodes=num_episodes)
            training_time = time.perf_counter() - start_time

            # The bar is only advanced once `train` has returned (no progress
            # hook is passed to it), so it reflects completion, not live progress.
            history = result.get('history')
            pbar.update(len(history) if history else num_episodes)

            result['algorithm'] = algo_key
            result['env_name'] = env_name
            result['training_time'] = training_time

            # Final evaluation, for agents that provide one.
            if hasattr(agent, 'evaluate'):
                result['evaluation'] = agent.evaluate(num_episodes=eval_episodes)

            final_reward = history[-1].get('reward', 0) if history else 0
            print(f"   ‚úÖ {display_name} termin√© en {training_time:.2f}s - Reward final: {final_reward:.3f}")
            return result

    except Exception as e:
        print(f"   ‚ùå Erreur {display_name}: {e}")
        return {'error': str(e), 'algorithm': algo_key}


def train_td_algorithms():
    """Train SARSA, Q-Learning and Expected SARSA on every available environment.

    Reads the notebook-level ``algorithms_status``, ``successful_adapters`` and
    ``adapters`` variables. Environments whose adapter is ``None`` are skipped.

    Returns:
        dict: ``{env_name: {'SARSA': result, 'QLearning': result,
        'ExpectedSarsa': result}}`` where a failed run is stored as
        ``{'error': ..., 'algorithm': ...}``. Empty when TD algorithms or
        adapters are unavailable.
    """
    print("üöÄ ENTRA√éNEMENT DES ALGORITHMES TEMPORAL DIFFERENCE")
    print("=" * 80)

    if not algorithms_status.get('TD', False):
        print("‚ùå Algorithmes TD non disponibles - Skip")
        return {}

    if successful_adapters == 0:
        print("‚ùå Aucun adaptateur disponible - Skip")
        return {}

    # TD configuration
    TD_EPISODES = 500
    TD_ALPHA = 0.1         # learning rate
    TD_GAMMA = 0.99        # discount factor
    TD_EPSILON = 0.2       # initial exploration
    TD_EVAL_EPISODES = 50  # episodes used for the final evaluation

    # (result key, display name, header label, agent class, header icon incl. spacing)
    td_algorithms = [
        ('SARSA', 'SARSA', 'SARSA (On-policy TD)', Sarsa, "üéØ"),
        ('QLearning', 'Q-Learning', 'Q-Learning (Off-policy TD)', QLearning, "üé≤"),
        ('ExpectedSarsa', 'Expected SARSA', 'Expected SARSA', ExpectedSarsa, "‚öñÔ∏è "),
    ]

    td_results = {}
    active_adapters = sum(1 for a in adapters.values() if a is not None)
    total_combinations = active_adapters * len(td_algorithms)
    current_combination = 0

    print(f"‚öôÔ∏è  Param√®tres TD: {TD_EPISODES} √©pisodes, Œ±={TD_ALPHA}, Œ≥={TD_GAMMA}, Œµ={TD_EPSILON}")
    print(f"üîÑ {total_combinations} combinaisons √† traiter")
    print("=" * 80)

    for env_name, adapter in adapters.items():
        if adapter is None:
            print(f"‚è≠Ô∏è  Skip {env_name} - Adaptateur non disponible")
            continue

        # NOTE: "\n" (a real newline) is intended in the messages below; the
        # previous "\\n" printed a literal backslash-n.
        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"   √âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 60)

        env_td_results = {}
        for position, (algo_key, display_name, header_label, algo_class, icon) in enumerate(td_algorithms, start=1):
            print(f"\n{icon} [{position}/{len(td_algorithms)}] {header_label}...")
            current_combination += 1
            env_td_results[algo_key] = _run_td_algorithm(
                algo_class, algo_key, display_name, adapter, env_name,
                num_episodes=TD_EPISODES, alpha=TD_ALPHA, gamma=TD_GAMMA,
                epsilon=TD_EPSILON, eval_episodes=TD_EVAL_EPISODES,
            )

        # Per-environment summary
        print(f"\nüìä R√âSUM√â TD {env_name}:")
        for alg_name, result in env_td_results.items():
            if 'error' not in result:
                training_time = result.get('training_time', 0)

                if 'history' in result and result['history']:
                    final_reward = result['history'][-1].get('reward', 0)
                    episodes_completed = len(result['history'])
                    print(f"   ‚Ä¢ {alg_name}: ‚úÖ {episodes_completed} √©pisodes, Time={training_time:.2f}s, Final={final_reward:.3f}")
                else:
                    print(f"   ‚Ä¢ {alg_name}: ‚úÖ Time={training_time:.2f}s")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå Erreur")

        td_results[env_name] = env_td_results

        print(f"\nüìà Progression globale TD: {current_combination}/{total_combinations}")

    print(f"\nüéâ TEMPORAL DIFFERENCE TERMIN√â !")
    success_count = sum(
        1
        for env_r in td_results.values()
        for r in env_r.values()
        if 'error' not in r
    )
    print(f"üìä {success_count} combinaisons r√©ussies")
    print("=" * 80)

    return td_results

# üöÄ Lancer l'entra√Ænement TD
if algorithms_status.get('TD', False) and successful_adapters > 0:
    print("‚è≥ Lancement de l'entra√Ænement Temporal Difference...")
    print("üí° Suivi en temps r√©el avec barres de progression par √©pisode !")

    td_results = train_td_algorithms()

    # Analyse the TD results right away.
    if td_results:
        # "\n" (a real newline) is intended; "\\n" printed a literal backslash-n.
        print("\nüìà G√âN√âRATION DES ANALYSES TD...")

        try:
            # Learning curves, for environments with at least one recorded history.
            for env_name, env_results in td_results.items():
                if any('history' in result and result['history'] for result in env_results.values()):
                    plot_learning_curves_universal(env_results, env_name, "TEMPORAL DIFFERENCE")

            # Boxplots of the final returns
            plot_boxplots_final_returns(td_results, "TEMPORAL DIFFERENCE")

            # Comparison table
            comparison_df = generate_comparison_table(td_results, "TEMPORAL DIFFERENCE")

            # Heatmaps of the learned policies
            for env_name, env_results in td_results.items():
                for alg_name, result in env_results.items():
                    if 'policy' in result or 'Q' in result:
                        # Only a Q-table is available: derive the greedy policy
                        # and store it on the result for later use.
                        # NOTE(review): axis=1 assumes Q is a 2-D (state, action) array - confirm.
                        if 'Q' in result and 'policy' not in result:
                            Q = result['Q']
                            policy = np.argmax(Q, axis=1)
                            result['policy'] = policy

                        if 'policy' in result:
                            plot_policy_heatmap(result, env_name, alg_name)

            print("‚úÖ Analyses TD g√©n√©r√©es avec succ√®s")

        except Exception as e:
            print(f"‚ùå Erreur g√©n√©ration analyses TD: {e}")

else:
    # Explain why the TD section is skipped and keep `td_results` defined
    # for the cells that follow.
    print("‚ùå Skip Temporal Difference:")
    if not algorithms_status.get('TD', False):
        print("   - Algorithmes TD non import√©s")
    if successful_adapters == 0:
        print("   - Aucun adaptateur fonctionnel")

    td_results = {}


In [None]:
# 3️⃣ MONTE CARLO - Algorithmes de Coût Moyen-Élevé 🎲

def train_monte_carlo_algorithms():
    """Train every Monte Carlo algorithm on every available environment.

    Runs MonteCarloES, OnPolicyMC and OffPolicyMC on each non-None adapter of
    the notebook-level ``adapters`` mapping, printing progress and a short
    per-environment summary along the way.

    Returns:
        dict: ``{env_name: {algorithm_key: result}}``. A successful ``result``
        is whatever the algorithm's ``train`` call returned, extended with
        ``'algorithm'``, ``'env_name'``, ``'training_time'`` and, when the
        algorithm exposes ``evaluate``, ``'evaluation'``. A failed run is
        ``{'error': <message>, 'algorithm': <key>}``. An empty dict is
        returned when the MC algorithms or the adapters are unavailable.
    """
    print("üöÄ ENTRA√éNEMENT DES ALGORITHMES MONTE CARLO")
    print("=" * 80)

    if not algorithms_status.get('MC', False):
        print("‚ùå Algorithmes Monte Carlo non disponibles - Skip")
        return {}

    if successful_adapters == 0:
        print("‚ùå Aucun adaptateur disponible - Skip")
        return {}

    # Monte Carlo configuration
    MC_EPISODES = 600      # number of training episodes per algorithm
    MC_GAMMA = 0.99        # discount factor
    MC_EPSILON = 0.3       # exploration rate passed to On-Policy and Off-Policy MC

    # One entry per algorithm: (result key, short label, section header,
    # factory, "initialised" message). The factories are lambdas so that the
    # class names are only looked up inside the helper's try/except.
    algorithm_specs = [
        (
            'MonteCarloES', 'MC-ES',
            "üî• [1/3] Monte Carlo Exploring Starts...",
            lambda env: MonteCarloES(env, gamma=MC_GAMMA),
            f"   üèóÔ∏è  MonteCarloES initialis√© (Œ≥={MC_GAMMA})",
        ),
        (
            'OnPolicyMC', 'OnPolicy-MC',
            "üéØ [2/3] On-Policy Monte Carlo...",
            lambda env: OnPolicyMC(env, gamma=MC_GAMMA, epsilon=MC_EPSILON),
            f"   üèóÔ∏è  OnPolicyMC initialis√© (Œ≥={MC_GAMMA}, Œµ={MC_EPSILON})",
        ),
        (
            'OffPolicyMC', 'OffPolicy-MC',
            "‚öñÔ∏è  [3/3] Off-Policy Monte Carlo...",
            lambda env: OffPolicyMC(env, gamma=MC_GAMMA, epsilon=MC_EPSILON),
            f"   üèóÔ∏è  OffPolicyMC initialis√© (Œ≥={MC_GAMMA}, Œµ={MC_EPSILON})",
        ),
    ]

    mc_results = {}
    usable_adapters = sum(1 for a in adapters.values() if a is not None)
    total_combinations = usable_adapters * len(algorithm_specs)
    current_combination = 0

    print(f"‚öôÔ∏è  Param√®tres MC: {MC_EPISODES} √©pisodes, Œ≥={MC_GAMMA}, Œµ={MC_EPSILON}")
    print(f"üîÑ {total_combinations} combinaisons √† traiter")
    print("üí° Monte Carlo n√©cessite des √©pisodes complets - Patience requise !")
    print("=" * 80)

    # Train on each environment
    for env_name, adapter in adapters.items():
        if adapter is None:
            print(f"‚è≠Ô∏è  Skip {env_name} - Adaptateur non disponible")
            continue

        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"   √âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 60)

        env_mc_results = {}

        for result_key, label, header, build_algorithm, init_message in algorithm_specs:
            print(f"\n{header}")
            current_combination += 1
            env_mc_results[result_key] = _run_mc_algorithm(
                result_key, label, build_algorithm, adapter,
                init_message, env_name, MC_EPISODES,
            )

        # Summary for this environment
        print(f"\nüìä R√âSUM√â MONTE CARLO {env_name}:")
        for alg_name, result in env_mc_results.items():
            if 'error' in result:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå Erreur")
                continue

            training_time = result.get('training_time', 0)

            if 'history' in result and result['history']:
                final_reward = result['history'][-1].get('reward', 0)
                episodes_completed = len(result['history'])

                # Mean over the last 20 episodes, to show the recent trend
                recent_rewards = [h.get('reward', 0) for h in result['history'][-20:]]
                avg_recent = np.mean(recent_rewards) if recent_rewards else 0

                print(f"   ‚Ä¢ {alg_name}: ‚úÖ {episodes_completed} √©pisodes, Time={training_time:.2f}s")
                print(f"     Final={final_reward:.3f}, Moy.r√©cente={avg_recent:.3f}")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚úÖ Time={training_time:.2f}s")

        mc_results[env_name] = env_mc_results

        print(f"\nüìà Progression globale MC: {current_combination}/{total_combinations}")

    print(f"\nüéâ MONTE CARLO TERMIN√â !")
    success_count = len([r for env_r in mc_results.values() for r in env_r.values() if 'error' not in r])
    print(f"üìä {success_count} combinaisons r√©ussies")
    print("=" * 80)

    return mc_results


def _run_mc_algorithm(result_key, label, build_algorithm, adapter, init_message,
                      env_name, num_episodes, eval_episodes=50):
    """Train and evaluate one Monte Carlo algorithm on one environment.

    Args:
        result_key: Name stored under ``result['algorithm']`` (also used in
            the error record).
        label: Short name used in the progress bar and status messages.
        build_algorithm: Callable taking ``adapter`` and returning the
            algorithm instance; called inside the try block so construction
            errors are reported like training errors.
        adapter: Environment adapter handed to ``build_algorithm``.
        init_message: Line printed once the algorithm has been constructed.
        env_name: Environment name, used for display and stored in the result.
        num_episodes: Number of episodes passed to ``train``.
        eval_episodes: Number of episodes passed to ``evaluate`` when the
            algorithm has such a method.

    Returns:
        dict: The enriched training result, or
        ``{'error': str(exception), 'algorithm': result_key}`` if any step
        raised.
    """
    # Imported locally so this helper does not rely on an earlier cell
    # having imported `time`.
    import time

    try:
        # The bar is advanced in a single step once train() has returned;
        # nothing here updates it while training is running.
        with tqdm(total=num_episodes, desc=f"{label} {env_name}", unit="ep") as pbar:
            algorithm = build_algorithm(adapter)
            print(init_message)

            start_time = time.time()
            result = algorithm.train(num_episodes=num_episodes)
            training_time = time.time() - start_time

            has_history = 'history' in result and bool(result['history'])
            pbar.update(len(result['history']) if has_history else num_episodes)

            # Attach bookkeeping fields to the training result
            result['algorithm'] = result_key
            result['env_name'] = env_name
            result['training_time'] = training_time

            # Final evaluation, only when the algorithm provides one
            if hasattr(algorithm, 'evaluate'):
                result['evaluation'] = algorithm.evaluate(num_episodes=eval_episodes)

            final_reward = result['history'][-1].get('reward', 0) if has_history else 0
            print(f"   ‚úÖ {label} termin√© en {training_time:.2f}s - Reward final: {final_reward:.3f}")
            return result

    except Exception as e:
        # Best effort: one failing algorithm must not abort the whole sweep.
        print(f"   ‚ùå Erreur {label}: {e}")
        return {'error': str(e), 'algorithm': result_key}

# üöÄ Lancer l'entra√Ænement Monte Carlo
# Launch Monte Carlo training only when the MC algorithms were imported and
# at least one environment adapter is usable; otherwise explain why it is
# skipped. Either way `mc_results` is defined for the cells that follow.
if algorithms_status.get('MC', False) and successful_adapters > 0:
    print("‚è≥ Lancement de l'entra√Ænement Monte Carlo...")
    print("üí° Monte Carlo n√©cessite des √©pisodes complets - Plus long que TD !")

    mc_results = train_monte_carlo_algorithms()

    # Immediate analysis of the Monte Carlo results
    if mc_results:
        print("\nüìà G√âN√âRATION DES ANALYSES MONTE CARLO...")

        try:
            # Learning curves, for environments where at least one run has a history
            for env_name, env_results in mc_results.items():
                if any('history' in result and result['history'] for result in env_results.values()):
                    plot_learning_curves_universal(env_results, env_name, "MONTE CARLO")

            # Boxplots of the final returns
            plot_boxplots_final_returns(mc_results, "MONTE CARLO")

            # Comparison table
            comparison_df = generate_comparison_table(mc_results, "MONTE CARLO")

            # Heatmaps of the learned policies
            for env_name, env_results in mc_results.items():
                for alg_name, result in env_results.items():
                    if 'error' not in result:
                        if 'policy' in result:
                            plot_policy_heatmap(result, env_name, alg_name)

                        # No explicit policy: derive the greedy one from Q and
                        # store it on the result before plotting.
                        elif 'Q' in result:
                            Q = result['Q']
                            policy = np.argmax(Q, axis=1)
                            result['policy'] = policy
                            plot_policy_heatmap(result, env_name, alg_name)

            print("‚úÖ Analyses Monte Carlo g√©n√©r√©es avec succ√®s")

        except Exception as e:
            # A plotting/reporting failure is reported but does not stop the notebook.
            print(f"‚ùå Erreur g√©n√©ration analyses MC: {e}")

else:
    print("‚ùå Skip Monte Carlo:")
    if not algorithms_status.get('MC', False):
        print("   - Algorithmes Monte Carlo non import√©s")
    if successful_adapters == 0:
        print("   - Aucun adaptateur fonctionnel")

    mc_results = {}


In [None]:
# 4Ô∏è‚É£ DYNA PLANNING - Algorithmes les Plus Co√ªteux üß†

def train_dyna_algorithms():
    """Train every Dyna planning algorithm on every available environment.

    Runs DynaQ and DynaQPlus on each non-None adapter of the notebook-level
    ``adapters`` mapping, printing progress and a short per-environment
    summary along the way.

    Returns:
        dict: ``{env_name: {algorithm_key: result}}``. A successful ``result``
        is whatever the algorithm's ``train`` call returned, extended with
        ``'algorithm'``, ``'env_name'``, ``'training_time'``,
        ``'planning_steps'`` and, when the algorithm exposes ``evaluate``,
        ``'evaluation'``. A failed run is
        ``{'error': <message>, 'algorithm': <key>}``. An empty dict is
        returned when the Dyna algorithms or the adapters are unavailable.
    """
    print("üöÄ ENTRA√éNEMENT DES ALGORITHMES DYNA PLANNING")
    print("=" * 80)

    if not algorithms_status.get('DYNA', False):
        print("‚ùå Algorithmes Dyna non disponibles - Skip")
        return {}

    if successful_adapters == 0:
        print("‚ùå Aucun adaptateur disponible - Skip")
        return {}

    # Dyna configuration
    DYNA_EPISODES = 400         # number of training episodes per algorithm
    DYNA_ALPHA = 0.1            # learning rate
    DYNA_GAMMA = 0.99           # discount factor
    DYNA_EPSILON = 0.15         # exploration rate
    DYNA_PLANNING_STEPS = 10    # planning steps per real step
    DYNA_KAPPA = 0.001          # exploration-bonus weight passed to DynaQPlus
                                # NOTE(review): assumes DynaQPlus accepts `kappa` - confirm

    # One entry per algorithm: (result key, short label, section header,
    # factory, suffix of the "model learned" line). The factories are lambdas
    # so that the class names are only looked up inside the helper's try/except.
    algorithm_specs = [
        (
            'DynaQ', 'Dyna-Q',
            "ü§ñ [1/2] Dyna-Q (Q-Learning + Model-based Planning)...",
            lambda env: DynaQ(
                env,
                alpha=DYNA_ALPHA,
                gamma=DYNA_GAMMA,
                epsilon=DYNA_EPSILON,
                planning_steps=DYNA_PLANNING_STEPS,
            ),
            "transitions stock√©es",
        ),
        (
            'DynaQPlus', 'Dyna-Q+',
            "üöÄ [2/2] Dyna-Q+ (avec bonus exploration temporelle)...",
            lambda env: DynaQPlus(
                env,
                alpha=DYNA_ALPHA,
                gamma=DYNA_GAMMA,
                epsilon=DYNA_EPSILON,
                planning_steps=DYNA_PLANNING_STEPS,
                kappa=DYNA_KAPPA,
            ),
            "transitions + bonus exploration",
        ),
    ]

    dyna_results = {}
    usable_adapters = sum(1 for a in adapters.values() if a is not None)
    total_combinations = usable_adapters * len(algorithm_specs)
    current_combination = 0

    print(f"‚öôÔ∏è  Param√®tres Dyna: {DYNA_EPISODES} √©pisodes, Œ±={DYNA_ALPHA}, Œ≥={DYNA_GAMMA}")
    print(f"   Œµ={DYNA_EPSILON}, planning_steps={DYNA_PLANNING_STEPS}")
    print(f"üîÑ {total_combinations} combinaisons √† traiter")
    print("‚ö†Ô∏è  DYNA = Plus co√ªteux - Apprentissage direct + Planning avec mod√®le !")
    print("=" * 80)

    # Train on each environment
    for env_name, adapter in adapters.items():
        if adapter is None:
            print(f"‚è≠Ô∏è  Skip {env_name} - Adaptateur non disponible")
            continue

        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"   √âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 60)

        env_dyna_results = {}

        for result_key, label, header, build_algorithm, model_note in algorithm_specs:
            print(f"\n{header}")
            current_combination += 1
            init_message = (
                f"   üèóÔ∏è  {label} initialis√© "
                f"(Œ±={DYNA_ALPHA}, Œµ={DYNA_EPSILON}, plan={DYNA_PLANNING_STEPS})"
            )
            env_dyna_results[result_key] = _run_dyna_algorithm(
                result_key, label, build_algorithm, adapter, init_message,
                model_note, env_name, DYNA_EPISODES, DYNA_PLANNING_STEPS,
            )

        # Summary for this environment
        print(f"\nüìä R√âSUM√â DYNA PLANNING {env_name}:")
        for alg_name, result in env_dyna_results.items():
            if 'error' in result:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå Erreur")
                continue

            training_time = result.get('training_time', 0)
            planning_steps = result.get('planning_steps', 0)

            if 'history' in result and result['history']:
                final_reward = result['history'][-1].get('reward', 0)
                episodes_completed = len(result['history'])

                # Mean over the last 20 episodes, to show the recent trend
                recent_rewards = [h.get('reward', 0) for h in result['history'][-20:]]
                avg_recent = np.mean(recent_rewards) if recent_rewards else 0

                print(f"   ‚Ä¢ {alg_name}: ‚úÖ {episodes_completed} √©pisodes, Time={training_time:.2f}s")
                print(f"     Final={final_reward:.3f}, Moy.r√©cente={avg_recent:.3f}, Planning={planning_steps}")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚úÖ Time={training_time:.2f}s, Planning={planning_steps}")

        dyna_results[env_name] = env_dyna_results

        print(f"\nüìà Progression globale Dyna: {current_combination}/{total_combinations}")

    print(f"\nüéâ DYNA PLANNING TERMIN√â !")
    success_count = len([r for env_r in dyna_results.values() for r in env_r.values() if 'error' not in r])
    print(f"üìä {success_count} combinaisons r√©ussies")
    print("üß† Les algorithmes Dyna combinent apprentissage direct et planning - Les plus sophistiqu√©s !")
    print("=" * 80)

    return dyna_results


def _run_dyna_algorithm(result_key, label, build_algorithm, adapter, init_message,
                        model_note, env_name, num_episodes, planning_steps,
                        eval_episodes=50):
    """Train and evaluate one Dyna algorithm on one environment.

    Args:
        result_key: Name stored under ``result['algorithm']`` (also used in
            the error record).
        label: Short name used in the progress bar and status messages.
        build_algorithm: Callable taking ``adapter`` and returning the
            algorithm instance; called inside the try block so construction
            errors are reported like training errors.
        adapter: Environment adapter handed to ``build_algorithm``.
        init_message: Line printed once the algorithm has been constructed.
        model_note: Text appended after the model size in the final status line.
        env_name: Environment name, used for display and stored in the result.
        num_episodes: Number of episodes passed to ``train``.
        planning_steps: Value recorded under ``result['planning_steps']``.
        eval_episodes: Number of episodes passed to ``evaluate`` when the
            algorithm has such a method.

    Returns:
        dict: The enriched training result, or
        ``{'error': str(exception), 'algorithm': result_key}`` if any step
        raised.
    """
    # Imported locally so this helper does not rely on an earlier cell
    # having imported `time`.
    import time

    try:
        # The bar is advanced in a single step once train() has returned;
        # nothing here updates it while training is running.
        with tqdm(total=num_episodes, desc=f"{label} {env_name}", unit="ep") as pbar:
            algorithm = build_algorithm(adapter)
            print(init_message)

            start_time = time.time()
            result = algorithm.train(num_episodes=num_episodes)
            training_time = time.time() - start_time

            has_history = 'history' in result and bool(result['history'])
            pbar.update(len(result['history']) if has_history else num_episodes)

            # Attach bookkeeping fields to the training result
            result['algorithm'] = result_key
            result['env_name'] = env_name
            result['training_time'] = training_time
            result['planning_steps'] = planning_steps

            # Final evaluation, only when the algorithm provides one
            if hasattr(algorithm, 'evaluate'):
                result['evaluation'] = algorithm.evaluate(num_episodes=eval_episodes)

            final_reward = result['history'][-1].get('reward', 0) if has_history else 0

            # Size of the algorithm's `model` attribute (0 when it has none)
            model_size = len(getattr(algorithm, 'model', {}))

            print(f"   ‚úÖ {label} termin√© en {training_time:.2f}s - Reward final: {final_reward:.3f}")
            print(f"      Mod√®le appris: {model_size} {model_note}")
            return result

    except Exception as e:
        # Best effort: one failing algorithm must not abort the whole sweep.
        print(f"   ‚ùå Erreur {label}: {e}")
        return {'error': str(e), 'algorithm': result_key}

# üöÄ Lancer l'entra√Ænement Dyna Planning
# Launch Dyna training only when the Dyna algorithms were imported and at
# least one environment adapter is usable; otherwise explain why it is
# skipped. Either way `dyna_results` is defined for the cells that follow.
if algorithms_status.get('DYNA', False) and successful_adapters > 0:
    print("‚è≥ Lancement de l'entra√Ænement Dyna Planning...")
    print("üí° Dyna = Algorithmes les plus co√ªteux mais potentiellement les plus performants !")
    print("üß† Combinaison apprentissage direct + planification avec mod√®le appris")

    dyna_results = train_dyna_algorithms()

    # Immediate analysis of the Dyna results
    if dyna_results:
        print("\nüìà G√âN√âRATION DES ANALYSES DYNA PLANNING...")

        try:
            # Learning curves, for environments where at least one run has a history
            for env_name, env_results in dyna_results.items():
                if any('history' in result and result['history'] for result in env_results.values()):
                    plot_learning_curves_universal(env_results, env_name, "DYNA PLANNING")

            # Boxplots of the final returns
            plot_boxplots_final_returns(dyna_results, "DYNA PLANNING")

            # Comparison table
            comparison_df = generate_comparison_table(dyna_results, "DYNA PLANNING")

            # Heatmaps of the learned policies
            for env_name, env_results in dyna_results.items():
                for alg_name, result in env_results.items():
                    if 'error' not in result:
                        if 'policy' in result:
                            plot_policy_heatmap(result, env_name, alg_name)

                        # No explicit policy: derive the greedy one from Q and
                        # store it on the result before plotting.
                        elif 'Q' in result:
                            Q = result['Q']
                            policy = np.argmax(Q, axis=1)
                            result['policy'] = policy
                            plot_policy_heatmap(result, env_name, alg_name)

            print("‚úÖ Analyses Dyna Planning g√©n√©r√©es avec succ√®s")

        except Exception as e:
            # A plotting/reporting failure is reported but does not stop the notebook.
            print(f"‚ùå Erreur g√©n√©ration analyses Dyna: {e}")

else:
    print("‚ùå Skip Dyna Planning:")
    if not algorithms_status.get('DYNA', False):
        print("   - Algorithmes Dyna non import√©s")
    if successful_adapters == 0:
        print("   - Aucun adaptateur fonctionnel")

    dyna_results = {}

# Closing banner, printed whether or not Dyna ran
print("\n" + "üéä" * 20)
print("üèÅ TOUS LES ALGORITHMES TERMIN√âS !")
print("üéä" * 20)


In [None]:
# üèÜ ANALYSE GLOBALE DE TOUS LES ALGORITHMES REINFORCEMENT LEARNING

def analyze_all_algorithms_globally():
    """Analyse comparative compl√®te de tous les algorithmes sur tous les environnements"""
    
    print("üåç ANALYSE GLOBALE DE TOUS LES ALGORITHMES RL")
    print("=" * 100)
    
    # Collecter tous les r√©sultats
    all_algorithm_results = {}
    
    # Variables globales avec d√©faut vide
    globals_to_check = ['dp_results', 'td_results', 'mc_results', 'dyna_results']
    algorithm_families = []
    
    for var_name in globals_to_check:
        if var_name in globals() and globals()[var_name]:
            results = globals()[var_name]
            family_name = var_name.replace('_results', '').upper()
            all_algorithm_results[family_name] = results
            algorithm_families.append(family_name)
            print(f"‚úÖ {family_name}: {len(results)} environnements trait√©s")
        else:
            print(f"‚è≠Ô∏è  {var_name}: Aucun r√©sultat")
    
    if not all_algorithm_results:
        print("‚ùå Aucun r√©sultat d'algorithme disponible pour l'analyse globale")
        return
    
    print(f"\\nüéØ FAMILLES D'ALGORITHMES ANALYS√âES: {', '.join(algorithm_families)}")
    print("=" * 100)
    
    # 1. üìä Tableau comparatif global unifi√©
    print("\\n1Ô∏è‚É£ TABLEAU COMPARATIF GLOBAL - TOUS LES ALGORITHMES")
    print("-" * 80)
    
    global_comparison_data = []
    
    for family_name, family_results in all_algorithm_results.items():
        for env_name, env_results in family_results.items():
            for alg_name, result in env_results.items():
                if 'error' not in result and 'history' in result and result['history']:
                    history = result['history']
                    
                    # M√©triques universelles
                    final_returns = [h.get('reward', h.get('return', 0)) for h in history[-20:]]
                    all_returns = [h.get('reward', h.get('return', 0)) for h in history]
                    
                    lengths = []
                    for h in history:
                        length = h.get('length', h.get('episode_length', h.get('steps', 1)))
                        lengths.append(length)
                    
                    # M√©triques calcul√©es
                    final_avg = np.mean(final_returns) if final_returns else 0
                    final_std = np.std(final_returns) if len(final_returns) > 1 else 0
                    steps_avg = np.mean(lengths) if lengths else 0
                    win_rate = sum(1 for r in all_returns if r > 0) / len(all_returns) if all_returns else 0
                    training_time = result.get('training_time', 0)
                    
                    # √âvaluation si disponible
                    eval_return = 0
                    eval_win_rate = 0
                    if 'evaluation' in result:
                        eval_data = result['evaluation']
                        eval_return = eval_data.get('avg_reward', eval_data.get('average_reward', 0))
                        eval_win_rate = eval_data.get('success_rate', 0)
                    
                    global_comparison_data.append({
                        'Famille': family_name,
                        'Algorithme': alg_name,
                        'Environnement': env_name,
                        'Return_Final': f"{final_avg:.3f}",
                        'Std_Final': f"{final_std:.3f}",
                        'Steps_Moy': f"{steps_avg:.1f}" if steps_avg > 0 else "N/A",
                        'Taux_Victoire': f"{win_rate:.1%}",
                        'Temps_Train_s': f"{training_time:.2f}",
                        'Eval_Return': f"{eval_return:.3f}",
                        'Eval_Victoire': f"{eval_win_rate:.1%}"
                    })
                
                elif 'error' in result:
                    global_comparison_data.append({
                        'Famille': family_name,
                        'Algorithme': alg_name,
                        'Environnement': env_name,
                        'Return_Final': "ERREUR",
                        'Std_Final': "N/A",
                        'Steps_Moy': "N/A",
                        'Taux_Victoire': "N/A",
                        'Temps_Train_s': "N/A",
                        'Eval_Return': "N/A",
                        'Eval_Victoire': "N/A"
                    })
    
    if global_comparison_data:
        global_df = pd.DataFrame(global_comparison_data)
        print(global_df.to_string(index=False))
        
        # Sauvegarder le tableau global
        global_df.to_csv('global_rl_algorithms_comparison.csv', index=False)
        print("\\nüíæ Tableau global sauvegard√©: 'global_rl_algorithms_comparison.csv'")
    
    # 2. üèÜ Classement des meilleurs algorithmes par environnement
    print("\\n2Ô∏è‚É£ CLASSEMENT DES MEILLEURS ALGORITHMES PAR ENVIRONNEMENT")
    print("-" * 80)
    
    env_rankings = {}
    
    # Grouper par environnement
    for family_name, family_results in all_algorithm_results.items():
        for env_name, env_results in family_results.items():
            if env_name not in env_rankings:
                env_rankings[env_name] = []
            
            for alg_name, result in env_results.items():
                if 'error' not in result and 'history' in result and result['history']:
                    final_returns = [h.get('reward', h.get('return', 0)) for h in result['history'][-20:]]
                    avg_final = np.mean(final_returns) if final_returns else 0
                    std_final = np.std(final_returns) if len(final_returns) > 1 else 0
                    
                    # Score composite (performance - instabilit√©)
                    composite_score = avg_final - 0.1 * std_final
                    
                    env_rankings[env_name].append({
                        'family': family_name,
                        'algorithm': alg_name,
                        'score': composite_score,
                        'avg_return': avg_final,
                        'std_return': std_final
                    })
    
    # Afficher les classements
    for env_name, rankings in env_rankings.items():
        rankings.sort(key=lambda x: x['score'], reverse=True)
        
        print(f"\\nüèÖ {env_name} - TOP 3:")
        for i, entry in enumerate(rankings[:3]):
            medal = ["ü•á", "ü•à", "ü•â"][i]
            print(f"   {medal} {entry['family']}-{entry['algorithm']}: {entry['score']:.3f} "
                  f"(Return: {entry['avg_return']:.3f} ¬± {entry['std_return']:.3f})")
    
    # 3. üìà Analyse des performances par famille d'algorithmes
    print("\\n3Ô∏è‚É£ PERFORMANCES MOYENNES PAR FAMILLE D'ALGORITHMES")
    print("-" * 80)
    
    family_stats = {}
    
    for family_name, family_results in all_algorithm_results.items():
        all_scores = []
        all_times = []
        success_count = 0
        total_count = 0
        
        for env_name, env_results in family_results.items():
            for alg_name, result in env_results.items():
                total_count += 1
                
                if 'error' not in result and 'history' in result and result['history']:
                    success_count += 1
                    
                    final_returns = [h.get('reward', h.get('return', 0)) for h in result['history'][-20:]]
                    avg_final = np.mean(final_returns) if final_returns else 0
                    all_scores.append(avg_final)
                    
                    training_time = result.get('training_time', 0)
                    all_times.append(training_time)
        
        family_stats[family_name] = {
            'avg_performance': np.mean(all_scores) if all_scores else 0,
            'std_performance': np.std(all_scores) if len(all_scores) > 1 else 0,
            'avg_time': np.mean(all_times) if all_times else 0,
            'success_rate': success_count / total_count if total_count > 0 else 0,
            'count': total_count
        }
    
    print("Famille        | Performance Moy | Temps Moy (s) | Taux Succ√®s | Nb Tests")
    print("-" * 75)
    for family, stats in family_stats.items():
        print(f"{family:12} | {stats['avg_performance']:10.3f}   | "
              f"{stats['avg_time']:9.2f}   | {stats['success_rate']:8.1%}   | {stats['count']:8}")
    
    # 4. üå°Ô∏è Heatmap comparative globale
    print("\\n4Ô∏è‚É£ G√âN√âRATION DE LA HEATMAP COMPARATIVE GLOBALE")
    print("-" * 80)
    
    try:
        # Pr√©parer donn√©es pour heatmap
        heatmap_data = []
        all_envs = set()
        all_algos = set()
        
        for family_name, family_results in all_algorithm_results.items():
            for env_name, env_results in family_results.items():
                all_envs.add(env_name)
                for alg_name, result in env_results.items():
                    full_alg_name = f"{family_name}-{alg_name}"
                    all_algos.add(full_alg_name)
                    
                    if 'error' not in result and 'history' in result and result['history']:
                        final_returns = [h.get('reward', h.get('return', 0)) for h in result['history'][-20:]]
                        performance = np.mean(final_returns) if final_returns else 0
                    else:
                        performance = -1  # Valeur pour les erreurs
                    
                    heatmap_data.append({
                        'Environment': env_name,
                        'Algorithm': full_alg_name,
                        'Performance': performance
                    })
        
        # Cr√©er matrice pour heatmap
        env_list = sorted(list(all_envs))
        algo_list = sorted(list(all_algos))
        
        perf_matrix = np.zeros((len(env_list), len(algo_list)))
        
        for data_point in heatmap_data:
            env_idx = env_list.index(data_point['Environment'])
            algo_idx = algo_list.index(data_point['Algorithm'])
            perf_matrix[env_idx, algo_idx] = data_point['Performance']
        
        # Cr√©er la heatmap
        plt.figure(figsize=(16, 10))
        heatmap = plt.imshow(perf_matrix, cmap='RdYlGn', aspect='auto', vmin=-1, vmax=1)
        
        plt.xticks(range(len(algo_list)), algo_list, rotation=45, ha='right')
        plt.yticks(range(len(env_list)), env_list)
        plt.xlabel('Algorithmes (Famille-Nom)')
        plt.ylabel('Environnements Secrets')
        plt.title('üå°Ô∏è Heatmap Globale des Performances\\n(Return Final Moyen - Tous Algorithmes)', 
                  fontsize=16, fontweight='bold')
        
        # Ajouter les valeurs dans les cellules
        for i in range(len(env_list)):
            for j in range(len(algo_list)):
                value = perf_matrix[i, j]
                color = 'white' if abs(value) > 0.3 else 'black'
                text = f'{value:.2f}' if value != -1 else 'ERR'
                plt.text(j, i, text, ha='center', va='center', 
                        color=color, fontweight='bold', fontsize=8)
        
        plt.colorbar(heatmap, label='Performance Finale')
        plt.tight_layout()
        plt.show()
        
        print("‚úÖ Heatmap globale g√©n√©r√©e")
        
    except Exception as e:
        print(f"‚ùå Erreur g√©n√©ration heatmap globale: {e}")
    
    # 5. üí° Recommandations finales
    print("\\n5Ô∏è‚É£ RECOMMANDATIONS FINALES")
    print("-" * 80)
    
    print("üéØ R√âSUM√â EX√âCUTIF:")
    print(f"   ‚Ä¢ {len(algorithm_families)} familles d'algorithmes test√©es")
    
    total_combinations = sum(len([r for env_r in family_results.values() 
                                  for r in env_r.values()]) 
                            for family_results in all_algorithm_results.values())
    successful_combinations = sum(len([r for env_r in family_results.values() 
                                      for r in env_r.values() 
                                      if 'error' not in r and 'history' in r]) 
                                 for family_results in all_algorithm_results.values())
    
    print(f"   ‚Ä¢ {successful_combinations}/{total_combinations} combinaisons r√©ussies ({successful_combinations/total_combinations:.1%})")
    
    print("\\nüí° RECOMMANDATIONS ALGORITHMIQUES:")
    print("   1. üöÄ Dynamic Programming: Tr√®s rapide, id√©al si dynamiques MDP connues")
    print("   2. ‚ö° Temporal Difference: Bon compromis vitesse/performance pour exploration online")  
    print("   3. üé≤ Monte Carlo: Exploration compl√®te, robuste mais plus lent")
    print("   4. üß† Dyna Planning: Plus sophistiqu√©, combine direct + planning")
    
    print("\\nüìä M√âTRIQUES CL√âS √Ä RETENIR:")
    print("   ‚Ä¢ Return Cumulatif Moyen: Principale m√©trique de performance")
    print("   ‚Ä¢ Taux de Victoire: Stabilit√© et fiabilit√© de l'algorithme")
    print("   ‚Ä¢ Temps d'Entra√Ænement: Co√ªt computationnel")
    print("   ‚Ä¢ √âvaluation: Performance r√©elle sur politique apprise")
    
    print("\\nüîö ANALYSE GLOBALE TERMIN√âE")
    print("=" * 100)

# Run the final cross-family analysis, then print the closing banner.
# Fix: the banner strings used "\\n", which prints a literal backslash-n
# instead of starting a new line; they now use a real "\n" escape.
print("‚è≥ Lancement de l'analyse globale de tous les algorithmes...")

analyze_all_algorithms_globally()

print("\n" + "üéÜ" * 30)
print("üéâ ANALYSE COMPL√àTE DE TOUS LES ALGORITHMES RL TERMIN√âE ! üéâ")
print("üéÜ" * 30)
print("\nüìã FICHIERS G√âN√âR√âS:")
# NOTE(review): the CSV named below is not written anywhere in the visible part
# of analyze_all_algorithms_globally; confirm it is actually produced.
print("   üìÑ global_rl_algorithms_comparison.csv - Tableau comparatif global")
print("   üìä Graphiques et heatmaps affich√©s dans le notebook")
print("\nüéØ Utilisez ces r√©sultats pour choisir le meilleur algorithme selon vos besoins !")
print("=" * 80)


In [None]:
# üß† Entra√Ænement avec les Algorithmes Monte Carlo Existants

def run_monte_carlo_analysis(num_episodes=300, gamma=0.99):
    """
    Train and evaluate each Monte Carlo algorithm on every available environment.

    Relies on notebook-level names defined in earlier cells: ``adapters``
    (mapping of environment name to an adapter object, or ``None`` when the
    adapter could not be built), ``monte_carlo_available`` and the
    ``MonteCarloES`` / ``OnPolicyMC`` / ``OffPolicyMC`` classes.

    Args:
        num_episodes: Number of training episodes per algorithm.
        gamma: Discount factor passed to every algorithm.

    Returns:
        dict: ``{env_name: {algorithm_name: result}}``. On success ``result`` is
        whatever ``train()`` returned, extended with the keys ``'algorithm'``,
        ``'env_name'`` and ``'evaluation'``. On failure it is
        ``{'history': [], 'error': <message>}``. An empty dict is returned when
        the Monte Carlo algorithms are not available.
    """

    print("\nüöÄ LANCEMENT DE L'ANALYSE MONTE CARLO")
    print("=" * 60)
    print(f"‚öôÔ∏è  Param√®tres: {num_episodes} √©pisodes, Œ≥={gamma}")
    print(f"üéÆ Environnements √† tester: {len([k for k,v in adapters.items() if v is not None])}")
    print(f"üß† Algorithmes: MonteCarloES, OnPolicyMC, OffPolicyMC")
    print("=" * 60)

    if not monte_carlo_available:
        print("‚ùå Algorithmes Monte Carlo non disponibles !")
        return {}

    # One row per algorithm: (result key, class, extra constructor kwargs,
    # header icon, header title). Built only after the availability check,
    # because the classes do not exist when their import failed.
    algorithm_specs = [
        ('MonteCarloES', MonteCarloES, {}, "üî•", "Monte Carlo Exploring Starts"),
        ('OnPolicyMC', OnPolicyMC, {'epsilon': 0.3}, "üîÑ", "On-Policy Monte Carlo"),
        ('OffPolicyMC', OffPolicyMC, {'epsilon': 0.4}, "‚öñÔ∏è ", "Off-Policy Monte Carlo"),
    ]
    n_algorithms = len(algorithm_specs)
    eval_episodes = 50  # episodes used for the post-training evaluation

    all_results = {}
    completed_combinations = 0
    # Every usable environment is paired with every algorithm.
    total_combinations = n_algorithms * sum(
        1 for adapter in adapters.values() if adapter is not None
    )

    for env_name, adapter in adapters.items():
        if adapter is None:
            print(f"‚è≠Ô∏è  Skipping {env_name} (adapter non disponible)")
            continue

        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"   √âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 50)

        env_results = {}

        for position, (alg_name, alg_class, extra_kwargs, icon, title) in enumerate(algorithm_specs, start=1):
            print(f"\n{icon} [{position}/{n_algorithms}] {title}...")
            try:
                agent = alg_class(adapter, gamma=gamma, **extra_kwargs)
                epsilon_note = f" (Œµ={extra_kwargs['epsilon']})" if 'epsilon' in extra_kwargs else ""
                print(f"   üèóÔ∏è  {alg_name} initialis√© pour {env_name}{epsilon_note}")

                result = agent.train(num_episodes=num_episodes)
                result['algorithm'] = alg_name
                result['env_name'] = env_name

                # Final evaluation of the learned policy
                result['evaluation'] = agent.evaluate(num_episodes=eval_episodes)

                print(f"   ‚úÖ {alg_name} termin√© - R√©compense finale: {result['history'][-1]['reward']:.3f}")

                # Record and count the run only once every fallible step above
                # has succeeded, so the success counter can never disagree with
                # the error entry written by the except branch.
                env_results[alg_name] = result
                completed_combinations += 1

            except Exception as e:
                print(f"   ‚ùå Erreur {alg_name}: {e}")
                env_results[alg_name] = {'history': [], 'error': str(e)}

        # Store the results for this environment
        all_results[env_name] = env_results

        # Per-environment summary
        print(f"\nüìä R√âSUM√â {env_name}:")
        for alg_name, result in env_results.items():
            if 'history' in result and result['history']:
                final_reward = result['history'][-1]['reward']
                avg_reward = np.mean([h['reward'] for h in result['history'][-10:]])
                print(f"   ‚Ä¢ {alg_name}: R√©compense finale = {final_reward:.3f}, Moyenne r√©cente = {avg_reward:.3f}")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå √âchec")

    # Global summary
    print(f"\nüéâ ANALYSE COMPL√àTE TERMIN√âE !")
    print(f"üìà {completed_combinations}/{total_combinations} combinaisons r√©ussies")
    print("=" * 60)

    return all_results

# Launch the full Monte Carlo analysis, or explain why it cannot run.
# `all_results` is always bound afterwards so later cells can test it safely.
if not (successful_adapters > 0 and monte_carlo_available):
    print("‚ùå Impossible de lancer l'analyse :")
    if successful_adapters == 0:
        print("   - Aucun adaptateur d'environnement fonctionnel")
    if not monte_carlo_available:
        print("   - Algorithmes Monte Carlo non import√©s")

    all_results = {}

else:
    print("‚è≥ Lancement de l'analyse (peut prendre 5-15 minutes selon les param√®tres)...")

    # Training parameters - tune as needed
    EPISODES = 400  # episodes per algorithm (raise for more precision)
    GAMMA = 0.99    # discount factor

    all_results = run_monte_carlo_analysis(num_episodes=EPISODES, gamma=GAMMA)


‚è≥ Lancement de l'analyse (peut prendre 5-15 minutes selon les param√®tres)...

üöÄ LANCEMENT DE L'ANALYSE MONTE CARLO
‚öôÔ∏è  Param√®tres: 400 √©pisodes, Œ≥=0.99
üéÆ Environnements √† tester: 4
üß† Algorithmes: MonteCarloES, OnPolicyMC, OffPolicyMC

üéÆ ENVIRONNEMENT: SecretEnv0
   √âtats: 8192, Actions: 3
--------------------------------------------------
\nüî• [1/3] Monte Carlo Exploring Starts...
   üèóÔ∏è  MonteCarloES initialis√© pour SecretEnv0


: 

In [None]:
# üìä Fonctions de Visualisation et d'Analyse

def plot_learning_curves(results_dict, title_prefix=""):
    """Plot a 2x2 dashboard of learning curves for the algorithms of one environment.

    Panels:
        1. Reward per episode: rolling mean (labelled) over the faint raw curve.
        2. Mean Q-value per episode, for histories whose first entry has 'avg_q'.
        3. Rolling mean of episode length, read from 'length' or 'episode_length'.
        4. Standard deviation of the rewards of the 50 preceding episodes.

    Args:
        results_dict: Mapping of algorithm name to a result dict. Entries without
            a non-empty 'history' list are skipped. Every history item must
            provide 'episode' and 'reward'.
        title_prefix: Text prepended to every panel title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    ax_reward, ax_q, ax_length, ax_stability = axes.flatten()
    colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12']  # red, blue, green, orange
    stability_window = 50

    # Resolve once which algorithms have a usable history. The colour is picked
    # from the position in results_dict (failed runs included), so an algorithm
    # keeps the same colour in all four panels.
    runs = [
        (alg_name, result['history'], colors[i % len(colors)])
        for i, (alg_name, result) in enumerate(results_dict.items())
        if 'history' in result and result['history']
    ]

    def rolling_mean(values):
        # The window never exceeds len(values) for a non-empty list, so no
        # "series too short" fallback is needed.
        window = min(30, len(values) // 10 + 1)
        return pd.Series(values).rolling(window=window, min_periods=1).mean()

    def decorate(ax, title, ylabel):
        ax.set_title(f'{title_prefix} - {title}', fontsize=14, fontweight='bold')
        ax.set_xlabel('√âpisode')
        ax.set_ylabel(ylabel)
        # Only request a legend when at least one labelled curve was drawn.
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True, alpha=0.3)

    # 1. Reward per episode, smoothed with a rolling mean
    for alg_name, history, color in runs:
        episodes = [h['episode'] for h in history]
        rewards = [h['reward'] for h in history]
        ax_reward.plot(episodes, rolling_mean(rewards), label=alg_name, color=color, linewidth=2.5)
        ax_reward.plot(episodes, rewards, alpha=0.3, color=color, linewidth=0.8)
    decorate(ax_reward, 'R√©compenses par √âpisode', 'R√©compense')

    # 2. Mean Q-values (only when the history records them)
    for alg_name, history, color in runs:
        if 'avg_q' in history[0]:
            episodes = [h['episode'] for h in history]
            avg_q = [h['avg_q'] for h in history]
            ax_q.plot(episodes, avg_q, label=alg_name, color=color, linewidth=2)
    decorate(ax_q, '√âvolution des Q-values Moyennes', 'Q-value Moyenne')

    # 3. Episode length, under whichever key the history uses
    for alg_name, history, color in runs:
        length_key = next((key for key in ('length', 'episode_length') if key in history[0]), None)
        if length_key is None:
            continue
        episodes = [h['episode'] for h in history]
        lengths = [h[length_key] for h in history]
        ax_length.plot(episodes, rolling_mean(lengths), label=alg_name, color=color, linewidth=2)
    decorate(ax_length, 'Longueur des √âpisodes', 'Nombre de Steps')

    # 4. Stability: std of the rewards over the preceding `stability_window` episodes
    for alg_name, history, color in runs:
        rewards = [h['reward'] for h in history]
        episodes = []
        stds = []
        for j in range(stability_window, len(history)):
            episodes.append(history[j]['episode'])
            stds.append(np.std(rewards[j - stability_window:j]))

        if len(episodes) > 0:
            ax_stability.plot(episodes, stds, label=alg_name, color=color, linewidth=2)
    decorate(ax_stability, 'Stabilit√© (√âcart-type des r√©compenses)', '√âcart-type des r√©compenses')

    plt.tight_layout()
    plt.show()

def analyze_results_detailed(all_results):
    """Print reward statistics for every (environment, algorithm) pair and tabulate them.

    Args:
        all_results: ``{env_name: {algorithm_name: result}}`` where ``result`` may
            hold a 'history' list (items with a 'reward' key), an optional
            'evaluation' dict and an optional 'error' message.

    Returns:
        pandas.DataFrame with one row per pair. Numeric statistics are stored as
        strings formatted to three decimals; failed runs get "ERREUR" and runs
        with neither history nor error get "0.000".
    """
    print("üîç ANALYSE D√âTAILL√âE DES R√âSULTATS")
    print("=" * 70)

    def summary_row(env, alg, mean_text, final_text, stability_text, n_episodes):
        # Single place defining the column names and their order.
        return {
            'Environnement': env,
            'Algorithme': alg,
            'R√©compense_Moyenne': mean_text,
            'R√©compense_Finale': final_text,
            'Stabilit√©': stability_text,
            '√âpisodes': n_episodes
        }

    rows = []

    for env_name, env_results in all_results.items():
        print(f"\nüìä {env_name.upper()}")
        print("-" * 50)

        for alg_name, result in env_results.items():
            history = result['history'] if 'history' in result else None

            # Runs without usable history: report why and move on.
            if not history:
                if 'error' in result:
                    print(f"\n‚ùå {alg_name}: Erreur - {result['error']}")
                    rows.append(summary_row(env_name, alg_name, "ERREUR", "ERREUR", "N/A", 0))
                else:
                    print(f"\n‚ùå {alg_name}: Aucune donn√©e valide")
                    rows.append(summary_row(env_name, alg_name, "0.000", "0.000", "N/A", 0))
                continue

            # Whole-run statistics
            n_episodes = len(history)
            rewards = [entry['reward'] for entry in history]
            mean_reward = np.mean(rewards)
            reward_spread = np.std(rewards)

            # Final performance: last 20% of the episodes
            tail_rewards = rewards[int(0.8 * n_episodes):]
            if tail_rewards:
                tail_mean = np.mean(tail_rewards)
            else:
                tail_mean = 0
            if len(tail_rewards) > 1:
                tail_spread = np.std(tail_rewards)
            else:
                tail_spread = 0

            # Evaluation metrics, when the run recorded them
            eval_suffix = ""
            if 'evaluation' in result:
                evaluation = result['evaluation']
                eval_reward = evaluation.get('avg_reward', evaluation.get('average_reward', 0))
                success_rate = evaluation.get('success_rate', 0)
                eval_suffix = f", Eval: {eval_reward:.3f} (Succ√®s: {success_rate:.1%})"

            print(f"\nüéØ {alg_name}:")
            print(f"   ‚Ä¢ R√©compense moyenne: {mean_reward:.3f} (¬±{reward_spread:.3f})")
            print(f"   ‚Ä¢ Performance finale: {tail_mean:.3f}")
            print(f"   ‚Ä¢ Stabilit√© finale: {tail_spread:.3f}")
            print(f"   ‚Ä¢ √âpisodes total: {n_episodes}{eval_suffix}")

            # Algorithm-specific detail: epsilon at the first and last episode
            first_entry, last_entry = history[0], history[-1]
            if 'epsilon' in first_entry and 'epsilon' in last_entry:
                print(f"   ‚Ä¢ D√©croissance Œµ: {first_entry['epsilon']:.3f} ‚Üí {last_entry['epsilon']:.3f}")

            rows.append(summary_row(
                env_name,
                alg_name,
                f"{mean_reward:.3f}",
                f"{tail_mean:.3f}",
                f"{tail_spread:.3f}",
                n_episodes,
            ))

    # Summary table
    summary_df = pd.DataFrame(rows)
    print("\nüìã TABLEAU R√âCAPITULATIF COMPLET:")
    print("=" * 70)
    print(summary_df.to_string(index=False))

    return summary_df

def plot_performance_heatmap(all_results, alg_names=None):
    """Draw a heatmap of final performance per environment (rows) and algorithm (columns).

    Final performance is the mean 'reward' of the last 20 history entries.
    Cells without usable history keep the neutral value 0.0 for colouring but
    are annotated 'ERR', so a failed run cannot be mistaken for a real zero.

    Args:
        all_results: ``{env_name: {algorithm_name: result}}``; ``result`` may
            contain a 'history' list whose items have a 'reward' key.
        alg_names: Optional iterable of algorithm names to use as columns.
            Defaults to the three Monte Carlo algorithms.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """

    # Prepare the axes of the heatmap
    env_names = list(all_results.keys())
    if alg_names is None:
        alg_names = ['MonteCarloES', 'OnPolicyMC', 'OffPolicyMC']
    else:
        alg_names = list(alg_names)

    # performance_matrix[i][j]: final performance of algorithm j on environment i
    # has_data[i][j]: whether that cell comes from a real training history
    performance_matrix = []
    has_data = []

    for env_name in env_names:
        env_results = all_results[env_name]
        perf_row = []
        data_row = []
        for alg_name in alg_names:
            result = env_results[alg_name] if alg_name in env_results else {}
            history = result['history'] if 'history' in result else None
            if history:
                # Final performance: mean reward over the last 20 episodes
                perf_row.append(np.mean([h['reward'] for h in history[-20:]]))
                data_row.append(True)
            else:
                perf_row.append(0.0)
                data_row.append(False)
        performance_matrix.append(perf_row)
        has_data.append(data_row)

    # Draw the heatmap
    plt.figure(figsize=(10, 8))
    heatmap = plt.imshow(performance_matrix, cmap='RdYlGn', aspect='auto')

    plt.xticks(range(len(alg_names)), alg_names, rotation=45)
    plt.yticks(range(len(env_names)), env_names)
    plt.xlabel('Algorithmes Monte Carlo')
    plt.ylabel('Environnements Secrets')
    # Fix: the title used "\\n", which rendered a literal backslash-n.
    plt.title('üå°Ô∏è Heatmap des Performances Finales\n(R√©compense moyenne des 20 derniers √©pisodes)',
              fontsize=14, fontweight='bold')

    # Write the value (or ERR) inside each cell
    for i in range(len(env_names)):
        for j in range(len(alg_names)):
            value = performance_matrix[i][j]
            # NOTE(review): this contrast threshold presumably assumes rewards
            # roughly within [-1, 1]; the colour scale itself is auto-ranged.
            color = 'white' if abs(value) > 0.5 else 'black'
            text = f'{value:.3f}' if has_data[i][j] else 'ERR'
            plt.text(j, i, text, ha='center', va='center',
                    color=color, fontweight='bold', fontsize=11)

    plt.colorbar(heatmap, label='Performance Finale')
    plt.tight_layout()
    plt.show()

def generate_recommendations(all_results):
    """Print the best algorithm per environment, per-algorithm averages and fixed advice.

    The ranking score is ``0.8 * mean - 0.2 * std`` of the rewards of the last
    20 episodes. The per-algorithm averages only cover MonteCarloES, OnPolicyMC
    and OffPolicyMC.

    Args:
        all_results: ``{env_name: {algorithm_name: result}}``; ``result`` may
            contain a 'history' list whose items have a 'reward' key.

    Returns:
        None. Everything is written to standard output.
    """

    def recent_rewards(result):
        # Rewards of the last 20 episodes, or None when there is no usable history.
        if 'history' in result and result['history']:
            return [h['reward'] for h in result['history'][-20:]]
        return None

    print("üí° RECOMMANDATIONS ET CONCLUSIONS")
    print("=" * 70)

    # Best algorithm per environment (the first one wins on ties)
    winners = {}
    for env_name, env_results in all_results.items():
        top_alg, top_score = None, -float('inf')

        for alg_name, result in env_results.items():
            window = recent_rewards(result)
            if not window:
                continue
            # Composite score: final performance, penalised by its variability
            mean_part = np.mean(window)
            stability_part = -np.std(window)
            score = mean_part * 0.8 + stability_part * 0.2
            if score > top_score:
                top_alg, top_score = alg_name, score

        winners[env_name] = (top_alg, top_score)

    print("\nüèÜ MEILLEURS ALGORITHMES PAR ENVIRONNEMENT:")
    for env_name, (top_alg, top_score) in winners.items():
        if not top_alg:
            print(f"   ‚Ä¢ {env_name}: Aucun algorithme performant")
        else:
            print(f"   ‚Ä¢ {env_name}: {top_alg} (Score: {top_score:.3f})")

    # Overall performance of each known algorithm across environments
    scores_by_alg = {name: [] for name in ('MonteCarloES', 'OnPolicyMC', 'OffPolicyMC')}

    for env_results in all_results.values():
        for alg_name, result in env_results.items():
            window = recent_rewards(result)
            if window is None or alg_name not in scores_by_alg:
                continue
            scores_by_alg[alg_name].append(np.mean(window) if window else 0)

    print("\nüåü PERFORMANCE GLOBALE DES ALGORITHMES:")
    for alg_name, scores in scores_by_alg.items():
        if not scores:
            print(f"   ‚Ä¢ {alg_name}: Aucune donn√©e valide")
            continue
        print(f"   ‚Ä¢ {alg_name}: {np.mean(scores):.3f} (¬±{np.std(scores):.3f})")

    # Fixed, data-independent advice
    print("\nüéØ RECOMMANDATIONS SP√âCIFIQUES:")
    for advice in (
        "   1. üîÑ MonteCarloES excelle sur les environnements n√©cessitant une exploration intensive",
        "   2. ‚öñÔ∏è  OnPolicyMC offre un bon √©quilibre exploration/exploitation",
        "   3. üéØ OffPolicyMC peut √™tre instable mais performant sur certains environnements",
        "   4. üìä Surveillez les courbes de convergence pour d√©tecter l'instabilit√©",
        "   5. üéõÔ∏è  Ajustez Œ≥ et Œµ selon les caract√©ristiques sp√©cifiques de chaque environnement",
    ):
        print(advice)

    print("\nüíæ Pour sauvegarder les r√©sultats, consultez le CSV g√©n√©r√© automatiquement.")
    print("=" * 70)

print("üìä Fonctions d'analyse et de visualisation d√©finies !")


In [None]:
# Full report generation for the Monte Carlo runs held in `all_results`.
# Steps: learning curves, heatmap, detailed table, recommendations, CSV export
# and a final summary. Each step is isolated in its own try/except so one
# failure does not stop the following steps.
# Fix: several section headers used "\\n", which prints a literal backslash-n;
# they now use a real "\n" escape.

if all_results and any(env_results for env_results in all_results.values()):

    print("üìà G√âN√âRATION DES ANALYSES VISUELLES COMPL√àTES")
    print("=" * 70)

    # 1. Learning curves for each environment
    print("\nüéØ 1. COURBES D'APPRENTISSAGE PAR ENVIRONNEMENT")
    print("-" * 60)

    for env_name, env_results in all_results.items():
        # Plot only when at least one algorithm produced a non-empty history
        has_valid_results = any('history' in result and result['history']
                               for result in env_results.values())

        if has_valid_results:
            print(f"\nüìä G√©n√©ration des graphiques pour {env_name}...")
            plot_learning_curves(env_results, title_prefix=env_name)
        else:
            print(f"‚ùå Pas de donn√©es valides pour {env_name}")

    # 2. Comparative performance heatmap
    print("\nüéØ 2. HEATMAP COMPARATIVE DES PERFORMANCES")
    print("-" * 60)

    try:
        plot_performance_heatmap(all_results)
        print("‚úÖ Heatmap g√©n√©r√©e avec succ√®s")
    except Exception as e:
        print(f"‚ùå Erreur g√©n√©ration heatmap: {e}")

    # 3. Detailed analysis of the results
    print("\nüéØ 3. ANALYSE D√âTAILL√âE DES R√âSULTATS")
    print("-" * 60)

    try:
        summary_df = analyze_results_detailed(all_results)
        print("‚úÖ Analyse d√©taill√©e termin√©e")
    except Exception as e:
        print(f"‚ùå Erreur analyse d√©taill√©e: {e}")
        # Keep summary_df bound so the export step below can test .empty
        summary_df = pd.DataFrame()

    # 4. Recommendations and conclusions
    print("\nüéØ 4. RECOMMANDATIONS ET CONCLUSIONS")
    print("-" * 60)

    try:
        generate_recommendations(all_results)
        print("‚úÖ Recommandations g√©n√©r√©es")
    except Exception as e:
        print(f"‚ùå Erreur g√©n√©ration recommandations: {e}")

    # 5. Save the results
    print("\nüéØ 5. SAUVEGARDE DES R√âSULTATS")
    print("-" * 60)

    try:
        if not summary_df.empty:
            csv_filename = 'monte_carlo_secret_env_results.csv'
            summary_df.to_csv(csv_filename, index=False)
            print(f"‚úÖ R√©sultats sauvegard√©s dans '{csv_filename}'")

            # Also export the full per-episode history, one row per episode
            detailed_results = []
            for env_name, env_results in all_results.items():
                for alg_name, result in env_results.items():
                    if 'history' in result and result['history']:
                        for episode_data in result['history']:
                            row = {
                                'Environnement': env_name,
                                'Algorithme': alg_name,
                                **episode_data
                            }
                            detailed_results.append(row)

            if detailed_results:
                detailed_df = pd.DataFrame(detailed_results)
                detailed_csv = 'monte_carlo_detailed_history.csv'
                detailed_df.to_csv(detailed_csv, index=False)
                print(f"‚úÖ Historique d√©taill√© sauvegard√© dans '{detailed_csv}'")
        else:
            print("‚ùå Aucune donn√©e √† sauvegarder")

    except Exception as e:
        print(f"‚ùå Erreur sauvegarde: {e}")

    # 6. Final summary with key metrics
    print("\nüéØ 6. R√âSUM√â FINAL")
    print("=" * 70)

    total_combinations = 0
    successful_combinations = 0

    # A combination counts as successful when it has a non-empty history
    for env_name, env_results in all_results.items():
        for alg_name, result in env_results.items():
            total_combinations += 1
            if 'history' in result and result['history']:
                successful_combinations += 1

    success_percentage = (successful_combinations / total_combinations * 100) if total_combinations > 0 else 0

    print(f"üìà Combinaisons r√©ussies: {successful_combinations}/{total_combinations} ({success_percentage:.1f}%)")
    print(f"üéÆ Environnements test√©s: {len(all_results)}")
    print(f"üß† Algorithmes utilis√©s: MonteCarloES, OnPolicyMC, OffPolicyMC")
    print(f"üíæ Fichiers g√©n√©r√©s: CSV avec r√©sultats et historiques d√©taill√©s")

    if successful_combinations > 0:
        print("\nüéâ ANALYSE MONTE CARLO TERMIN√âE AVEC SUCC√àS !")
        print("üïµÔ∏è Les algorithmes Monte Carlo ont r√©v√©l√© les secrets des environnements !")

        # Best (algorithm, environment) pair by mean reward of the last 10 episodes
        best_overall_performance = -float('inf')
        best_combination = None

        for env_name, env_results in all_results.items():
            for alg_name, result in env_results.items():
                if 'history' in result and result['history']:
                    final_rewards = [h['reward'] for h in result['history'][-10:]]
                    avg_final_performance = np.mean(final_rewards)

                    if avg_final_performance > best_overall_performance:
                        best_overall_performance = avg_final_performance
                        best_combination = (alg_name, env_name)

        if best_combination:
            print(f"üèÜ Meilleure combinaison globale: {best_combination[0]} sur {best_combination[1]}")
            print(f"   Performance finale: {best_overall_performance:.3f}")
    else:
        print("‚ö†Ô∏è Analyse termin√©e mais aucun r√©sultat valide obtenu")
        print("   V√©rifiez la compatibilit√© des environnements secrets")

else:
    print("‚ùå AUCUN R√âSULTAT √Ä ANALYSER")
    print("=" * 70)
    print("üîç V√©rifications √† effectuer:")
    print("   1. Les environnements secrets sont-ils accessibles ?")
    print("   2. Les adaptateurs ont-ils √©t√© cr√©√©s correctement ?") 
    print("   3. Les algorithmes Monte Carlo sont-ils import√©s ?")
    print("   4. L'entra√Ænement s'est-il ex√©cut√© sans erreur ?")
    print("\nüí° Conseil: Relancez les cellules pr√©c√©dentes pour diagnostiquer le probl√®me")

print("\n" + "=" * 70)
print("üîö FIN DE L'ANALYSE MONTE CARLO SUR LES ENVIRONNEMENTS SECRETS")
print("=" * 70)


In [None]:
# üìä Fonctions de visualisation

def plot_learning_curves(results_dict, title_prefix=""):
    """Draw a 2x2 grid of learning curves comparing several algorithms.

    The four panels show, per episode: the reward, the mean Q-value, the
    episode length and the cumulative success rate. Reward and length curves
    are smoothed with a 20-episode rolling mean once at least 20 points exist.

    Args:
        results_dict: mapping ``algorithm name -> result dict``. Only results
            holding a non-empty ``'history'`` list are drawn; every history
            entry is read through the keys ``'episode'``, ``'reward'``,
            ``'avg_q'``, ``'length'`` and ``'successful'``.
        title_prefix: text placed in front of every panel title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
    smoothing_window = 20

    def smoothed(values):
        # Rolling mean only when there are enough points for it to be meaningful.
        if len(values) >= smoothing_window:
            return pd.Series(values).rolling(window=smoothing_window, min_periods=1).mean()
        return values

    def cumulative_success_rate(history):
        # Fraction of successful episodes among the first k episodes, for every k.
        rates = []
        successes = 0
        for count, entry in enumerate(history, start=1):
            if entry['successful']:
                successes += 1
            rates.append(successes / count)
        return rates

    # One row per panel: (title suffix, y-axis label, history -> plotted series)
    panels = [
        ('R√©compenses par √âpisode', 'R√©compense',
         lambda history: smoothed([h['reward'] for h in history])),
        ('√âvolution des Q-values', 'Q-value Moyenne',
         lambda history: [h['avg_q'] for h in history]),
        ('Longueur des √âpisodes', 'Nombre de Steps',
         lambda history: smoothed([h['length'] for h in history])),
        ('Taux de Succ√®s Cumul√©', 'Taux de Succ√®s',
         cumulative_success_rate),
    ]

    for ax, (panel_title, y_label, build_series) in zip(axes, panels):
        for i, (alg_name, result) in enumerate(results_dict.items()):
            if 'history' in result and result['history']:
                history = result['history']
                episodes = [h['episode'] for h in history]
                # Cycle through the palette so that more than three algorithms
                # no longer raise an IndexError.
                ax.plot(episodes, build_series(history), label=alg_name,
                        color=colors[i % len(colors)], linewidth=2)

        ax.set_title(f'{title_prefix} - {panel_title}')
        ax.set_xlabel('√âpisode')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def analyze_results(all_results):
    """Print per-environment statistics and return them as a summary table.

    Args:
        all_results: nested mapping ``environment name -> algorithm name ->
            result dict``. A result is analysed when it holds a non-empty
            ``'history'`` list whose entries expose ``'reward'`` and
            ``'successful'``; otherwise a placeholder row is emitted.

    Returns:
        pandas.DataFrame with one row per (environment, algorithm) pair. The
        success rate and reward columns are pre-formatted strings; the episode
        count is an integer.
    """
    print("üîç ANALYSE D√âTAILL√âE DES R√âSULTATS")
    print("=" * 60)

    summary_data = []

    for env_name, env_results in all_results.items():
        # "\n" (not "\\n") so that a blank line is printed instead of a literal backslash-n
        print(f"\nüìä {env_name.upper()}")
        print("-" * 40)

        for alg_name, result in env_results.items():
            if 'history' in result and result['history']:
                history = result['history']

                # Overall statistics
                total_episodes = len(history)
                successful_episodes = sum(1 for h in history if h['successful'])
                success_rate = successful_episodes / total_episodes

                all_rewards = [h['reward'] for h in history]
                avg_reward = np.mean(all_rewards)
                std_reward = np.std(all_rewards)

                # Final performance: mean reward over the last 20% of the episodes
                final_portion = history[int(0.8 * len(history)):]
                final_rewards = [h['reward'] for h in final_portion]
                final_avg_reward = np.mean(final_rewards) if final_rewards else 0

                print(f"\nüéØ {alg_name}:")
                print(f"   ‚Ä¢ Taux de succ√®s: {success_rate:.1%}")
                print(f"   ‚Ä¢ R√©compense moyenne: {avg_reward:.3f} (¬±{std_reward:.3f})")
                print(f"   ‚Ä¢ Performance finale: {final_avg_reward:.3f}")

                summary_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Taux_Succ√®s': f"{success_rate:.1%}",
                    'R√©compense_Moyenne': f"{avg_reward:.3f}",
                    'R√©compense_Finale': f"{final_avg_reward:.3f}",
                    '√âpisodes': total_episodes
                })
            else:
                # No usable history: keep the pair in the table with neutral values
                print(f"\n‚ùå {alg_name}: Aucune donn√©e")
                summary_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Taux_Succ√®s': "0%",
                    'R√©compense_Moyenne': "0.000",
                    'R√©compense_Finale': "0.000",
                    '√âpisodes': 0
                })

    # Summary table
    summary_df = pd.DataFrame(summary_data)
    print("\nüìã TABLEAU R√âCAPITULATIF:")
    print(summary_df.to_string(index=False))

    return summary_df

print("üìä Fonctions de visualisation d√©finies !")


In [None]:
# üöÄ Entra√Ænement Principal

def run_monte_carlo_analysis(num_episodes=300):
    """Train every Monte Carlo algorithm on every adapted secret environment.

    Iterates over the module-level ``adapters`` mapping (environment name ->
    adapter) and trains ``SecretMonteCarloES`` and ``SecretOnPolicyMC`` on each
    adapter. A failure of one algorithm is reported and replaced by an empty
    result so that the remaining runs still happen.

    Args:
        num_episodes: number of training episodes given to each algorithm.

    Returns:
        dict mapping ``environment name -> {'MC-ES': result, 'On-Policy MC': result}``
        where each result is what the algorithm's ``train`` returned, or
        ``{'history': [], 'success_rate': 0}`` when it raised.
    """
    print("üöÄ D√âBUT DE L'ANALYSE MONTE CARLO")
    print("=" * 60)
    print(f"Param√®tres: {num_episodes} √©pisodes par algorithme")
    print("=" * 60)

    # (result key, name shown while training, factory). The factories are lambdas so
    # that the class lookup happens inside the try block below, as it did originally.
    algorithms = [
        ('MC-ES', 'Monte Carlo ES',
         lambda adapter, env_name: SecretMonteCarloES(
             adapter, gamma=0.99, name=f"MC-ES-{env_name}")),
        ('On-Policy MC', 'On-Policy MC',
         lambda adapter, env_name: SecretOnPolicyMC(
             adapter, gamma=0.99, epsilon=0.4, name=f"OnPolicy-{env_name}")),
    ]

    all_results = {}

    for env_name, adapter in adapters.items():
        # "\n" (not "\\n") so that a blank line is printed instead of a literal backslash-n
        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"√âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 50)

        env_results = {}

        for alg_key, display_name, build in algorithms:
            print(f"\nüéØ Entra√Ænement {display_name}...")
            try:
                result = build(adapter, env_name).train(num_episodes=num_episodes)
                env_results[alg_key] = result
                print(f"‚úÖ {alg_key} termin√© - Succ√®s: {result['success_rate']:.2%}")
            except Exception as e:
                print(f"‚ùå Erreur {alg_key}: {e}")
                env_results[alg_key] = {'history': [], 'success_rate': 0}

        all_results[env_name] = env_results

        # Per-environment summary: mean reward over the last 20 episodes
        print(f"\nüìä R√âSUM√â {env_name}:")
        for alg_name, result in env_results.items():
            if result['history']:
                final_rewards = [h['reward'] for h in result['history'][-20:]]
                avg_final_reward = np.mean(final_rewards) if final_rewards else 0
                print(f"   ‚Ä¢ {alg_name}: R√©compense finale = {avg_final_reward:.3f}")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå √âchec")

    print("\nüéâ ANALYSE COMPL√àTE TERMIN√âE !")
    return all_results

# Lancer l'analyse
if not adapters:
    # Nothing to train on: leave an empty result set for the following cells
    print("‚ùå Impossible de lancer l'analyse - adaptateurs non disponibles")
    all_results = {}
else:
    print("‚è≥ Lancement de l'analyse (cela peut prendre 5-10 minutes)...")
    # Episodes per algorithm and environment; adjust as needed
    EPISODES = 300
    all_results = run_monte_carlo_analysis(num_episodes=EPISODES)


In [None]:
# üìà Affichage des R√©sultats

def _composite_score(result, tail=50):
    """Blend recent reward and success rate into one ranking score.

    Returns ``0.7 * mean reward of the last `tail` history entries
    + 0.3 * result['success_rate']``.
    """
    final_rewards = [h['reward'] for h in result['history'][-tail:]]
    avg_reward = np.mean(final_rewards) if final_rewards else 0
    success_rate = result['success_rate']
    return avg_reward * 0.7 + success_rate * 0.3


if all_results:
    print("üìà G√âN√âRATION DES ANALYSES VISUELLES")
    print("=" * 60)

    # 1. Learning curves for each environment
    # ("\n" rather than "\\n" throughout, so blank lines are printed instead of a literal backslash-n)
    print("\nüéØ 1. COURBES D'APPRENTISSAGE PAR ENVIRONNEMENT")

    for env_name, env_results in all_results.items():
        if any(result['history'] for result in env_results.values() if 'history' in result):
            print(f"\nüìä Graphiques pour {env_name}...")
            plot_learning_curves(env_results, title_prefix=env_name)
        else:
            print(f"‚ùå Pas de donn√©es pour {env_name}")

    # 2. Detailed analysis
    print("\nüéØ 2. ANALYSE D√âTAILL√âE")
    print("-" * 50)
    summary_df = analyze_results(all_results)

    # 3. Recommendations: best algorithm per environment by composite score
    print("\nüéØ 3. RECOMMANDATIONS")
    print("=" * 60)

    best_performers = {}
    for env_name, env_results in all_results.items():
        best_alg = None
        best_score = -float('inf')

        for alg_name, result in env_results.items():
            if 'history' in result and result['history']:
                composite_score = _composite_score(result)

                if composite_score > best_score:
                    best_score = composite_score
                    best_alg = alg_name

        best_performers[env_name] = (best_alg, best_score)

    print("\nüèÜ MEILLEURS ALGORITHMES PAR ENVIRONNEMENT:")
    for env_name, (best_alg, score) in best_performers.items():
        if best_alg:
            print(f"   ‚Ä¢ {env_name}: {best_alg} (Score: {score:.3f})")
        else:
            print(f"   ‚Ä¢ {env_name}: Aucun algorithme efficace")

    # 4. Overall performance: composite score of each algorithm across environments
    alg_scores = {'MC-ES': [], 'On-Policy MC': []}

    for env_name, env_results in all_results.items():
        for alg_name, result in env_results.items():
            if 'history' in result and result['history'] and alg_name in alg_scores:
                alg_scores[alg_name].append(_composite_score(result))

    print("\nüåü PERFORMANCE GLOBALE:")
    for alg_name, scores in alg_scores.items():
        if scores:
            avg_score = np.mean(scores)
            std_score = np.std(scores)
            print(f"   ‚Ä¢ {alg_name}: {avg_score:.3f} (¬±{std_score:.3f})")

    # 5. Interpretation hints
    print("\nüí° CONSEILS D'INTERPR√âTATION:")
    print("   1. üéØ Taux de succ√®s √©lev√© = algorithme stable")
    print("   2. üîÑ R√©compenses croissantes = apprentissage effectif")
    print("   3. üìä Q-values convergentes = politique stable")
    print("   4. üéõÔ∏è  Ajustez les hyperparam√®tres si n√©cessaire")

    # Save the summary table; a write failure is reported but does not stop the cell
    try:
        summary_df.to_csv('monte_carlo_results.csv', index=False)
        print(f"\nüíæ R√©sultats sauvegard√©s dans 'monte_carlo_results.csv'")
    except Exception as e:
        print(f"‚ùå Erreur sauvegarde: {e}")

else:
    print("‚ùå AUCUN R√âSULTAT √Ä AFFICHER")
    print("V√©rifiez que l'entra√Ænement pr√©c√©dent s'est bien d√©roul√©.")

print("\nüéâ ANALYSE MONTE CARLO TERMIN√âE !")
print("üïµÔ∏è Les environnements secrets ont r√©v√©l√© leurs myst√®res !")
print("=" * 60)


In [None]:
# üìö Imports et configuration
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from typing import Dict, List, Tuple, Any
import warnings
warnings.filterwarnings('ignore')

# Matplotlib / seaborn configuration shared by every figure drawn after this point
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (14, 10)
plt.rcParams['font.size'] = 12
sns.set_palette("husl")

# Put the project root (two directories above the current working directory) and its
# game/secret_env folder at the front of sys.path before the import attempted below.
project_root = os.path.abspath('../../')
sys.path.insert(0, project_root)
sys.path.insert(0, os.path.join(project_root, 'game', 'secret_env'))

# Import the secret environments
try:
    from secret_envs_wrapper import SecretEnv0, SecretEnv1, SecretEnv2, SecretEnv3
    print("‚úÖ Environnements secrets import√©s avec succ√®s")
except Exception as e:
    print(f"‚ùå Erreur d'import des environnements secrets: {e}")
    # NOTE(review): this "fallback" only imports ctypes and platform; it does not define
    # SecretEnv0..SecretEnv3, so code that uses those names will still fail after this branch.
    import ctypes
    import platform
    
print("üîß Configuration termin√©e !")

# Quick smoke test: instantiate each secret environment and print what its
# num_states() and num_actions() methods return.
try:
    env0 = SecretEnv0()
    print(f"üìä SecretEnv0 - √âtats: {env0.num_states()}, Actions: {env0.num_actions()}")
    
    env1 = SecretEnv1()
    print(f"üìä SecretEnv1 - √âtats: {env1.num_states()}, Actions: {env1.num_actions()}")
    
    env2 = SecretEnv2()
    print(f"üìä SecretEnv2 - √âtats: {env2.num_states()}, Actions: {env2.num_actions()}")
    
    env3 = SecretEnv3()
    print(f"üìä SecretEnv3 - √âtats: {env3.num_states()}, Actions: {env3.num_actions()}")
    
    print("\nüéâ Tous les environnements secrets sont fonctionnels !")
    
except Exception as e:
    print(f"‚ùå Erreur lors du test des environnements: {e}")
    # Re-raise after reporting so the cell fails visibly instead of continuing silently
    raise


In [None]:
# üîß Adaptateur d'environnement pour les Secret Envs

class SecretEnvAdapter:
    """
    Adapter exposing a secret environment through a Gym-like reset/step interface.

    The wrapped environment class is expected to provide ``num_states()``,
    ``num_actions()``, ``reset()``, ``state_id()``, ``score()``, ``step(action)``,
    ``is_game_over()``, ``available_actions()`` and ``display()``; those are the
    only calls made on it here.

    Attributes set by ``__init__``:
        nS / nA: values returned by ``num_states()`` / ``num_actions()``.
        current_state: last state id observed (``None`` before the first reset).
        last_score: last cumulative score read from the environment.
        episode_steps: number of valid steps taken since the last reset.
    """

    # Safety cap: a step taken after this many valid steps ends the episode.
    MAX_EPISODE_STEPS = 1000

    def __init__(self, secret_env_class, env_name="SecretEnv"):
        """Instantiate ``secret_env_class`` and cache its state/action counts.

        Args:
            secret_env_class: zero-argument callable returning an environment.
            env_name: label used in log messages and in ``get_mdp_info``.
        """
        self.secret_env_class = secret_env_class
        self.env_name = env_name
        self.env = secret_env_class()

        # MDP properties, kept for compatibility with the Monte Carlo classes
        self.nS = self.env.num_states()
        self.nA = self.env.num_actions()

        # State and reward bookkeeping
        self.current_state = None
        self.last_score = 0.0
        self.episode_steps = 0

        print(f"üèóÔ∏è  {env_name} adapter cr√©√© - √âtats: {self.nS}, Actions: {self.nA}")

    def _reset_and_sync(self):
        """Reset the wrapped environment and refresh the cached state, score and step count."""
        self.env.reset()
        self.current_state = self.env.state_id()
        self.last_score = self.env.score()
        self.episode_steps = 0
        return self.current_state

    def reset(self):
        """Reset the environment and return the initial state id.

        If the reset raises, the error is printed, a fresh environment is
        created and the reset is attempted once more; an exception from that
        second attempt propagates to the caller.
        """
        try:
            return self._reset_and_sync()
        except Exception as e:
            print(f"‚ùå Erreur reset {self.env_name}: {e}")
            # Build a new environment when the reset of the current one fails
            self.env = self.secret_env_class()
            return self._reset_and_sync()

    def step(self, action):
        """
        Execute ``action`` and return ``(next_state, reward, done, info)``.

        * An action outside the currently available ones is not sent to the
          environment: the state is unchanged, the reward is -0.1 and
          ``info['invalid_action']`` is True.
        * Otherwise the reward is the change in ``env.score()`` caused by the step.
        * Once more than ``MAX_EPISODE_STEPS`` valid steps have been taken, the
          episode is forced to end with an extra -1.0 penalty and ``info['timeout']``.
        * Any exception raised by the environment is printed and reported as a
          terminal transition with reward -1.0 and ``info['error']``.
        """
        try:
            available_actions = self.get_available_actions()

            # Reject actions the environment does not currently allow
            if action not in available_actions:
                return self.current_state, -0.1, False, {
                    'invalid_action': True,
                    'available_actions': available_actions,
                    'requested_action': action
                }

            # The environment only exposes a cumulative score, so the reward is its difference
            old_score = self.env.score()

            self.env.step(action)
            self.episode_steps += 1

            next_state = self.env.state_id()
            new_score = self.env.score()
            reward = new_score - old_score
            done = self.env.is_game_over()

            self.current_state = next_state
            self.last_score = new_score

            info = {
                'available_actions': self.get_available_actions(),
                'cumulative_score': new_score,
                'episode_steps': self.episode_steps,
                'valid_action': True
            }

            # Safety limit against endless episodes
            if self.episode_steps > self.MAX_EPISODE_STEPS:
                done = True
                reward -= 1.0  # penalty for an over-long episode
                info['timeout'] = True

            return next_state, reward, done, info

        except Exception as e:
            print(f"‚ùå Erreur step {self.env_name}: {e}")
            # Report the failure as a terminal transition
            return self.current_state, -1.0, True, {'error': str(e)}

    def get_available_actions(self):
        """Return the list of actions available in the current state.

        Returns ``[0]`` when the environment reports no action, and every
        action ``0..nA-1`` when querying the environment raises an ordinary
        exception. KeyboardInterrupt/SystemExit are deliberately not caught.
        """
        try:
            actions = self.env.available_actions()
            return list(actions) if len(actions) > 0 else [0]
        except Exception:
            # Fallback: treat every action as available
            return list(range(self.nA))

    def display(self):
        """Show the environment; fall back to printing the cached state and score on error."""
        try:
            self.env.display()
        except Exception:
            print(f"√âtat courant: {self.current_state}, Score: {self.last_score}")

    def get_mdp_info(self):
        """Return the state/action index lists, their sizes and the environment name."""
        return {
            'states': list(range(self.nS)),
            'actions': list(range(self.nA)),
            'n_states': self.nS,
            'n_actions': self.nA,
            'name': self.env_name
        }

# Build one adapter per secret environment, then run a quick check on the first one
print("üß™ Test des adaptateurs...")
adapters = {}  # environment name -> SecretEnvAdapter

try:
    adapters['SecretEnv0'] = SecretEnvAdapter(SecretEnv0, "SecretEnv0")
    adapters['SecretEnv1'] = SecretEnvAdapter(SecretEnv1, "SecretEnv1")
    adapters['SecretEnv2'] = SecretEnvAdapter(SecretEnv2, "SecretEnv2")
    adapters['SecretEnv3'] = SecretEnvAdapter(SecretEnv3, "SecretEnv3")
    
    print("\n‚úÖ Tous les adaptateurs cr√©√©s avec succ√®s !")
    
    # Quick check of one adapter: reset it and list the actions available in its initial state
    test_adapter = adapters['SecretEnv0']
    state = test_adapter.reset()
    available = test_adapter.get_available_actions()
    print(f"üîç Test SecretEnv0 - √âtat initial: {state}, Actions disponibles: {available}")
    
except Exception as e:
    print(f"‚ùå Erreur lors de la cr√©ation des adaptateurs: {e}")
    # Re-raise after reporting; adapters created before the failure remain in `adapters`
    raise


In [None]:
# üéÆ Impl√©mentation des Algorithmes Monte Carlo pour les Environnements Secrets

class SecretMonteCarloES:
    """Monte Carlo control with (approximate) exploring starts for the secret environments.

    The agent keeps a tabular action-value estimate ``Q`` of shape ``(nS, nA)``,
    averaged from first-visit returns, and a greedy ``policy`` derived from it.
    ``env_adapter`` must expose ``nS``, ``nA``, ``env_name``, ``reset()``,
    ``step(action)`` and ``get_available_actions()``.
    """

    def __init__(self, env_adapter, gamma=0.99, name="MC-ES"):
        """Create the tables for ``env_adapter``.

        Args:
            env_adapter: environment adapter (see class docstring).
            gamma: discount factor applied to future rewards.
            name: label used in log messages.
        """
        self.env_adapter = env_adapter
        self.gamma = gamma
        self.name = name

        # Monte Carlo tables
        self.nS = env_adapter.nS
        self.nA = env_adapter.nA
        self.Q = np.random.uniform(-0.1, 0.1, (self.nS, self.nA))  # small random initial values
        self.policy = np.zeros(self.nS, dtype=int)
        self.returns_sum = defaultdict(float)   # (state, action) -> sum of observed returns
        self.returns_count = defaultdict(int)   # (state, action) -> number of observed returns

        # Training history, one dict per episode
        self.history = []

        print(f"üéØ {name} initialis√© pour {env_adapter.env_name}")

    def generate_episode_with_exploring_starts(self):
        """Play one episode and return it as a list of ``(state, action, reward)``.

        The first action is drawn uniformly among the available ones; every
        later action is greedy with respect to ``Q`` (ties broken at random)
        among the actions listed in the step's ``info['available_actions']``.
        At most 500 steps are played.
        """
        episode = []

        # Approximation of exploring starts: reset up to 10 times and keep
        # each reset with probability 0.3 (the last one is kept otherwise).
        for _ in range(10):
            state = self.env_adapter.reset()
            if np.random.random() < 0.3:
                break

        # Random initial action (exploring starts)
        available_actions = self.env_adapter.get_available_actions()
        if len(available_actions) > 0:
            action = np.random.choice(available_actions)
        else:
            action = 0

        max_steps = 500
        for _ in range(max_steps):
            next_state, reward, done, info = self.env_adapter.step(action)
            episode.append((state, action, reward))

            if done:
                break

            state = next_state
            available_actions = info.get('available_actions', list(range(self.nA)))
            if len(available_actions) == 0:
                break

            # Greedy choice restricted to the available actions, random tie-breaking
            q_vals = np.array([self.Q[state, a] for a in available_actions])
            max_q = np.max(q_vals)
            best_actions = [a for a in available_actions if self.Q[state, a] == max_q]
            action = np.random.choice(best_actions)

        return episode

    def _update_from_episode(self, episode):
        """Apply the first-visit Monte Carlo update for one episode.

        Returns are accumulated backwards, but a (state, action) pair is only
        updated at the time step of its FIRST occurrence in the episode, so the
        recorded return is the one following the first visit.
        """
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = self.gamma * G + reward

            key = (state, action)
            if first_visit[key] == t:
                self.returns_count[key] += 1
                self.returns_sum[key] += G
                self.Q[state, action] = self.returns_sum[key] / self.returns_count[key]

    def train(self, num_episodes=1000):
        """Run ``num_episodes`` episodes of Monte Carlo ES.

        Returns:
            dict with ``'Q'``, ``'policy'``, ``'history'`` (one record per
            episode with ``episode``, ``reward``, ``length``, ``avg_q`` and
            ``successful``) and ``'success_rate'`` (share of episodes that
            produced at least one transition; 0.0 when ``num_episodes`` <= 0).
        """
        self.history = []
        successful_episodes = 0

        for episode_num in range(num_episodes):
            try:
                episode = self.generate_episode_with_exploring_starts()

                if len(episode) > 0:
                    successful_episodes += 1

                    self._update_from_episode(episode)

                    # Greedy policy improvement for all states at once
                    # (same result as a per-state argmax loop, without the Python loop)
                    self.policy[:] = np.argmax(self.Q, axis=1)

                    self.history.append({
                        'episode': episode_num + 1,
                        'reward': sum(r for _, _, r in episode),
                        'length': len(episode),
                        'avg_q': np.mean(self.Q),
                        'successful': True
                    })
                else:
                    # Empty episode: recorded as a failure
                    self.history.append({
                        'episode': episode_num + 1,
                        'reward': 0.0,
                        'length': 0,
                        'avg_q': np.mean(self.Q),
                        'successful': False
                    })

                # Progress report every 200 episodes
                if (episode_num + 1) % 200 == 0:
                    success_rate = successful_episodes / (episode_num + 1)
                    recent_rewards = [h['reward'] for h in self.history[-50:]]
                    avg_recent_reward = np.mean(recent_rewards) if recent_rewards else 0
                    print(f"[{self.name}] √âpisode {episode_num + 1}: "
                          f"Taux de succ√®s: {success_rate:.2f}, "
                          f"R√©compense r√©cente: {avg_recent_reward:.3f}")

            except Exception as e:
                # Keep training; the failed episode still gets a history record
                print(f"‚ùå Erreur √©pisode {episode_num + 1}: {e}")
                self.history.append({
                    'episode': episode_num + 1,
                    'reward': 0.0,
                    'length': 0,
                    'avg_q': np.mean(self.Q),
                    'successful': False
                })

        return {
            'Q': self.Q,
            'policy': self.policy,
            'history': self.history,
            # Guard against division by zero when no episode was requested
            'success_rate': successful_episodes / num_episodes if num_episodes > 0 else 0.0
        }

class SecretOnPolicyMC:
    """On-policy Monte Carlo control with an epsilon-greedy policy for the secret environments.

    The agent keeps a tabular action-value estimate ``Q`` of shape ``(nS, nA)``,
    averaged from first-visit returns, and a greedy ``policy`` derived from it.
    ``env_adapter`` must expose ``nS``, ``nA``, ``env_name``, ``reset()``,
    ``step(action)`` and ``get_available_actions()``.
    """

    def __init__(self, env_adapter, gamma=0.99, epsilon=0.3, name="On-Policy MC"):
        """Create the tables for ``env_adapter``.

        Args:
            env_adapter: environment adapter (see class docstring).
            gamma: discount factor applied to future rewards.
            epsilon: initial exploration probability; decays during training.
            name: label used in log messages.
        """
        self.env_adapter = env_adapter
        self.gamma = gamma
        self.epsilon = epsilon
        self.initial_epsilon = epsilon
        self.name = name

        # Monte Carlo tables
        self.nS = env_adapter.nS
        self.nA = env_adapter.nA
        self.Q = np.random.uniform(-0.1, 0.1, (self.nS, self.nA))
        self.policy = np.zeros(self.nS, dtype=int)
        self.returns_sum = defaultdict(float)   # (state, action) -> sum of observed returns
        self.returns_count = defaultdict(int)   # (state, action) -> number of observed returns

        # Training history, one dict per episode
        self.history = []

        print(f"üéØ {name} initialis√© pour {env_adapter.env_name} (Œµ={epsilon})")

    def epsilon_greedy_action(self, state, available_actions):
        """Pick an action among ``available_actions`` with the epsilon-greedy rule.

        Returns 0 when no action is available. With probability ``epsilon`` a
        uniformly random available action is returned; otherwise one of the
        available actions with the highest ``Q[state, a]`` (random tie-breaking).
        """
        if len(available_actions) == 0:
            return 0

        if np.random.random() < self.epsilon:
            return np.random.choice(available_actions)

        # Greedy: best action among the available ones
        q_vals = np.array([self.Q[state, a] for a in available_actions])
        max_q = np.max(q_vals)
        best_actions = [a for a in available_actions if self.Q[state, a] == max_q]
        return np.random.choice(best_actions)

    def generate_episode(self):
        """Play one epsilon-greedy episode of at most 500 steps.

        Returns:
            list of ``(state, action, reward)`` tuples.
        """
        episode = []
        state = self.env_adapter.reset()
        done = False
        steps = 0
        max_steps = 500

        while not done and steps < max_steps:
            available_actions = self.env_adapter.get_available_actions()
            action = self.epsilon_greedy_action(state, available_actions)

            next_state, reward, done, info = self.env_adapter.step(action)
            episode.append((state, action, reward))

            state = next_state
            steps += 1

        return episode

    def _update_from_episode(self, episode):
        """Apply the first-visit Monte Carlo update for one episode.

        Returns are accumulated backwards, but a (state, action) pair is only
        updated at the time step of its FIRST occurrence in the episode, so the
        recorded return is the one following the first visit.
        """
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = self.gamma * G + reward

            key = (state, action)
            if first_visit[key] == t:
                self.returns_count[key] += 1
                self.returns_sum[key] += G
                self.Q[state, action] = self.returns_sum[key] / self.returns_count[key]

    def _failure_record(self, episode_num):
        """Build the history entry used for an episode that produced no data."""
        return {
            'episode': episode_num + 1,
            'reward': 0.0,
            'length': 0,
            'avg_q': np.mean(self.Q),
            'epsilon': self.epsilon,
            'successful': False
        }

    def train(self, num_episodes=1000):
        """Run ``num_episodes`` episodes of on-policy Monte Carlo control.

        Returns:
            dict with ``'Q'``, ``'policy'``, ``'history'`` (one record per
            episode with ``episode``, ``reward``, ``length``, ``avg_q``,
            ``epsilon`` and ``successful``) and ``'success_rate'`` (share of
            episodes that produced at least one transition; 0.0 when
            ``num_episodes`` <= 0).
        """
        self.history = []
        successful_episodes = 0

        for episode_num in range(num_episodes):
            try:
                episode = self.generate_episode()

                if len(episode) > 0:
                    successful_episodes += 1

                    self._update_from_episode(episode)

                    # Greedy policy improvement for all states at once
                    # (same result as a per-state argmax loop, without the Python loop)
                    self.policy[:] = np.argmax(self.Q, axis=1)

                    # Multiplicative epsilon decay with a floor of 0.01
                    self.epsilon = max(0.01, self.epsilon * 0.9995)

                    self.history.append({
                        'episode': episode_num + 1,
                        'reward': sum(r for _, _, r in episode),
                        'length': len(episode),
                        'avg_q': np.mean(self.Q),
                        'epsilon': self.epsilon,
                        'successful': True
                    })
                else:
                    self.history.append(self._failure_record(episode_num))

                # Progress report every 200 episodes
                if (episode_num + 1) % 200 == 0:
                    success_rate = successful_episodes / (episode_num + 1)
                    recent_rewards = [h['reward'] for h in self.history[-50:]]
                    avg_recent_reward = np.mean(recent_rewards) if recent_rewards else 0
                    print(f"[{self.name}] √âpisode {episode_num + 1}: "
                          f"Taux de succ√®s: {success_rate:.2f}, "
                          f"Œµ: {self.epsilon:.3f}, "
                          f"R√©compense r√©cente: {avg_recent_reward:.3f}")

            except Exception as e:
                # Keep training; record the failed episode so the history keeps
                # one entry per episode (as SecretMonteCarloES does).
                print(f"‚ùå Erreur √©pisode {episode_num + 1}: {e}")
                self.history.append(self._failure_record(episode_num))

        return {
            'Q': self.Q,
            'policy': self.policy,
            'history': self.history,
            # Guard against division by zero when no episode was requested
            'success_rate': successful_episodes / num_episodes if num_episodes > 0 else 0.0
        }

class SecretOffPolicyMC:
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated with an epsilon-greedy behavior policy over ``Q``; a
    greedy target policy is learned from them using cumulative importance
    weights ``C``.

    The adapter is expected to expose ``nS``, ``nA``, ``env_name``, ``reset()``,
    ``get_available_actions()`` and ``step(action)`` returning
    ``(next_state, reward, done, info)``.
    """

    def __init__(self, env_adapter, gamma=0.99, epsilon=0.4, name="Off-Policy MC"):
        """Store hyper-parameters and allocate the tabular structures.

        Args:
            env_adapter: environment adapter (see class docstring).
            gamma: discount factor applied to the return.
            epsilon: exploration rate of the behavior policy.
            name: label used in progress messages.
        """
        self.env_adapter = env_adapter
        self.gamma = gamma
        self.epsilon = epsilon
        self.name = name

        # Tabular Monte Carlo structures, sized from the adapter
        self.nS = env_adapter.nS
        self.nA = env_adapter.nA
        self.Q = np.random.uniform(-0.1, 0.1, (self.nS, self.nA))
        self.target_policy = np.zeros(self.nS, dtype=int)
        self.C = np.zeros((self.nS, self.nA))  # cumulative importance weights

        # One record per training episode
        self.history = []

        print(f"üéØ {name} initialis√© pour {env_adapter.env_name} (Œµ={epsilon})")

    def behavior_policy(self, state, available_actions):
        """Epsilon-greedy behavior policy; ties between best actions are broken at random.

        Returns action 0 when no action is available.
        """
        if len(available_actions) == 0:
            return 0

        if np.random.random() < self.epsilon:
            return np.random.choice(available_actions)
        else:
            q_vals = np.array([self.Q[state, a] for a in available_actions])
            max_q = np.max(q_vals)
            best_actions = [a for a in available_actions if self.Q[state, a] == max_q]
            return np.random.choice(best_actions)

    def generate_episode(self):
        """Run one episode (at most 500 steps) with the behavior policy.

        Returns:
            list of ``(state, action, reward, available_actions)`` tuples, where
            ``available_actions`` is a copy of the actions offered at that step.
        """
        episode = []
        state = self.env_adapter.reset()
        done = False
        steps = 0
        max_steps = 500

        while not done and steps < max_steps:
            available_actions = self.env_adapter.get_available_actions()
            action = self.behavior_policy(state, available_actions)

            next_state, reward, done, info = self.env_adapter.step(action)
            episode.append((state, action, reward, available_actions.copy()))

            state = next_state
            steps += 1

        return episode

    def _greedy_action(self, state, available_actions):
        """Greedy action id for ``state``, restricted to the available actions when any are listed."""
        if len(available_actions) == 0:
            return int(np.argmax(self.Q[state]))
        q_vals = [self.Q[state, a] for a in available_actions]
        # np.argmax gives a position in available_actions; map it back to an action id.
        return int(available_actions[int(np.argmax(q_vals))])

    def _behavior_probability(self, state, action, available_actions):
        """Probability that ``behavior_policy`` picks ``action`` in ``state`` under the current Q.

        Mirrors ``behavior_policy``: the greedy mass ``1 - epsilon`` is shared
        uniformly between tied best actions, the exploration mass ``epsilon``
        uniformly between all available actions.
        """
        n_actions = len(available_actions)
        if n_actions == 0:
            return 1.0
        q_vals = np.array([self.Q[state, a] for a in available_actions])
        max_q = np.max(q_vals)
        explore_prob = self.epsilon / n_actions
        if self.Q[state, action] == max_q:
            n_best = int(np.sum(q_vals == max_q))
            return (1.0 - self.epsilon) / n_best + explore_prob
        return explore_prob

    def _update_from_episode(self, episode):
        """Apply the backward weighted-importance-sampling update for one episode."""
        G = 0.0  # discounted return from the current step onwards
        W = 1.0  # running importance-sampling weight

        for state, action, reward, available_actions in reversed(episode):
            G = self.gamma * G + reward

            # Weighted incremental average of the returns
            self.C[state, action] += W
            if self.C[state, action] > 0:
                self.Q[state, action] += (W / self.C[state, action]) * (G - self.Q[state, action])

            # Greedy target policy, limited to actions that were actually offered
            greedy_action = self._greedy_action(state, available_actions)
            self.target_policy[state] = greedy_action

            # Earlier steps get zero weight once the behavior deviates from the target
            if action != greedy_action:
                break

            # Target policy is deterministic (probability 1 for the greedy action).
            # NOTE(review): the behavior probability is evaluated with the current Q,
            # not the Q that was in effect when the episode was generated.
            behavior_prob = self._behavior_probability(state, action, available_actions)
            if behavior_prob <= 0:
                break
            W *= 1.0 / behavior_prob

    def train(self, num_episodes=1000):
        """Train with off-policy Monte Carlo control and importance sampling.

        Args:
            num_episodes: number of episodes to generate.

        Returns:
            dict with ``'Q'``, ``'policy'`` (the greedy target policy),
            ``'history'`` (one record per processed episode) and
            ``'success_rate'`` (share of non-empty episodes; 0.0 when
            ``num_episodes`` is not positive).
        """
        self.history = []
        successful_episodes = 0

        for episode_num in range(num_episodes):
            try:
                episode = self.generate_episode()

                if len(episode) > 0:
                    successful_episodes += 1
                    self._update_from_episode(episode)

                    # Per-episode statistics
                    episode_reward = sum(r for _, _, r, _ in episode)
                    avg_q = np.mean(self.Q)

                    self.history.append({
                        'episode': episode_num + 1,
                        'reward': episode_reward,
                        'length': len(episode),
                        'avg_q': avg_q,
                        'avg_weight': np.mean(self.C[self.C > 0]) if np.any(self.C > 0) else 0,
                        'successful': True
                    })
                else:
                    self.history.append({
                        'episode': episode_num + 1,
                        'reward': 0.0,
                        'length': 0,
                        'avg_q': np.mean(self.Q),
                        'avg_weight': 0,
                        'successful': False
                    })

                if (episode_num + 1) % 200 == 0:
                    success_rate = successful_episodes / (episode_num + 1)
                    recent_rewards = [h['reward'] for h in self.history[-50:]]
                    avg_recent_reward = np.mean(recent_rewards) if recent_rewards else 0
                    print(f"[{self.name}] √âpisode {episode_num + 1}: "
                          f"Taux de succ√®s: {success_rate:.2f}, "
                          f"R√©compense r√©cente: {avg_recent_reward:.3f}")

            except Exception as e:
                # Best effort: report the failed episode and keep training
                print(f"‚ùå Erreur √©pisode {episode_num + 1}: {e}")

        return {
            'Q': self.Q,
            'policy': self.target_policy,
            'history': self.history,
            'success_rate': successful_episodes / num_episodes if num_episodes > 0 else 0.0
        }

print("üéØ Algorithmes Monte Carlo d√©finis avec succ√®s !")


In [None]:
# üìä Fonctions de visualisation et d'analyse

def plot_learning_curves(results_dict, title_prefix=""):
    """Draw a 2x2 figure of learning curves, one line per algorithm.

    Panels: smoothed reward per episode, mean Q-value, smoothed episode length,
    and the standard deviation of the rewards of the 100 preceding episodes.

    Args:
        results_dict: mapping ``alg_name -> result`` where ``result['history']``
            is a list of records with ``'episode'``, ``'reward'``, ``'avg_q'``
            and ``'length'`` keys.
        title_prefix: text placed in front of every panel title.
    """
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    reward_ax, q_ax, length_ax, stability_ax = axes.flatten()

    def smoothed(values):
        # Rolling mean whose window grows with the series (capped at 50);
        # series shorter than the window are returned untouched.
        span = min(50, len(values) // 10 + 1)
        if len(values) < span:
            return values
        return pd.Series(values).rolling(window=span, min_periods=1).mean()

    def draw_series(ax, field, smooth):
        # One curve per algorithm for the given history field.
        for idx, (label, outcome) in enumerate(results_dict.items()):
            records = outcome['history']
            xs = [rec['episode'] for rec in records]
            ys = [rec[field] for rec in records]
            if smooth:
                ys = smoothed(ys)
            ax.plot(xs, ys, label=label, color=palette[idx % len(palette)], linewidth=2)

    def decorate(ax, title, ylabel):
        # Shared title / labels / legend / grid styling.
        ax.set_title(f'{title_prefix} - {title}', fontsize=14, fontweight='bold')
        ax.set_xlabel('√âpisode')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # 1. Reward per episode (smoothed)
    draw_series(reward_ax, 'reward', smooth=True)
    decorate(reward_ax, 'R√©compenses par √âpisode', 'R√©compense')

    # 2. Mean Q-value (raw)
    draw_series(q_ax, 'avg_q', smooth=False)
    decorate(q_ax, '√âvolution des Q-values', 'Q-value Moyenne')

    # 3. Episode length (smoothed)
    draw_series(length_ax, 'length', smooth=True)
    decorate(length_ax, 'Longueur des √âpisodes', 'Nombre de Steps')

    # 4. Stability: std of the rewards of the `lookback` episodes preceding each point
    lookback = 100
    for idx, (label, outcome) in enumerate(results_dict.items()):
        records = outcome['history']
        xs = [rec['episode'] for rec in records[lookback:]]
        spreads = [
            np.std([rec['reward'] for rec in records[end - lookback:end]])
            for end in range(lookback, len(records))
        ]
        if xs:
            stability_ax.plot(xs, spreads, label=label,
                              color=palette[idx % len(palette)], linewidth=2)
    decorate(stability_ax, 'Stabilit√© (√âcart-type des r√©compenses)',
             '√âcart-type des r√©compenses')

    plt.tight_layout()
    plt.show()

def plot_performance_comparison(all_results):
    """Compare the final performance of every algorithm on every environment.

    Draws a 2x2 figure: a heatmap of final rewards, grouped bars of success
    rates, grouped bars of final mean Q-values, and a bar chart of a composite
    score (0.7 * mean final reward + 0.3 * mean success rate) per algorithm.

    Args:
        all_results: mapping ``env_name -> {alg_name -> result}``. Each result
            holds a ``'history'`` list (records with ``'reward'`` and
            ``'avg_q'``) and optionally ``'success_rate'``. The algorithm list
            is taken from the first environment; an algorithm missing from
            another environment is treated as having no history.

    Returns:
        None. Prints a notice and returns without plotting when there is
        nothing to compare.
    """
    env_names = list(all_results.keys())
    alg_names = list(next(iter(all_results.values())).keys()) if all_results else []
    if not env_names or not alg_names:
        print("Aucun resultat a comparer.")
        return

    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    empty_result = {'history': [], 'success_rate': 0}

    # Metric matrices, indexed [environment][algorithm]
    final_rewards = []
    success_rates = []
    avg_q_values = []

    for env_name in env_names:
        env_rewards = []
        env_success_rates = []
        env_avg_q = []

        for alg_name in alg_names:
            result = all_results[env_name].get(alg_name, empty_result)
            history = result['history']

            # Final reward: mean over the last 100 episodes
            # (the slice already yields the whole history when it is shorter).
            recent_rewards = [h['reward'] for h in history[-100:]]
            env_rewards.append(np.mean(recent_rewards) if recent_rewards else 0)
            env_success_rates.append(result.get('success_rate', 0))
            env_avg_q.append(history[-1]['avg_q'] if history else 0)

        final_rewards.append(env_rewards)
        success_rates.append(env_success_rates)
        avg_q_values.append(env_avg_q)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Heatmap of final rewards
    ax1 = axes[0, 0]
    im1 = ax1.imshow(final_rewards, cmap='RdYlGn', aspect='auto')
    ax1.set_xticks(range(len(alg_names)))
    ax1.set_xticklabels(alg_names, rotation=45)
    ax1.set_yticks(range(len(env_names)))
    ax1.set_yticklabels(env_names)
    ax1.set_title('R√©compenses Finales par Algorithme et Environnement')

    # Write each value inside its cell
    for i in range(len(env_names)):
        for j in range(len(alg_names)):
            ax1.text(j, i, f'{final_rewards[i][j]:.2f}', ha='center', va='center')

    plt.colorbar(im1, ax=ax1)

    # Grouped-bar geometry: keep each group within 0.8 of a unit and centre the
    # environment label under the middle of its group, whatever the number of
    # algorithms.
    n_algs = len(alg_names)
    x = np.arange(len(env_names))
    width = min(0.25, 0.8 / n_algs)
    tick_positions = x + width * (n_algs - 1) / 2

    # 2. Success rates
    ax2 = axes[0, 1]
    for i, alg_name in enumerate(alg_names):
        success_data = [success_rates[j][i] for j in range(len(env_names))]
        ax2.bar(x + i * width, success_data, width, label=alg_name, alpha=0.8)

    ax2.set_xlabel('Environnements')
    ax2.set_ylabel('Taux de Succ√®s')
    ax2.set_title('Taux de Succ√®s par Environnement et Algorithme')
    ax2.set_xticks(tick_positions)
    ax2.set_xticklabels(env_names)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Final mean Q-values
    ax3 = axes[1, 0]
    for i, alg_name in enumerate(alg_names):
        q_data = [avg_q_values[j][i] for j in range(len(env_names))]
        ax3.bar(x + i * width, q_data, width, label=alg_name, alpha=0.8)

    ax3.set_xlabel('Environnements')
    ax3.set_ylabel('Q-value Moyenne Finale')
    ax3.set_title('Q-values Finales par Environnement et Algorithme')
    ax3.set_xticks(tick_positions)
    ax3.set_xticklabels(env_names)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Composite score per algorithm (weighted mean across environments)
    ax4 = axes[1, 1]
    composite_scores = []
    for i in range(n_algs):
        alg_rewards = [final_rewards[j][i] for j in range(len(env_names))]
        alg_success = [success_rates[j][i] for j in range(len(env_names))]
        composite_scores.append(np.mean(alg_rewards) * 0.7 + np.mean(alg_success) * 0.3)

    bar_colors = [palette[i % len(palette)] for i in range(n_algs)]
    bars = ax4.bar(alg_names, composite_scores, color=bar_colors)
    ax4.set_title('Score Composite Global par Algorithme')
    ax4.set_ylabel('Score Composite')

    # Print the score above each bar
    for bar, score in zip(bars, composite_scores):
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height,
                f'{score:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

def analyze_algorithm_characteristics(all_results):
    """Print a per-environment, per-algorithm text report of training statistics.

    For every algorithm with a non-empty history the report lists the success
    rate, mean and standard deviation of rewards, the mean and standard
    deviation of rewards over the last 20% of episodes, and, when the history
    records carry them, the epsilon decay and the final mean importance weight.

    Args:
        all_results: mapping ``env_name -> {alg_name -> result}`` where
            ``result['history']`` is a list of per-episode records.
    """
    print("üîç ANALYSE D√âTAILL√âE DES ALGORITHMES")
    print("=" * 60)

    for env_name, env_results in all_results.items():
        print(f"\nüìä {env_name.upper()}")
        print("-" * 40)

        for alg_name, result in env_results.items():
            records = result['history']

            # Nothing to summarise for an algorithm that produced no history
            if len(records) == 0:
                print(f"\n‚ùå {alg_name}: Aucune donn√©e d'entra√Ænement")
                continue

            # Overall statistics
            n_success = len([rec for rec in records if rec['successful']])
            success_rate = n_success / len(records)

            rewards = [rec['reward'] for rec in records]
            avg_reward = np.mean(rewards)
            std_reward = np.std(rewards)

            # Convergence and stability over the last 20% of episodes
            tail_start = int(0.8 * len(records))
            final_rewards = [rec['reward'] for rec in records[tail_start:]]
            final_avg_reward = np.mean(final_rewards) if final_rewards else 0
            final_stability = np.std(final_rewards) if len(final_rewards) > 1 else 0

            report = [
                f"\nüéØ {alg_name}:",
                f"   ‚Ä¢ Taux de succ√®s: {success_rate:.1%}",
                f"   ‚Ä¢ R√©compense moyenne: {avg_reward:.3f} (¬±{std_reward:.3f})",
                f"   ‚Ä¢ Performance finale: {final_avg_reward:.3f}",
                f"   ‚Ä¢ Stabilit√© finale: {final_stability:.3f}",
            ]

            # Algorithm-specific extras, detected from the first record's keys
            first, last = records[0], records[-1]
            if 'epsilon' in first:
                initial_eps = first['epsilon']
                final_eps = last['epsilon']
                report.append(f"   ‚Ä¢ D√©croissance Œµ: {initial_eps:.3f} ‚Üí {final_eps:.3f}")

            if 'avg_weight' in first:
                final_weight = last['avg_weight']
                report.append(f"   ‚Ä¢ Poids moyen final: {final_weight:.3f}")

            print("\n".join(report))

print("üìä Fonctions de visualisation d√©finies !")


In [None]:
# üöÄ Entra√Ænement Principal - Tous les Algorithmes sur Tous les Environnements

def run_complete_analysis(num_episodes=1000):
    """Train every Monte Carlo algorithm on every secret environment.

    An adapter is created for each environment; environments whose adapter
    cannot be created are skipped. Each algorithm is then trained in turn on
    the environment's adapter. A failing training run is reported and recorded
    as ``{'history': [], 'success_rate': 0}`` so the remaining runs continue.

    Args:
        num_episodes: number of episodes passed to each algorithm's ``train``.

    Returns:
        dict ``env_name -> {alg_name -> result}`` with the algorithm keys
        ``'MC-ES'``, ``'On-Policy MC'`` and ``'Off-Policy MC'``.
    """
    # Environments to evaluate
    env_classes = {
        'SecretEnv0': SecretEnv0,
        'SecretEnv1': SecretEnv1,
        'SecretEnv2': SecretEnv2,
        'SecretEnv3': SecretEnv3
    }

    # (results key, display name, builder). The builders are lambdas so the
    # class lookup happens inside the per-algorithm try block below.
    algorithm_specs = [
        ('MC-ES', 'Monte Carlo ES',
         lambda adapter, env_name: SecretMonteCarloES(
             adapter, gamma=0.99, name=f"MC-ES-{env_name}")),
        ('On-Policy MC', 'On-Policy MC',
         lambda adapter, env_name: SecretOnPolicyMC(
             adapter, gamma=0.99, epsilon=0.3, name=f"OnPolicy-{env_name}")),
        ('Off-Policy MC', 'Off-Policy MC',
         lambda adapter, env_name: SecretOffPolicyMC(
             adapter, gamma=0.99, epsilon=0.4, name=f"OffPolicy-{env_name}")),
    ]

    n_envs = len(env_classes)
    n_algs = len(algorithm_specs)

    print("üöÄ D√âBUT DE L'ANALYSE COMPL√àTE")
    print("=" * 60)
    print(f"Param√®tres: {num_episodes} √©pisodes par algorithme")
    print(f"Total: {n_envs} environnements √ó {n_algs} algorithmes = {n_envs * n_algs} entra√Ænements")
    print("=" * 60)

    # Build the adapters; an environment that fails here is left out entirely
    adapters_dict = {}
    for env_name, env_class in env_classes.items():
        try:
            adapters_dict[env_name] = SecretEnvAdapter(env_class, env_name)
        except Exception as e:
            print(f"‚ùå Erreur cr√©ation adaptateur {env_name}: {e}")

    print(f"\n‚úÖ {len(adapters_dict)} adaptateurs cr√©√©s avec succ√®s")

    all_results = {}

    for env_name, adapter in adapters_dict.items():
        print(f"\nüéÆ ENVIRONNEMENT: {env_name}")
        print(f"√âtats: {adapter.nS}, Actions: {adapter.nA}")
        print("-" * 50)

        env_results = {}

        for alg_key, alg_label, build_agent in algorithm_specs:
            print(f"\nüéØ Entra√Ænement {alg_label}...")
            try:
                agent = build_agent(adapter, env_name)
                result = agent.train(num_episodes=num_episodes)
                env_results[alg_key] = result
                print(f"‚úÖ {alg_key} termin√© - Taux de succ√®s: {result['success_rate']:.2%}")
            except Exception as e:
                # Keep going with the other algorithms; record an empty result
                print(f"‚ùå Erreur {alg_key} sur {env_name}: {e}")
                env_results[alg_key] = {'history': [], 'success_rate': 0}

        all_results[env_name] = env_results

        # Short per-environment summary: mean reward over the last 50 episodes
        print(f"\nüìä R√âSUM√â {env_name}:")
        for alg_name, result in env_results.items():
            if result['history']:
                final_rewards = [h['reward'] for h in result['history'][-50:]]
                avg_final_reward = np.mean(final_rewards) if final_rewards else 0
                print(f"   ‚Ä¢ {alg_name}: R√©compense finale = {avg_final_reward:.3f}")
            else:
                print(f"   ‚Ä¢ {alg_name}: ‚ùå Aucun r√©sultat")

    print("\nüéâ ANALYSE COMPL√àTE TERMIN√âE !")
    print("=" * 60)

    return all_results

# Launch the full analysis (can take several minutes)
print(
    "‚è≥ Lancement de l'analyse compl√®te...",
    "Cela peut prendre plusieurs minutes selon la complexit√© des environnements...",
    sep="\n",
)

# Moderate episode budget for a test run; adjust to the time available
EPISODES = 800

all_results = run_complete_analysis(num_episodes=EPISODES)


In [None]:
# üìà Affichage des R√©sultats et Analyses Compl√®tes

print("üìà G√âN√âRATION DES ANALYSES VISUELLES")
print("=" * 60)


def _final_window_rewards(history, window=100):
    """Rewards of the last `window` records (the whole history when it is shorter)."""
    return [h['reward'] for h in history[-window:]]


def _composite_score(result):
    """Composite score: 0.7 * mean final-window reward + 0.3 * success rate."""
    final_rewards = _final_window_rewards(result['history'])
    avg_reward = np.mean(final_rewards) if final_rewards else 0
    return avg_reward * 0.7 + result['success_rate'] * 0.3


# Only report when at least one environment produced a result dictionary
if all_results and any(env_results for env_results in all_results.values()):

    # 1. Learning curves for each environment
    print("\nüéØ 1. COURBES D'APPRENTISSAGE PAR ENVIRONNEMENT")
    print("-" * 50)

    for env_name, env_results in all_results.items():
        if any(result['history'] for result in env_results.values()):
            print(f"\nüìä Graphiques pour {env_name}...")
            plot_learning_curves(env_results, title_prefix=env_name)
        else:
            print(f"‚ùå Pas de donn√©es valides pour {env_name}")

    # 2. Cross-environment performance comparison
    print("\nüéØ 2. COMPARAISON GLOBALE DES PERFORMANCES")
    print("-" * 50)
    plot_performance_comparison(all_results)

    # 3. Detailed per-algorithm characteristics
    print("\nüéØ 3. ANALYSE D√âTAILL√âE")
    print("-" * 50)
    analyze_algorithm_characteristics(all_results)

    # 4. Final summary table
    print("\nüéØ 4. TABLEAU R√âCAPITULATIF FINAL")
    print("=" * 60)

    # One row per (environment, algorithm); values are pre-formatted strings
    summary_data = []

    for env_name, env_results in all_results.items():
        for alg_name, result in env_results.items():
            history = result['history']
            if history:
                final_rewards = _final_window_rewards(history)

                summary_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Taux_Succ√®s': f"{result['success_rate']:.1%}",
                    'R√©compense_Finale': f"{np.mean(final_rewards):.3f}",
                    'Stabilit√©': f"{np.std(final_rewards):.3f}",
                    '√âpisodes_Total': len(history),
                    'Q_Moyenne_Finale': f"{history[-1]['avg_q']:.3f}"
                })
            else:
                # Placeholder row for an algorithm that produced no history
                summary_data.append({
                    'Environnement': env_name,
                    'Algorithme': alg_name,
                    'Taux_Succ√®s': "0.0%",
                    'R√©compense_Finale': "0.000",
                    'Stabilit√©': "N/A",
                    '√âpisodes_Total': 0,
                    'Q_Moyenne_Finale': "0.000"
                })

    summary_df = pd.DataFrame(summary_data)
    print("\nüìã R√©sultats par Algorithme et Environnement:")
    print(summary_df.to_string(index=False))

    # 5. Recommendations and conclusions
    print("\nüéØ 5. RECOMMANDATIONS ET CONCLUSIONS")
    print("=" * 60)

    # Best algorithm per environment and per-algorithm scores across
    # environments, both based on the composite score. Algorithms are listed
    # in the order they first appear in the results.
    best_performers = {}
    alg_global_scores = {}

    for env_name, env_results in all_results.items():
        best_alg = None
        best_score = -float('inf')

        for alg_name, result in env_results.items():
            scores = alg_global_scores.setdefault(alg_name, [])
            if not result['history']:
                continue

            composite_score = _composite_score(result)
            scores.append(composite_score)

            if composite_score > best_score:
                best_score = composite_score
                best_alg = alg_name

        best_performers[env_name] = (best_alg, best_score)

    print("\nüèÜ MEILLEURS ALGORITHMES PAR ENVIRONNEMENT:")
    for env_name, (best_alg, score) in best_performers.items():
        if best_alg:
            print(f"   ‚Ä¢ {env_name}: {best_alg} (Score: {score:.3f})")
        else:
            print(f"   ‚Ä¢ {env_name}: Aucun algorithme efficace")

    print("\nüåü PERFORMANCE GLOBALE DES ALGORITHMES:")
    for alg_name, scores in alg_global_scores.items():
        if scores:
            avg_score = np.mean(scores)
            std_score = np.std(scores)
            print(f"   ‚Ä¢ {alg_name}: {avg_score:.3f} (¬±{std_score:.3f})")
        else:
            print(f"   ‚Ä¢ {alg_name}: Aucune donn√©e valide")

    # Static recommendations
    print("\nüí° RECOMMANDATIONS:")
    print("   1. üéØ Chaque environnement secret semble avoir des caract√©ristiques uniques")
    print("   2. üîÑ L'exploration est cruciale - MC-ES peut √™tre avantag√©")
    print("   3. üìä Surveillez les taux de succ√®s autant que les r√©compenses")
    print("   4. ‚öñÔ∏è  L'importance sampling (Off-Policy) peut √™tre instable sur certains environnements")
    print("   5. üéõÔ∏è  L'ajustement des hyperparam√®tres (Œµ, Œ≥) est critique")

    # Persist the summary table next to the notebook
    try:
        summary_df.to_csv('secret_env_monte_carlo_results.csv', index=False)
        print("\nüíæ R√©sultats sauvegard√©s dans 'secret_env_monte_carlo_results.csv'")
    except Exception as e:
        print(f"‚ùå Erreur lors de la sauvegarde: {e}")

else:
    print("‚ùå AUCUN R√âSULTAT VALIDE TROUV√â")
    print("V√©rifiez que les environnements secrets sont accessibles et fonctionnels.")

print("\nüéâ ANALYSE MONTE CARLO TERMIN√âE !")
print("üïµÔ∏è Les myst√®res des environnements secrets ont √©t√© explor√©s par Monte Carlo !")
print("=" * 60)
