In [95]:
import sys
import gymnasium as gym
import os
import time
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import joblib
from joblib import dump, load
import xgboost as xgb
import matplotlib.animation as animation
from typing import Dict, Any
from copy import deepcopy
import random 
from sklearn.base import BaseEstimator
from torch import nn
import torch
from torch.distributions.categorical import Categorical
from sklearn.utils import gen_batches
from collections import deque
import pickle
from torch.optim import Adam
from copy import deepcopy
from torch.cuda.amp import autocast, GradScaler

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))
sys.path.append(os.path.abspath('..'))

# Import the BaseAgent class
from src.agents.base_agent import BaseAgent
from initial_windfields import get_initial_windfield, INITIAL_WINDFIELDS
from src.env_sailing import SailingEnv
from src.test_agent_validity import validate_agent, load_agent_class
from src.evaluation import evaluate_agent, visualize_trajectory
from src.utils.agent_utils import save_qdn_agent

# Environment parameters
# Reference environment built from the 'simple_static' initial windfield;
# it is used here to read the size of the discrete action space.
env = SailingEnv(**get_initial_windfield('simple_static'))
n_actions = env.action_space.n  # number of discrete actions exposed by the environment
# Presumably the length of the flattened observation vector — TODO confirm
# against env.observation_space (the agents below hard-code the same value).
d_s = 2054

### Define the playing function

In [95]:
def make_animation(imgs):
  """
  Build a matplotlib animation that plays a sequence of images.

  Parameters
  ----------
  imgs: list of (height, width, 3) np arrays
    frames to display, in playback order

  Return
  -------
  ani: animation
    an ArtistAnimation showing one image per frame (200 ms apart, no repeat)
  """
  figure, axes = plt.subplots()
  frames = []
  for index, image in enumerate(imgs):
    # Each animation frame is a list holding a single image artist.
    frames.append([axes.imshow(image)])
    if index == 0:
      # Draw the first image once more as a static artist so that
      # something is shown before the animation starts.
      axes.imshow(image)
  # Close the figure so the notebook does not also render it as a static plot.
  plt.close()
  return animation.ArtistAnimation(figure, frames, interval=200, blit=True,
                                   repeat=False)

In [96]:
def play_policy(env, pi, horizon=200, capture_rate=1):
  """
  Roll out one episode of policy `pi` in `env` and return it as an animation.

  Parameters
  ----------
  env: environment exposing reset(), step(action) and render()
  pi: callable mapping an observation to an action
  horizon: maximum number of environment steps
  capture_rate: a frame is rendered on every step whose index is a
    multiple of this value

  Return
  -------
  animation built by make_animation from the rendered frames
  """
  observation, _ = env.reset()
  action = pi(observation)
  # The first frame is the freshly reset environment.
  frames = [env.render()]
  for step in range(horizon):
    observation, _reward, terminated, truncated, _ = env.step(action)
    # The next action is chosen before rendering, as in the step order above.
    action = pi(observation)
    if step % capture_rate == 0:
      frames.append(env.render())
    if terminated or truncated:
      break
  return make_animation(frames)

In [97]:
# List every registered initial windfield: its name, then its parameter dict.
for initial_windfield_name in INITIAL_WINDFIELDS:
    initial_windfield = INITIAL_WINDFIELDS[initial_windfield_name]
    print(initial_windfield_name, initial_windfield, sep="\n")

training_1
{'wind_init_params': {'base_speed': 3.0, 'base_direction': (-0.8, -0.2), 'pattern_scale': 32, 'pattern_strength': 0.3, 'strength_variation': 0.4, 'noise': 0.1}, 'wind_evol_params': {'wind_change_prob': 1.0, 'pattern_scale': 128, 'perturbation_angle_amplitude': 0.1, 'perturbation_strength_amplitude': 0.1, 'rotation_bias': 0.02, 'bias_strength': 1.0}, 'env_params': {'wind_grid_density': 25, 'wind_arrow_scale': 80, 'render_mode': 'rgb_array'}}
training_2
{'wind_init_params': {'base_speed': 3.0, 'base_direction': (-0.2, 0.8), 'pattern_scale': 128, 'pattern_strength': 0.6, 'strength_variation': 0.3, 'noise': 0.1}, 'wind_evol_params': {'wind_change_prob': 1.0, 'pattern_scale': 128, 'perturbation_angle_amplitude': 0.1, 'perturbation_strength_amplitude': 0.1, 'rotation_bias': 0.02, 'bias_strength': 1.0}, 'env_params': {'wind_grid_density': 25, 'wind_arrow_scale': 80, 'render_mode': 'rgb_array'}}
training_3
{'wind_init_params': {'base_speed': 3.0, 'base_direction': (0.2, -0.8), 'patt

## Fitted Q iteration

>Implement fitted Q-iteration (FQI) with a random forest regressor, using a uniformly random exploration policy $\pi$ to collect the dataset.

In [5]:
# Collect a dataset: this is done by FQI.collect_dataset(), which trainFQI() calls (see the next cell).



In [6]:
class FQI(BaseAgent):
    """Fitted Q-Iteration (FQI) agent backed by a random forest regressor.

    The regressor approximates Q(s, a) from rows ``[state, action]``. Training
    collects transitions with a uniformly random policy, then repeatedly
    refits the regressor on the Bellman targets
    ``r + gamma * (1 - done) * max_a' Q(s', a')``.

    Attributes:
        d_s: length of a state vector inside a transition row.
        gamma: discount factor.
        n_iterations: number of fitted-Q refits.
        n_actions: number of discrete actions.
        epsilon: probability of taking a random action in ``act``.
        model: the Q-function regressor.
        pi: greedy policy callable once trained or loaded, otherwise ``None``.
        data: the last collected transition array, or ``None``.
        np_random: generator used for every random draw; replaced by ``seed``.
    """

    def __init__(self):
        super().__init__()
        self.d_s = 2054
        self.gamma = 0.99
        self.n_iterations = 100
        self.n_actions = 9
        self.epsilon = 0.1
        self.model = RandomForestRegressor()
        self.pi = None
        self.data = None
        # All sampling goes through this generator so that seed() actually
        # controls the agent's randomness.
        self.np_random = np.random.default_rng()

    def collect_dataset(self, n=10000):
        """Collect about ``n`` transitions with a uniformly random policy.

        The budget is split evenly across every entry of INITIAL_WINDFIELDS.

        Args:
            n: total number of transitions to collect.

        Returns:
            Array with one row per transition, laid out as
            ``[state, action, reward, done, next_state]``.
        """
        windfield_names = list(INITIAL_WINDFIELDS)
        if not windfield_names:
            raise ValueError("INITIAL_WINDFIELDS is empty: nothing to collect data from")
        steps_per_windfield = n // len(windfield_names)

        data = []
        for windfield_name in windfield_names:
            env = SailingEnv(**get_initial_windfield(windfield_name))
            s, _ = env.reset()
            for _ in range(steps_per_windfield):
                a = int(self.np_random.integers(self.n_actions))
                s2, r, done, trunc, _ = env.step(a)
                data.append(s.tolist() + [a, r, done] + s2.tolist())
                if done or trunc:
                    s, _ = env.reset()
                else:
                    s = s2
        return np.array(data)

    def act(self, observation: np.ndarray) -> int:
        """Return an epsilon-greedy action for ``observation``.

        An untrained agent reports it and falls back to a uniformly random
        action, so callers always receive a valid action index.
        """
        if self.pi is None:
            print("The Agent has not been trained")
            return int(self.np_random.integers(self.n_actions))
        if self.np_random.random() < self.epsilon:
            return int(self.np_random.integers(self.n_actions))
        return int(self.pi(observation))

    def trained_policy(self, observation):
        """Return the action with the highest predicted Q-value.

        All actions are scored in a single ``predict`` call.
        """
        flat_observation = np.asarray(observation).reshape(1, -1)
        X = np.column_stack([
            np.repeat(flat_observation, self.n_actions, axis=0),
            np.arange(self.n_actions),
        ])
        return int(np.argmax(self.model.predict(X)))

    def trainFQI(self):
        """Collect a dataset, run fitted Q-iteration, then save the agent.

        Raises:
            ValueError: if the collected rows do not match the layout implied
                by ``d_s`` (``2 * d_s + 3`` columns).
        """
        self.data = self.collect_dataset()
        d = self.d_s
        if self.data.ndim != 2 or self.data.shape[1] != 2 * d + 3:
            raise ValueError(
                f"Unexpected dataset shape {self.data.shape}; "
                f"expected (n, {2 * d + 3}) for d_s={d}"
            )
        n = len(self.data)
        X = self.data[:, :d + 1]          # [state, action]
        rewards = self.data[:, d + 1]
        dones = self.data[:, d + 2]

        # Iteration 0: regress the immediate reward.
        self.model.fit(X, rewards)

        # Reusable [next_state, action] buffer: only the action column changes
        # between predictions, so the state part is copied once.
        next_inputs = np.empty((n, d + 1), dtype=self.data.dtype)
        next_inputs[:, :d] = self.data[:, d + 3:]

        for _ in tqdm(range(self.n_iterations)):
            q_max = None
            for a in range(self.n_actions):
                next_inputs[:, d] = a
                q_a = self.model.predict(next_inputs)
                q_max = q_a if q_max is None else np.maximum(q_max, q_a)
            Y = rewards + self.gamma * (1 - dones) * q_max
            self.model.fit(X, Y)

        # A bound method (unlike a lambda) does not break pickling.
        self.pi = self.trained_policy
        self.save(path="models/FQI_trained.pkl")

    def reset(self) -> None:
        """Reset the agent."""
        pass  # Nothing to reset in this simple agent

    def seed(self, seed: int = None) -> None:
        """Set the random seed."""
        self.np_random = np.random.default_rng(seed)

    def save(self, path):
        """Save the model to a file.

        The policy is not pickled: it is fully determined by the model, so
        only a ``trained`` flag is stored and ``load`` rebuilds the policy.
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump({
                'model': self.model,
                'trained': self.pi is not None
            }, f)

    def load(self, path):
        """Load the model from a file and rebuild the greedy policy.

        NOTE: pickle can execute arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.model = data['model']
        # Files written by the earlier format carry a 'pi' entry instead of 'trained'.
        trained = data.get('trained', data.get('pi') is not None)
        self.pi = self.trained_policy if trained else None


# Build the FQI agent and run the full training: trainFQI() collects its own
# dataset, runs the fitted-Q iterations and saves to models/FQI_trained.pkl.
# The progress bar recorded below shows this run taking roughly 34 hours.
agent = FQI()
agent.trainFQI()

100%|██████████| 100/100 [33:55:12<00:00, 1221.13s/it]  


In [8]:
# Windfields to evaluate on, plus the settings shared by every evaluation run
TRAINING_INITIAL_WINDFIELDS = ["simple_static", "training_1", "training_2", "training_3"]
ALL_SEEDS = list(range(42, 47))   # seeds 42..46, used for all evaluations
ALL_MAX_HORIZON = 200             # cap on the number of steps per episode

# Evaluate only when an agent exists in this namespace
if 'agent' in locals():
    print(f"Evaluating agent on {len(TRAINING_INITIAL_WINDFIELDS)} training initial windfields...")

    # Results keyed by initial windfield name
    all_results = {}

    for initial_windfield_name in TRAINING_INITIAL_WINDFIELDS:
        print(f"\nInitial windfield: {initial_windfield_name}")

        initial_windfield = get_initial_windfield(initial_windfield_name)

        # Quiet, non-rendering evaluation over all seeds
        results = evaluate_agent(
            agent=agent,
            initial_windfield=initial_windfield,
            seeds=ALL_SEEDS,
            max_horizon=ALL_MAX_HORIZON,
            verbose=False,
            render=False,
            full_trajectory=False,
        )
        all_results[initial_windfield_name] = results

        # Per-windfield summary
        print(f"  Success Rate: {results['success_rate']:.2%}")
        print(f"  Mean Reward: {results['mean_reward']:.2f}")
        print(f"  Mean Steps: {results['mean_steps']:.1f}")

    # Overall score: unweighted mean of the per-windfield success rates
    total_success = sum(r['success_rate'] for r in all_results.values()) / len(all_results)
    print("\n" + "="*50)
    print(f"OVERALL SUCCESS RATE: {total_success:.2%}")
    print("="*50)

Evaluating agent on 4 training initial windfields...

Initial windfield: simple_static
  Success Rate: 0.00%
  Mean Reward: 0.00
  Mean Steps: 200.0

Initial windfield: training_1
  Success Rate: 20.00%
  Mean Reward: 7.70
  Mean Steps: 179.2

Initial windfield: training_2
  Success Rate: 0.00%
  Mean Reward: 0.00
  Mean Steps: 200.0

Initial windfield: training_3
  Success Rate: 20.00%
  Mean Reward: 3.12
  Mean Steps: 197.2

OVERALL SUCCESS RATE: 10.00%


## DQN

In [111]:
class DQNNetwork(nn.Module):
    """Fully connected Q-network mapping a state vector to one value per action.

    Six linear layers (input -> 2032 -> 2032 -> 1016 -> 508 -> 128 -> output)
    with ReLU activations; dropout (p=0.2) is applied after the second and
    fifth hidden layers.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in forward order so that parameter registration
        # (and therefore state_dict keys and seeded initialisation) is stable.
        self.fc1 = nn.Linear(input_dim, 2032)
        self.fc2 = nn.Linear(2032, 2032)
        self.fc3 = nn.Linear(2032, 1016)
        self.fc4 = nn.Linear(1016, 508)
        self.fc5 = nn.Linear(508, 128)
        self.fc6 = nn.Linear(128, output_dim)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return the Q-values for a batch of states ``x``."""
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        hidden = self.dropout(hidden)
        for layer in (self.fc3, self.fc4, self.fc5):
            hidden = torch.relu(layer(hidden))
        return self.fc6(self.dropout(hidden))

class DQNAgent(BaseAgent):
    """DQN agent with a replay buffer and a softly updated target network.

    Q-values come from ``q_network``; TD targets come from ``target_network``,
    which tracks the online network with rate ``tau`` after every gradient
    step. Dropout is active only while fitting: action selection and target
    evaluation run in eval mode so that greedy actions are deterministic.
    """

    def __init__(self):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.d_s = 2054  # State dimension
        self.n_actions = 9  # Number of actions
        self.pi = lambda x: np.random.randint(0, self.n_actions)
        self.PI = None
        self.capacity = 2000
        self.batch_size = 200
        self.eps = 0.2  # Initial epsilon used for exploration
        self.gamma = 0.99
        self.C = 20
        self.n_iterations = 10000
        self.nb_gradient_steps = 5
        self.target_update_interval = 5
        self.tau = 0.005  # Soft-update rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        # Online and target Q-networks
        self.q_network = DQNNetwork(self.d_s, self.n_actions).to(self.device)
        self.target_network = deepcopy(self.q_network).to(self.device)
        # The target network is never trained directly: keep dropout disabled
        # so the TD targets are not randomly perturbed.
        self.target_network.eval()
        # NOTE(review): lr=0.01 is high for Adam on a network of this size —
        # consider lowering it if training diverges.
        self.optimizer = Adam(self.q_network.parameters(), lr=0.01)

        # Replay buffer
        self.buffer = deque(maxlen=self.capacity)
        self.memory = []
        self.criterion = torch.nn.SmoothL1Loss()  # Huber-style loss for DQN
        self.env = SailingEnv(**get_initial_windfield("training_1"))
        self.reset()
        self.episode = 0

    def trainDQN(self, path):
        """Train on every initial windfield in turn, checkpointing to ``path``.

        For each windfield the agent acts fully at random for the first
        ``epsilon_warmup`` steps, then epsilon decays geometrically down to
        ``epsilon_min``. A checkpoint is written after each windfield.
        """
        epsilon = self.eps
        epsilon_warmup = 1000  # Steps of pure exploration before epsilon starts decaying

        for name in INITIAL_WINDFIELDS:
            self.env = SailingEnv(**get_initial_windfield(name))
            state, _ = self.env.reset()

            for t in tqdm(range(self.n_iterations)):
                # Stronger exploration at the start
                if t < epsilon_warmup:
                    epsilon = 1.0
                else:
                    epsilon = max(self.epsilon_min, epsilon * self.epsilon_decay)

                # Epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.choice(self.n_actions)  # Exploration
                else:
                    action = self.greedy_action(state)  # Exploitation

                next_state, reward, done, trunc, _ = self.env.step(action)

                # Store the transition in the replay buffer
                self.buffer.append((state, action, reward, done, next_state))

                for _ in range(self.nb_gradient_steps):
                    self.gradient_step(t)

                # Start a new episode when the current one is over
                if done or trunc:
                    state = self.env.reset()[0]
                    self.episode += 1
                else:
                    state = next_state

                self.s = state.copy()  # Keep track of the current state

            # Expose the greedy policy and checkpoint after each windfield
            self.PI = self.definePI()
            self.save(path)

    def _best_action(self, state):
        """Return argmax_a Q(state, a) with dropout off and no autograd graph."""
        state_tensor = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        was_training = self.q_network.training
        self.q_network.eval()
        try:
            with torch.no_grad():
                q_values = self.q_network(state_tensor)
        finally:
            # Restore whichever mode the network was in before the call.
            self.q_network.train(was_training)
        return int(q_values.argmax(1).item())

    def definePI(self):
        """Return a callable mapping a state to the greedy action."""
        def pi_function(state):
            return self._best_action(state)
        return pi_function

    def act(self, state):
        """Return an epsilon-greedy action using the agent-level ``epsilon``."""
        if random.random() < self.epsilon:
            # Exploration: pick a random action
            return np.random.choice(self.n_actions)
        # Exploitation: pick the action with the highest Q-value
        return self._best_action(state)

    def greedy_action(self, state):
        """Return the action with the highest Q-value for ``state``."""
        return self._best_action(state)

    def gradient_step(self, t):
        """Run one optimisation step on a random minibatch from the buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. ``t`` is accepted for interface compatibility and unused.
        """
        if len(self.buffer) < self.batch_size:
            return  # Not enough transitions yet

        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Stack into numpy arrays first: building a tensor straight from a
        # sequence of ndarrays is far slower.
        states = torch.as_tensor(np.array(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.array(actions), dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(np.array(dones), dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32, device=self.device)

        # Q-values of the taken actions (dropout active while fitting)
        self.q_network.train()
        q_values = self.q_network(states)
        q_value = q_values.gather(1, actions.unsqueeze(1))

        # Bootstrapped targets from the target network
        with torch.no_grad():
            next_q_values = self.target_network(next_states)
            next_q_value = next_q_values.max(1)[0]
            target_q_value = rewards + (1 - dones) * self.gamma * next_q_value

        loss = self.criterion(q_value.squeeze(1), target_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft update of the target network
        self.soft_update()

    def soft_update(self):
        """Move the target network towards the online network by ``tau``."""
        with torch.no_grad():
            for target_param, param in zip(self.target_network.parameters(), self.q_network.parameters()):
                target_param.copy_((1.0 - self.tau) * target_param + self.tau * param)

    def reset(self):
        """Decay the agent-level epsilon and reset the internal environment."""
        if not hasattr(self, 'epsilon'):
            self.epsilon = self.eps
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.env.reset()

    def save(self, filename):
        """Write network, optimizer, epsilon and episode count to ``filename``."""
        if isinstance(filename, (str, os.PathLike)):
            directory = os.path.dirname(filename)
            if directory:
                os.makedirs(directory, exist_ok=True)
        torch.save({
            'model_state_dict': self.q_network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'episode': self.episode
        }, filename)

    def load(self, filename):
        """Restore a checkpoint written by ``save``.

        Tensors are mapped onto this agent's device, so a checkpoint saved on
        GPU also loads on a CPU-only machine. The target network is
        resynchronised and the greedy policy ``PI`` is rebuilt.
        """
        checkpoint = torch.load(filename, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['model_state_dict'])
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.episode = checkpoint['episode']
        self.PI = self.definePI()

    def seed(self, seed_value):
        """Seed the Python, NumPy and PyTorch random number generators."""
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed_value)

# Build the DQN agent and train it; trainDQN() writes a checkpoint to the
# given path, which is then read back into the same agent.
agent = DQNAgent()
agent.trainDQN(path="models/DQN_trained.pth")
agent.load(filename="models/DQN_trained.pth")

  3%|▎         | 295/10000 [00:42<1:11:40,  2.26it/s]

In [110]:
# Windfields to evaluate on, plus the settings shared by every evaluation run
TRAINING_INITIAL_WINDFIELDS = ["simple_static", "training_1", "training_2", "training_3"]
ALL_SEEDS = list(range(42, 47))   # seeds 42..46, used for all evaluations
ALL_MAX_HORIZON = 200             # cap on the number of steps per episode

# Evaluate only when an agent exists in this namespace
if 'agent' not in locals():
    print("Agent non chargé. Assurez-vous d'avoir chargé l'agent avant d'exécuter l'évaluation.")
else:
    print(f"Évaluation de l'agent sur {len(TRAINING_INITIAL_WINDFIELDS)} champs de vent initiaux...")

    # Results keyed by initial windfield name
    all_results = {}

    for initial_windfield_name in TRAINING_INITIAL_WINDFIELDS:
        print(f"\nChamp de vent initial : {initial_windfield_name}")

        initial_windfield = get_initial_windfield(initial_windfield_name)

        # Quiet, non-rendering evaluation of the agent over all seeds
        results = evaluate_agent(
            agent=agent,
            initial_windfield=initial_windfield,
            seeds=ALL_SEEDS,
            max_horizon=ALL_MAX_HORIZON,
            verbose=False,
            render=False,
            full_trajectory=False,
        )
        all_results[initial_windfield_name] = results

        # Per-windfield summary
        print(f"  Taux de succès : {results['success_rate']:.2%}")
        print(f"  Récompense moyenne : {results['mean_reward']:.2f}")
        print(f"  Nombre moyen de pas : {results['mean_steps']:.1f}")

    # Overall score: unweighted mean of the per-windfield success rates
    total_success = sum(r['success_rate'] for r in all_results.values()) / len(all_results)
    print("\n" + "="*50)
    print(f"Taux de succès GLOBAL : {total_success:.2%}")
    print("="*50)


Évaluation de l'agent sur 4 champs de vent initiaux...

Champ de vent initial : simple_static
  Taux de succès : 0.00%
  Récompense moyenne : 0.00
  Nombre moyen de pas : 200.0

Champ de vent initial : training_1
  Taux de succès : 0.00%
  Récompense moyenne : 0.00
  Nombre moyen de pas : 200.0

Champ de vent initial : training_2
  Taux de succès : 0.00%
  Récompense moyenne : 0.00
  Nombre moyen de pas : 200.0

Champ de vent initial : training_3
  Taux de succès : 0.00%
  Récompense moyenne : 0.00
  Nombre moyen de pas : 200.0

Taux de succès GLOBAL : 0.00%
