In [3]:
import ctypes
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [4]:
class Environnement:
    """Abstract interface of a tabular environment.

    Every method here raises ``NotImplementedError``; concrete environments
    override all four.
    """

    def nombre_etats(self):
        """Return the number of states. Must be overridden."""
        raise NotImplementedError()

    def nombre_actions(self):
        """Return the number of actions. Must be overridden."""
        raise NotImplementedError()

    def recompense(self, etat, action, prochain_etat):
        """Return the reward for the transition (etat, action, prochain_etat). Must be overridden."""
        raise NotImplementedError()

    def probabilite_transition(self, etat, action, prochain_etat):
        """Return the probability of reaching prochain_etat from etat under action. Must be overridden."""
        raise NotImplementedError()


In [5]:
class SecretEnv0(Environnement):
    """ctypes wrapper around the ``secret_env_0_*`` functions of the native library.

    Construction loads the shared library, declares the C prototypes used below,
    reads the number of states/actions and builds the transition table ``self.P``.
    """

    # Library location used when the caller does not pass ``dll_path``.
    # This is a machine-specific absolute path; pass ``dll_path`` to run elsewhere.
    DEFAULT_DLL_PATH = r'C:\Users\rabia\OneDrive\Documents\iabd\deep renfo\[Projet] Secret envs 0_1_ 2 and 3 Python wrapper 2024_06_18 (with intel Mac architecture)\libs\secret_envs.dll'

    def __init__(self, dll_path=None):
        """Load the library and cache the environment model.

        Args:
            dll_path: Path of the shared library to load. ``None`` (the default)
                falls back to ``DEFAULT_DLL_PATH``.

        Raises:
            OSError: Raised by ctypes when the library cannot be loaded.
        """
        if dll_path is None:
            dll_path = self.DEFAULT_DLL_PATH
        self.lib = ctypes.cdll.LoadLibrary(dll_path)

        # NOTE(review): these prototypes cannot be checked from Python; confirm they
        # match the native exports (argument count and types) of the library in use.
        self.lib.secret_env_0_num_states.argtypes = []
        self.lib.secret_env_0_num_states.restype = ctypes.c_size_t

        self.lib.secret_env_0_num_actions.argtypes = []
        self.lib.secret_env_0_num_actions.restype = ctypes.c_size_t

        self.lib.secret_env_0_reward.argtypes = [ctypes.c_size_t, ctypes.c_size_t, ctypes.c_size_t]
        self.lib.secret_env_0_reward.restype = ctypes.c_float

        self.lib.secret_env_0_transition_probability.argtypes = [ctypes.c_size_t, ctypes.c_size_t, ctypes.c_size_t]
        self.lib.secret_env_0_transition_probability.restype = ctypes.c_float

        # Number of states / actions reported by the library.
        self.nS = self.lib.secret_env_0_num_states()
        self.nA = self.lib.secret_env_0_num_actions()

        # P[s][a] -> list of (prob, next_state, reward, done) tuples.
        self.P = self._construire_probabilites_transition()

    def _construire_probabilites_transition(self):
        """Build the transition table by querying the library for every (s, a, s') triple.

        Returns:
            dict: ``P[s][a]`` is a list of ``(prob, prochain_s, recompense, termine)``
            tuples, one per successor state whose probability is strictly positive.
            Every (state, action) pair has an entry, possibly an empty list.
        """
        # The comprehensions must be nested: a flat ``for s ... for a ...`` dict
        # comprehension would rebind P[s] on every action and keep only the last one,
        # making P[s][a] a KeyError for all other actions.
        P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}

        transition = self.lib.secret_env_0_transition_probability
        reward = self.lib.secret_env_0_reward
        for s in range(self.nS):
            for a in range(self.nA):
                for prochain_s in range(self.nS):
                    prob = transition(s, a, prochain_s)
                    if prob > 0:
                        recompense = reward(s, a, prochain_s)
                        termine = False  # Adjust if the environment has terminal states
                        P[s][a].append((prob, prochain_s, recompense, termine))
        return P

    def nombre_etats(self):
        """Return the number of states read from the library at construction."""
        return self.nS

    def nombre_actions(self):
        """Return the number of actions read from the library at construction."""
        return self.nA

    def recompense(self, etat, action, prochain_etat):
        """Return ``secret_env_0_reward(etat, action, prochain_etat)`` from the library."""
        return self.lib.secret_env_0_reward(etat, action, prochain_etat)

    def probabilite_transition(self, etat, action, prochain_etat):
        """Return ``secret_env_0_transition_probability(etat, action, prochain_etat)`` from the library."""
        return self.lib.secret_env_0_transition_probability(etat, action, prochain_etat)

In [None]:
class SecretEnv1(Environnement):
    """ctypes wrapper around the ``secret_env_1_*`` functions of the native library.

    Construction loads the shared library, declares the C prototypes used by the
    dynamic-programming accessors, reads the number of states/actions and builds
    the transition table ``self.P``.
    """

    # Library location used when the caller does not pass ``dll_path``.
    # This is a machine-specific absolute path; pass ``dll_path`` to run elsewhere.
    DEFAULT_DLL_PATH = r'C:\Users\rabia\OneDrive\Documents\iabd\deep renfo\[Projet] Secret envs 0_1_ 2 and 3 Python wrapper 2024_06_18 (with intel Mac architecture)\libs\secret_envs.dll'

    def __init__(self, dll_path=None):
        """Load the library and cache the environment model.

        Args:
            dll_path: Path of the shared library to load. ``None`` (the default)
                falls back to ``DEFAULT_DLL_PATH``.

        Raises:
            OSError: Raised by ctypes when the library cannot be loaded.
        """
        if dll_path is None:
            dll_path = self.DEFAULT_DLL_PATH
        self.lib = ctypes.cdll.LoadLibrary(dll_path)

        # NOTE(review): these prototypes cannot be checked from Python; confirm they
        # match the native exports (argument count and types) of the library in use.
        self.lib.secret_env_1_num_states.argtypes = []
        self.lib.secret_env_1_num_states.restype = ctypes.c_size_t

        self.lib.secret_env_1_num_actions.argtypes = []
        self.lib.secret_env_1_num_actions.restype = ctypes.c_size_t

        self.lib.secret_env_1_reward.argtypes = [ctypes.c_size_t, ctypes.c_size_t, ctypes.c_size_t]
        self.lib.secret_env_1_reward.restype = ctypes.c_float

        self.lib.secret_env_1_transition_probability.argtypes = [ctypes.c_size_t, ctypes.c_size_t, ctypes.c_size_t]
        self.lib.secret_env_1_transition_probability.restype = ctypes.c_float

        # Number of states / actions reported by the library.
        self.nS = self.lib.secret_env_1_num_states()
        self.nA = self.lib.secret_env_1_num_actions()

        # P[s][a] -> list of (prob, next_state, reward, done) tuples.
        self.P = self._construire_probabilites_transition()

    def _construire_probabilites_transition(self):
        """Build the transition table by querying the library for every (s, a, s') triple.

        Returns:
            dict: ``P[s][a]`` is a list of ``(prob, prochain_s, recompense, termine)``
            tuples, one per successor state whose probability is strictly positive.
            Every (state, action) pair has an entry, possibly an empty list.
        """
        # The comprehensions must be nested: a flat ``for s ... for a ...`` dict
        # comprehension would rebind P[s] on every action and keep only the last one,
        # making P[s][a] a KeyError for all other actions.
        P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}

        transition = self.lib.secret_env_1_transition_probability
        reward = self.lib.secret_env_1_reward
        for s in range(self.nS):
            for a in range(self.nA):
                for prochain_s in range(self.nS):
                    prob = transition(s, a, prochain_s)
                    if prob > 0:
                        recompense = reward(s, a, prochain_s)
                        termine = False  # Adjust if the environment has terminal states
                        P[s][a].append((prob, prochain_s, recompense, termine))
        return P

    # --- Accessors used by dynamic programming ---
    def nombre_etats(self):
        """Return the number of states read from the library at construction."""
        return self.nS

    def nombre_actions(self):
        """Return the number of actions read from the library at construction."""
        return self.nA

    def recompense(self, etat, action, prochain_etat):
        """Return ``secret_env_1_reward(etat, action, prochain_etat)`` from the library."""
        return self.lib.secret_env_1_reward(etat, action, prochain_etat)

    def probabilite_transition(self, etat, action, prochain_etat):
        """Return ``secret_env_1_transition_probability(etat, action, prochain_etat)`` from the library."""
        return self.lib.secret_env_1_transition_probability(etat, action, prochain_etat)

    # --- Monte Carlo and TD methods related functions ---
    # NOTE(review): every method below reads ``self.wrapper`` and ``self.instance``,
    # which this class never assigns, and calls ``secret_env_2_*`` symbols although
    # the class wraps ``secret_env_1_*``. As written, calling any of the instance
    # methods raises AttributeError. They look copied from a SecretEnv2 wrapper;
    # the code is left untouched until the matching ``secret_env_1_*`` prototypes
    # (instance creation, pointer and return types) are confirmed.
    def state_id(self) -> int:
        """Return the result of ``secret_env_2_state_id`` for ``self.instance``."""
        return self.wrapper.lib.secret_env_2_state_id(self.instance)

    def reset(self):
        """Call ``secret_env_2_reset`` on ``self.instance``."""
        self.wrapper.lib.secret_env_2_reset(self.instance)

    def display(self):
        """Call ``secret_env_2_display`` on ``self.instance``."""
        self.wrapper.lib.secret_env_2_display(self.instance)

    def is_forbidden(self, action: int) -> int:
        """Return the result of ``secret_env_2_is_forbidden`` for ``action``."""
        return self.wrapper.lib.secret_env_2_is_forbidden(self.instance, action)

    def is_game_over(self) -> bool:
        """Return the result of ``secret_env_2_is_game_over`` for ``self.instance``."""
        return self.wrapper.lib.secret_env_2_is_game_over(self.instance)

    def available_actions(self) -> np.ndarray:
        """Return a NumPy copy of the action array exposed by the library."""
        actions_len = self.wrapper.lib.secret_env_2_available_actions_len(self.instance)
        actions_pointer = self.wrapper.lib.secret_env_2_available_actions(self.instance)
        arr = np.ctypeslib.as_array(actions_pointer, (actions_len,))
        # Copy before handing the pointer to ``..._available_actions_delete``;
        # presumably that call releases the native buffer (confirm).
        arr_copy = np.copy(arr)
        self.wrapper.lib.secret_env_2_available_actions_delete(actions_pointer, actions_len)
        return arr_copy

    def step(self, action: int):
        """Call ``secret_env_2_step`` on ``self.instance`` with ``action``."""
        self.wrapper.lib.secret_env_2_step(self.instance, action)

    def score(self):
        """Return the result of ``secret_env_2_score`` for ``self.instance``."""
        return self.wrapper.lib.secret_env_2_score(self.instance)

    @staticmethod
    def from_random_state() -> 'SecretEnv2':
        """Build a ``SecretEnv2`` from ``secret_env_2_from_random_state``.

        NOTE(review): ``SecretEnv2`` and ``SecretEnv2Wrapper`` are not defined in the
        visible part of this notebook; unless they exist elsewhere this raises NameError.
        """
        wrapper = SecretEnv2Wrapper()
        instance = wrapper.lib.secret_env_2_from_random_state()
        return SecretEnv2(wrapper, instance)

In [6]:
class IterationDePolitique:
    """Tabular policy iteration: alternate policy evaluation and greedy improvement.

    The environment must expose ``nombre_etats()``, ``nombre_actions()`` and a table
    ``P`` such that ``P[s][a]`` iterates over
    ``(proba, prochain_etat, recompense, termine)`` tuples. The ``termine`` flag is
    unpacked but not used by this class.
    """

    def __init__(self, env, gamma=0.9, theta=1e-6):
        """Start from a uniform policy and an all-zero value function.

        Args:
            env: Environment object (see class docstring for the required interface).
            gamma: Factor applied to the value of the next state in the backups.
            theta: Policy evaluation stops once the largest per-state value change
                in a sweep is below this threshold.
        """
        self.env = env
        self.gamma = gamma
        self.theta = theta
        nb_etats = env.nombre_etats()
        nb_actions = env.nombre_actions()
        # politique[e, a] is the probability of choosing action a in state e.
        self.politique = np.ones([nb_etats, nb_actions]) / nb_actions
        self.fonction_de_valeur = np.zeros(nb_etats)

    def evaluation_de_politique(self):
        """Update ``fonction_de_valeur`` in place until a sweep changes no state by ``theta`` or more.

        NOTE(review): there is no iteration cap; the loop only ends when the
        convergence test passes.
        """
        nb_etats = self.env.nombre_etats()
        nb_actions = self.env.nombre_actions()
        transitions = self.env.P
        valeurs = self.fonction_de_valeur  # same array object: updates are in place
        while True:
            delta = 0
            for e in range(nb_etats):
                ancienne_valeur = valeurs[e]
                nouvelle_valeur = 0
                for a in range(nb_actions):
                    poids = self.politique[e, a]
                    for proba, prochain_etat, recompense, _termine in transitions[e][a]:
                        nouvelle_valeur += poids * proba * (recompense + self.gamma * valeurs[prochain_etat])
                valeurs[e] = nouvelle_valeur
                delta = max(delta, abs(ancienne_valeur - nouvelle_valeur))
            if delta < self.theta:
                break

    def amelioration_de_politique(self):
        """Make the policy greedy with respect to the current value function.

        Returns:
            bool: True when no state changed its argmax action, False otherwise.
        """
        nb_etats = self.env.nombre_etats()
        nb_actions = self.env.nombre_actions()
        transitions = self.env.P
        # Rows of the identity are the one-hot policies; build the matrix once
        # instead of once per state.
        actions_one_hot = np.eye(nb_actions)
        politique_stable = True
        for e in range(nb_etats):
            ancienne_action = np.argmax(self.politique[e])
            valeurs_actions = np.zeros(nb_actions)
            for a in range(nb_actions):
                for proba, prochain_etat, recompense, _termine in transitions[e][a]:
                    valeurs_actions[a] += proba * (recompense + self.gamma * self.fonction_de_valeur[prochain_etat])
            # np.argmax returns the first maximum, so ties go to the lowest action index.
            meilleure_action = np.argmax(valeurs_actions)
            self.politique[e] = actions_one_hot[meilleure_action]
            if ancienne_action != meilleure_action:
                politique_stable = False
        return politique_stable

    def iterer(self):
        """Run evaluation/improvement rounds until the policy is stable.

        Returns:
            tuple: ``(politique, fonction_de_valeur, elapsed_seconds)``.

        NOTE(review): there is no cap on the number of rounds.
        """
        # perf_counter is monotonic and high resolution, unlike the wall clock
        # returned by time.time(), so the measured duration cannot jump.
        start_time = time.perf_counter()
        while True:
            self.evaluation_de_politique()
            if self.amelioration_de_politique():
                break
        end_time = time.perf_counter()
        return self.politique, self.fonction_de_valeur, end_time - start_time
 

In [7]:
def grid_search(env, gammas, thetas):
    """Time policy iteration for every (gamma, theta) combination.

    For each gamma (outer loop) and each theta (inner loop), a fresh
    ``IterationDePolitique`` is run on ``env`` and one line with the elapsed
    time is printed.

    Returns:
        list: ``(gamma, theta, exec_time)`` tuples in iteration order.
    """
    timings = []
    for gamma in gammas:
        for theta in thetas:
            solver = IterationDePolitique(env, gamma=gamma, theta=theta)
            # Only the elapsed time is kept; policy and value function are discarded.
            _, _, exec_time = solver.iterer()
            timings.append((gamma, theta, exec_time))
            print(f"Gamma: {gamma}, Theta: {theta}, Time: {exec_time:.4f} seconds")
    return timings

# Value ranges explored by the grid search: one run per (gamma, theta) pair.
gammas = [
    0.8,
    0.9,
    0.95,
    0.99,
]
thetas = [
    1e-4,
    1e-5,
    1e-6,
    1e-7,
]


In [None]:
# Instantiate the environment (loads the native library and builds its transition table).
env0 = SecretEnv0()

# Time policy iteration for every (gamma, theta) combination.
results = grid_search(env0, gammas, thetas)

# One row per (gamma, theta) pair. pandas is imported in the notebook's top import cell.
results_df = pd.DataFrame(results, columns=['Gamma', 'Theta', 'ExecutionTime'])

# DataFrame.pivot only accepts keyword arguments since pandas 2.0; the keyword
# form also works on older versions.
execution_time_grid = results_df.pivot(index='Gamma', columns='Theta', values='ExecutionTime')

# Heatmap of execution time: rows are gamma values, columns are theta values.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(execution_time_grid, annot=True, fmt=".4f", cmap="YlGnBu", ax=ax)
ax.set_title("Execution Time for Different Gamma and Theta Values")
ax.set_xlabel("Theta")
ax.set_ylabel("Gamma")
plt.show()
