In [1]:
import logging
import numpy as np
from rlberry.agents import Agent
from finite_deterministic_mdp import FiniteDMDP
from utils import backward_induction_sd_active, policy_evaluation
logger = logging.getLogger(__name__)

In [4]:
class EPRLAgent(Agent):
    """
    Elimination-based exploration agent for finite-horizon tabular MDPs.

    The agent keeps, for every stage ``h`` and state ``s``, a set of *active*
    actions. It collects episodes with an exploration policy, maintains reward
    estimates with confidence bonuses, periodically eliminates actions whose
    upper bound falls below the best lower bound, and stops as soon as either
    the maximum diameter at the initial state drops below ``epsilon`` or a
    single active action remains at every reachable (stage, state) pair.

    Parameters
    ----------
    env :
        Environment exposing ``S``, ``A``, ``horizon``, ``P``, ``R``,
        ``get_Reachable()``, ``reset()`` and ``step(action)``. The transition
        array ``env.P`` is used directly as the model (it is never estimated).
    delta : float
        Confidence parameter used in the bonus threshold (see ``_beta``).
    epsilon : float
        Accuracy parameter used by the diameter stopping rule and by ``eval``.
    sampling_rule : str, 'max_diameter' or 'max_coverage' or 'adaptive_max_coverage'
        Exploration strategy. ``'max_coverage'`` is accepted for backward
        compatibility but has no implementation: ``exploration_policy`` raises
        ``NotImplementedError`` for it.
    stage_dependent : bool
        Must be ``True``. Every array of this agent is indexed by stage, so the
        stage-independent variant is not supported.
    period : int or None
        Only used with ``'max_diameter'``: eliminations are attempted every
        ``period`` episodes. Defaults to ``10 * env.S * env.A``.
    **kwargs :
        Forwarded to ``Agent.__init__``.

    Raises
    ------
    ValueError
        If ``sampling_rule`` is not one of the three accepted names.
    NotImplementedError
        If ``stage_dependent`` is false.
    """

    name = "EPRL"

    # Names accepted by the constructor.
    _SAMPLING_RULES = ("max_diameter", "max_coverage", "adaptive_max_coverage")

    def __init__(
        self,
        env,
        delta=0.1,
        epsilon=5,
        sampling_rule="max_diameter",
        stage_dependent=True,
        period=None,
        **kwargs
    ):
        # Validate with real exceptions: `assert` statements vanish under `python -O`.
        if sampling_rule not in self._SAMPLING_RULES:
            raise ValueError(
                "Sampling rule must be either max_diameter or max_coverage or "
                f"adaptive_max_coverage, got {sampling_rule!r}"
            )
        if not stage_dependent:
            # Every method indexes its arrays as [h, s, a]; the stage-independent
            # code path could never run, so fail early with a clear message.
            raise NotImplementedError(
                "EPRLAgent only supports stage_dependent=True"
            )

        Agent.__init__(self, env, **kwargs)
        self.epsilon = epsilon
        self.delta = delta
        self.stage_dependent = stage_dependent
        self.sampling_rule = sampling_rule

        # Depending on the sampling rule, the algorithm works either in periods or by phases.
        self.period = None
        if self.sampling_rule == "max_diameter":
            self.period = 10 * self.env.S * self.env.A if period is None else period
        self.phase_ended = False

        # In case the MDP is not communicating, we only look at state-action pairs
        # that are reachable starting from the initial state.
        self.Reachable = self.env.get_Reachable()
        self.reset()

    def reset(self, **kwargs):
        """Reset every learned quantity, including the sets of active actions."""
        del kwargs
        H = self.env.horizon
        S = self.env.S
        A = self.env.A

        # Set of active actions: all actions at reachable (h, s), empty elsewhere.
        # Rebuilt here so that a reset agent does not inherit past eliminations.
        self.Active = [[set() for _ in range(S)] for _ in range(H)]
        for hh in range(H):
            for ss in self.Reachable[hh]:
                self.Active[hh][ss] = set(range(A))
        self.phase_ended = False

        # initial state (for stopping rule)
        self.initial_state = self.env.reset()

        # Boolean updated by stopping rules
        self.stop = False
        # Episode counter
        self.episode = 0

        shape_hs = (H, S)
        shape_hsa = (H, S, A)
        shape_hsas = (H, S, A, S)

        # D_h^t(s, a), used to compute the max diameter
        self.D_hsa = np.ones(shape_hsa)
        self.V_hs = np.zeros(shape_hs)  # auxiliary only

        # lower bound on the action-value function, used in eliminations
        self.LQ_hsa = np.zeros(shape_hsa)
        self.LV_hs = np.zeros(shape_hs)  # auxiliary only

        # N_h^t(s, a) and N_h^t(s, a, s'), counting the number of visits
        self.N_hsa = np.zeros(shape_hsa)
        self.N_hsas = np.zeros(shape_hsas)
        self.N_min = 0
        # cumulative reward, bonuses and empirical mean rewards
        self.cumR = np.zeros(shape_hsa)
        self.bonus = np.ones(shape_hsa)
        self.R_hat = np.zeros(shape_hsa)
        # The environment's own transition array is used as the model; it is
        # shared (not copied) and must therefore never be written to.
        self.P_hat = self.env.P

    def _beta(self, n):
        """Threshold for building confidence intervals on Q-functions after ``n`` visits."""
        H = self.env.horizon
        S = self.env.S
        A = self.env.A
        delta = self.delta
        return 0.5 * np.log(S * A * H * np.exp(1) * (n + 1) / delta)

    def get_max_diameter(self):
        """First stopping statistic: largest ``D_hsa`` over active actions at (0, initial state).

        Returns ``-np.inf`` when no action is active there.
        """
        s0 = self.initial_state
        return max(
            (self.D_hsa[0, s0, aa] for aa in self.Active[0][s0]),
            default=-np.inf,
        )

    def get_avg_active(self):
        """Average number of active actions per reachable stage-state pair (h, s).

        Returns ``0.0`` when there is no reachable pair at all (instead of
        dividing by zero).
        """
        sizes = [
            len(self.Active[hh][ss])
            for hh in range(self.env.horizon)
            for ss in self.Reachable[hh]
        ]
        if not sizes:
            return 0.0
        return sum(sizes) / len(sizes)

    def one_action_left(self):
        """Second stopping rule: at most one active action left at every reachable (h, s)."""
        return all(
            len(self.Active[hh][ss]) <= 1
            for hh in range(self.env.horizon)
            for ss in self.Reachable[hh]
        )

    def stopping(self):
        """Combine both stopping rules and store the outcome in ``self.stop``."""
        self.stop = (self.get_max_diameter() < self.epsilon) or self.one_action_left()

    def recommendation(self):
        """Recommendation rule: greedy policy w.r.t. the lower bounds ``LQ_hsa``, shape (H, S)."""
        return np.argmax(self.LQ_hsa, axis=2)

    def exploration_policy(self):
        """Sampling rule: return the (H, S) integer policy used to collect data.

        Raises
        ------
        NotImplementedError
            For ``sampling_rule='max_coverage'``, which has no implementation.
        """
        H = self.env.horizon
        S = self.env.S
        A = self.env.A
        if self.sampling_rule == "max_diameter":
            # At each reachable (h, s), play the active action with the largest diameter.
            pi = np.zeros((H, S), dtype=int)
            for hh in range(H):
                for ss in self.Reachable[hh]:
                    best = -np.inf
                    for aa in self.Active[hh][ss]:
                        if self.D_hsa[hh, ss, aa] > best:
                            best = self.D_hsa[hh, ss, aa]
                            pi[hh, ss] = aa
            return pi
        if self.sampling_rule == "adaptive_max_coverage":
            k = self.N_min + 1  # phase number to be incremented gradually
            # Exploration reward, non-zero only at active triplets visited fewer than k times
            ExpR = np.zeros((H, S, A))
            for hh in range(H):
                for ss in self.Reachable[hh]:
                    for aa in self.Active[hh][ss]:
                        if self.N_hsa[hh, ss, aa] < k:
                            ExpR[hh, ss, aa] = 1
            ExpQ = np.zeros((H, S, A))
            ExpV = np.zeros((H, S))
            # eliminated actions can still be played (if they enable to cover the MDP better)
            backward_induction_sd_active(
                ExpQ,
                ExpV,
                ExpR,
                self.P_hat,
                Active=None,
                vmax=np.inf,
            )
            return np.argmax(ExpQ, axis=2)
        raise NotImplementedError(
            f"No exploration policy is implemented for sampling_rule={self.sampling_rule!r}"
        )

    def run_eliminations(self):
        """Tell whether the next episode should run the elimination rule.

        To reduce computational time, eliminations are only attempted every
        ``period`` episodes ('max_diameter') or when a phase has just ended
        ('adaptive_max_coverage'). In the latter case the phase flag is
        consumed by this call. Always returns a bool.
        """
        if self.sampling_rule == "max_diameter":
            return self.episode % self.period == 0
        if self.sampling_rule == "adaptive_max_coverage" and self.phase_ended:
            # Next episode will start a new phase, so reset the boolean to False
            self.phase_ended = False
            return True
        return False

    def elimination(self):
        """Elimination rule: drop active actions that cannot belong to an optimal policy.

        An action ``a`` at (h, s) is removed when the upper value, at the
        initial state, of policies forced through (h, s, a) is below the lower
        bound on the value at the initial state. The last active action of a
        pair is never removed.
        """
        H = self.env.horizon
        S = self.env.S
        A = self.env.A
        Ur = self.R_hat + self.bonus  # upper bounds on the reward
        # Compute lower bound on the action-value function
        backward_induction_sd_active(
            self.LQ_hsa,
            self.LV_hs,
            self.R_hat - self.bonus,
            self.P_hat,
            Active=None,
            vmax=np.inf,
        )
        # maximum lower bound: serves as threshold to eliminate suboptimal triplets (h, s, a)
        maxL = self.LV_hs[0, self.initial_state]
        for hh in range(H):
            states = {self.initial_state} if hh == 0 else range(S)
            for ss in states:
                to_remove = []
                for aa in self.Active[hh][ss]:
                    # penalize policies that don't pass through (h, s, a)
                    Ur_hsa = Ur.copy()
                    kept = Ur_hsa[hh, ss, aa]
                    Ur_hsa[hh, :, :] = -np.inf
                    Ur_hsa[hh, ss, aa] = kept
                    UQ = np.zeros((H, S, A))
                    UV = np.zeros((H, S))
                    backward_induction_sd_active(
                        UQ,
                        UV,
                        Ur_hsa,
                        self.P_hat,
                        self.Active,
                        vmax=np.inf,
                    )
                    if UV[0, self.initial_state] < maxL:
                        to_remove.append(aa)
                # Removal happens after the scan so the set is not mutated while iterated.
                for aa in to_remove:
                    if len(self.Active[hh][ss]) > 1:
                        self.Active[hh][ss].remove(aa)

    def _update(self, hh, state, action, reward, next_state):
        """Update counts, reward estimate and bonus for one observed transition.

        Also refreshes ``N_min``, the smallest visit count over active triplets
        reachable from the initial state, and flags the end of a phase when it
        increases under 'adaptive_max_coverage'.
        """
        self.N_hsa[hh, state, action] += 1
        self.N_hsas[hh, state, action, next_state] += 1
        self.cumR[hh, state, action] += reward

        n_hsa = self.N_hsa[hh, state, action]
        self.R_hat[hh, state, action] = self.cumR[hh, state, action] / n_hsa
        self.bonus[hh, state, action] = min(np.sqrt(self._beta(n_hsa) / n_hsa), 1)

        previous_min = self.N_min
        # we only care about the number of visits to state-action pairs reachable from s_0;
        # distinct loop names keep the `hh` argument intact.
        self.N_min = min(
            (
                self.N_hsa[h_idx, s_idx, a_idx]
                for h_idx in range(self.env.horizon)
                for s_idx in self.Reachable[h_idx]
                for a_idx in self.Active[h_idx][s_idx]
            ),
            default=np.inf,
        )
        if self.sampling_rule == "adaptive_max_coverage" and self.N_min > previous_min:
            self.phase_ended = True

    def run_episode(self, eliminate=False):
        """Play one episode, then refresh diameters and optionally eliminate actions.

        Parameters
        ----------
        eliminate : bool
            When true (and every active triplet has been visited at least
            once), run the elimination rule and re-evaluate the stopping rules.
        """
        state = self.env.reset()
        for hh in range(self.env.horizon):
            # The policy is recomputed at every step, so under 'adaptive_max_coverage'
            # it reflects the counts updated earlier in the same episode.
            action = self.exploration_policy()[hh, state]
            next_s, reward, done, _ = self.env.step(action)
            del done

            self._update(hh, state, action, reward, next_s)
            state = next_s

        if self.sampling_rule == "max_diameter" or eliminate:
            # update Diameter
            H = self.env.horizon
            S = self.env.S
            backward_induction_sd_active(
                self.D_hsa,
                np.zeros((H, S)),
                2 * self.bonus,
                self.P_hat,
                self.Active,
                vmax=np.inf,
            )
        if eliminate and self.N_min > 0:
            self.elimination()
            self.stopping()

        # write info
        if self.writer is not None:
            self.writer.add_scalar(
                "max_diameter", self.get_max_diameter(), self.episode
            )
            self.writer.add_scalar(
                "avg_active_actions", self.get_avg_active(), self.episode
            )

        self.episode += 1

    def eval(self, **kwargs):
        """Return 1 if the recommended policy is epsilon-optimal at the initial state, else 0.

        The value of the recommended policy and the optimal value are both
        computed from the environment's own ``R`` and ``P``.
        """
        del kwargs
        H = self.env.horizon
        S = self.env.S
        A = self.env.A
        pi_hat = self.recommendation()
        Vpi = policy_evaluation(self.env, pi_hat)[0, self.initial_state]
        Qstar = np.zeros((H, S, A))
        Vstar = np.zeros((H, S))
        backward_induction_sd_active(Qstar, Vstar, self.env.R, self.env.P, Active=None)
        Vstar = Vstar[0, self.initial_state]
        return int(Vpi > Vstar - self.epsilon)

    def fit(self, budget, **kwargs):
        """Run episodes until a stopping rule fires or ``budget`` episodes were played.

        Parameters
        ----------
        budget : int or float
            Maximum number of episodes.
        **kwargs :
            Ignored.

        Returns
        -------
        The recommended policy, an (H, S) array of action indices.
        """
        del kwargs
        while (not self.stop) and self.episode < budget:
            self.run_episode(eliminate=self.run_eliminations())
            # Single-line message: the former backslash continuation leaked its
            # indentation and a stray '#' into the printed text.
            print(
                f"Episode n°{self.episode}, N_min = {self.N_min}, "
                f"Max diameter = {self.get_max_diameter()}, "
                f"Average active actions = {self.get_avg_active()}"
            )
        pi_hat = self.recommendation()
        return pi_hat

In [3]:
# Problem size: number of states, number of actions, horizon.
S, A, H = 2, 2, 2
# Accuracy and confidence parameters passed to the agents below.
epsilon, delta = 0.7, 0.1
# NOTE(review): no random seed is set before drawing the environment, so each
# run may produce a different MDP — confirm how FiniteDMDP.Random is seeded.
env = FiniteDMDP.Random(H, S, A, Rmax=5)

In [21]:
# Agent configuration for the 'max_diameter' sampling rule.
params = dict(
    sampling_rule="max_diameter",
    delta=delta,
    epsilon=epsilon,
    stage_dependent=True,
    period=None,
)
agent = EPRLAgent(env, **params)
# Run for at most 1e6 episodes; `fit` discards extra keyword arguments.
pi = agent.fit(1e6, period=None)

Episode n°1, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°2, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°3, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°4, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°5, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°6, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°7, N_min = 0.0            #Max diameter = 4.0, Average active actions = 2.0
Episode n°8, N_min = 0.0            #Max diameter = 3.869687747607603, Average active actions = 2.0
Episode n°9, N_min = 0.0            #Max diameter = 3.869687747607603, Average active actions = 2.0
Episode n°10, N_min = 1.0            #Max diameter = 3.693964120505737, Average active actions = 2.0
Episode n°11, N_min = 1.0            #Max diameter = 3.693964120505737, Average active actions = 

In [22]:
# NOTE(review): `x` is only assigned in the cell numbered In [18], which appears
# below this one; unless `x` is defined elsewhere, a fresh top-to-bottom run
# raises NameError here. The displayed difference relies on that earlier execution.
agent.cumR - x

array([[[-12.38044938,  25.41448743],
        [  0.        ,   0.        ]],

       [[-74.80961166,   1.38573704],
        [-34.55874052, -10.97415811]]])

In [18]:
# Snapshot the cumulative rewards. Copy the array so that later in-place updates
# made by the agent do not silently change the stored reference values.
x = agent.cumR.copy()

In [6]:
# 1 if the recommended policy's value at the initial state exceeds the optimal
# value minus epsilon, 0 otherwise.
agent.eval()

1

In [7]:
# Agent configuration for the 'adaptive_max_coverage' sampling rule.
params = dict(
    sampling_rule="adaptive_max_coverage",
    delta=delta,
    epsilon=epsilon,
    stage_dependent=True,
    period=None,
)
agent = EPRLAgent(env, **params)
# Run for at most 1e6 episodes; `fit` discards extra keyword arguments.
pi = agent.fit(1e6, period=None)

Episode n°1, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°2, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°3, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°4, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°5, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°6, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°7, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°8, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°9, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°10, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°11, N_min = 0.0            Max diameter = 0.0, Average active actions = 3.0
Episode n°12, N_min = 0.0            Max diameter = 0.0, Averag

Episode n°209, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°210, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°211, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°212, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°213, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°214, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°215, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°216, N_min = 16.0            Max diameter = 5.483520567254724, Average active actions = 2.9166666666666665
Episode n°217, N_min = 16.0            Max diameter = 5.48352056

Episode n°421, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°422, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°423, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°424, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°425, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°426, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°427, N_min = 32.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°428, N_min = 33.0            Max diameter = 4.014294176870368, Average active actions = 2.6666666666666665
Episode n°429, N_min = 33.0            Max diameter = 3.96020566

Episode n°646, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°647, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°648, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°649, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°650, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°651, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°652, N_min = 55.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°653, N_min = 56.0            Max diameter = 3.04039691019528, Average active actions = 2.0
Episode n°654, N_min = 56.0            Max diameter = 3.0173682753336406, Average active actions = 2.0
Episode n°655, N_min = 56.0            Max diameter = 3.0173682753336406, Average active 

Episode n°869, N_min = 93.0            Max diameter = 2.2626062994328744, Average active actions = 1.6666666666666667
Episode n°870, N_min = 93.0            Max diameter = 2.2626062994328744, Average active actions = 1.6666666666666667
Episode n°871, N_min = 93.0            Max diameter = 2.2626062994328744, Average active actions = 1.6666666666666667
Episode n°872, N_min = 94.0            Max diameter = 2.2626062994328744, Average active actions = 1.6666666666666667
Episode n°873, N_min = 94.0            Max diameter = 2.252439386127936, Average active actions = 1.6666666666666667
Episode n°874, N_min = 94.0            Max diameter = 2.252439386127936, Average active actions = 1.6666666666666667
Episode n°875, N_min = 94.0            Max diameter = 2.252439386127936, Average active actions = 1.6666666666666667
Episode n°876, N_min = 94.0            Max diameter = 2.252439386127936, Average active actions = 1.6666666666666667
Episode n°877, N_min = 95.0            Max diameter = 2.2524

Episode n°1090, N_min = 137.0            Max diameter = 1.918321687446625, Average active actions = 1.6666666666666667
Episode n°1091, N_min = 137.0            Max diameter = 1.918321687446625, Average active actions = 1.6666666666666667
Episode n°1092, N_min = 138.0            Max diameter = 1.918321687446625, Average active actions = 1.6666666666666667
Episode n°1093, N_min = 138.0            Max diameter = 1.9123163173005837, Average active actions = 1.6666666666666667
Episode n°1094, N_min = 138.0            Max diameter = 1.9123163173005837, Average active actions = 1.6666666666666667
Episode n°1095, N_min = 138.0            Max diameter = 1.9123163173005837, Average active actions = 1.6666666666666667
Episode n°1096, N_min = 138.0            Max diameter = 1.9123163173005837, Average active actions = 1.6666666666666667
Episode n°1097, N_min = 139.0            Max diameter = 1.9123163173005837, Average active actions = 1.6666666666666667
Episode n°1098, N_min = 139.0            Ma

Episode n°1308, N_min = 181.0            Max diameter = 1.7001015589108288, Average active actions = 1.5
Episode n°1309, N_min = 181.0            Max diameter = 1.7001015589108288, Average active actions = 1.5
Episode n°1310, N_min = 181.0            Max diameter = 1.7001015589108288, Average active actions = 1.5
Episode n°1311, N_min = 181.0            Max diameter = 1.7001015589108288, Average active actions = 1.5
Episode n°1312, N_min = 182.0            Max diameter = 1.7001015589108288, Average active actions = 1.5
Episode n°1313, N_min = 182.0            Max diameter = 1.696008099614727, Average active actions = 1.5
Episode n°1314, N_min = 182.0            Max diameter = 1.696008099614727, Average active actions = 1.5
Episode n°1315, N_min = 182.0            Max diameter = 1.696008099614727, Average active actions = 1.5
Episode n°1316, N_min = 182.0            Max diameter = 1.696008099614727, Average active actions = 1.5
Episode n°1317, N_min = 183.0            Max diameter = 1.6

Episode n°1539, N_min = 227.0            Max diameter = 1.4736498035603571, Average active actions = 1.4166666666666667
Episode n°1540, N_min = 227.0            Max diameter = 1.4736498035603571, Average active actions = 1.4166666666666667
Episode n°1541, N_min = 227.0            Max diameter = 1.4736498035603571, Average active actions = 1.4166666666666667
Episode n°1542, N_min = 228.0            Max diameter = 1.4736498035603571, Average active actions = 1.4166666666666667
Episode n°1543, N_min = 228.0            Max diameter = 1.470989958069039, Average active actions = 1.4166666666666667
Episode n°1544, N_min = 228.0            Max diameter = 1.470989958069039, Average active actions = 1.4166666666666667
Episode n°1545, N_min = 228.0            Max diameter = 1.470989958069039, Average active actions = 1.4166666666666667
Episode n°1546, N_min = 228.0            Max diameter = 1.470989958069039, Average active actions = 1.4166666666666667
Episode n°1547, N_min = 229.0            Max

Episode n°1772, N_min = 274.0            Max diameter = 1.3653094516564246, Average active actions = 1.25
Episode n°1773, N_min = 274.0            Max diameter = 1.3632305285849458, Average active actions = 1.25
Episode n°1774, N_min = 274.0            Max diameter = 1.3632305285849458, Average active actions = 1.25
Episode n°1775, N_min = 274.0            Max diameter = 1.3632305285849458, Average active actions = 1.25
Episode n°1776, N_min = 274.0            Max diameter = 1.3632305285849458, Average active actions = 1.25
Episode n°1777, N_min = 275.0            Max diameter = 1.3632305285849458, Average active actions = 1.25
Episode n°1778, N_min = 275.0            Max diameter = 1.3611617692631186, Average active actions = 1.25
Episode n°1779, N_min = 275.0            Max diameter = 1.3611617692631186, Average active actions = 1.25
Episode n°1780, N_min = 275.0            Max diameter = 1.3611617692631186, Average active actions = 1.25
Episode n°1781, N_min = 275.0            Max d

Episode n°2014, N_min = 322.0            Max diameter = 1.2425412146096995, Average active actions = 1.0833333333333333
Episode n°2015, N_min = 322.0            Max diameter = 1.2425412146096995, Average active actions = 1.0833333333333333
Episode n°2016, N_min = 322.0            Max diameter = 1.2425412146096995, Average active actions = 1.0833333333333333
Episode n°2017, N_min = 323.0            Max diameter = 1.2425412146096995, Average active actions = 1.0833333333333333
Episode n°2018, N_min = 323.0            Max diameter = 1.240801950384084, Average active actions = 1.0833333333333333
Episode n°2019, N_min = 323.0            Max diameter = 1.240801950384084, Average active actions = 1.0833333333333333
Episode n°2020, N_min = 323.0            Max diameter = 1.240801950384084, Average active actions = 1.0833333333333333
Episode n°2021, N_min = 323.0            Max diameter = 1.240801950384084, Average active actions = 1.0833333333333333
Episode n°2022, N_min = 324.0            Max

Episode n°2259, N_min = 371.0            Max diameter = 1.1654349556457495, Average active actions = 1.0833333333333333
Episode n°2260, N_min = 371.0            Max diameter = 1.1654349556457495, Average active actions = 1.0833333333333333
Episode n°2261, N_min = 371.0            Max diameter = 1.1654349556457495, Average active actions = 1.0833333333333333
Episode n°2262, N_min = 372.0            Max diameter = 1.1654349556457495, Average active actions = 1.0833333333333333
Episode n°2263, N_min = 372.0            Max diameter = 1.1640152426726917, Average active actions = 1.0833333333333333
Episode n°2264, N_min = 372.0            Max diameter = 1.1640152426726917, Average active actions = 1.0833333333333333
Episode n°2265, N_min = 372.0            Max diameter = 1.1640152426726917, Average active actions = 1.0833333333333333
Episode n°2266, N_min = 372.0            Max diameter = 1.1640152426726917, Average active actions = 1.0833333333333333
Episode n°2267, N_min = 373.0           

Episode n°2497, N_min = 419.0            Max diameter = 1.1040950698802132, Average active actions = 1.0833333333333333
Episode n°2498, N_min = 419.0            Max diameter = 1.102898561448447, Average active actions = 1.0833333333333333
Episode n°2499, N_min = 419.0            Max diameter = 1.102898561448447, Average active actions = 1.0833333333333333
Episode n°2500, N_min = 419.0            Max diameter = 1.102898561448447, Average active actions = 1.0833333333333333
Episode n°2501, N_min = 419.0            Max diameter = 1.102898561448447, Average active actions = 1.0833333333333333
Episode n°2502, N_min = 420.0            Max diameter = 1.102898561448447, Average active actions = 1.0833333333333333
Episode n°2503, N_min = 420.0            Max diameter = 1.101706147361514, Average active actions = 1.0833333333333333
Episode n°2504, N_min = 420.0            Max diameter = 1.101706147361514, Average active actions = 1.0833333333333333
Episode n°2505, N_min = 420.0            Max di

Episode n°2734, N_min = 466.0            Max diameter = 1.0509062182658684, Average active actions = 1.0833333333333333
Episode n°2735, N_min = 466.0            Max diameter = 1.0509062182658684, Average active actions = 1.0833333333333333
Episode n°2736, N_min = 466.0            Max diameter = 1.0509062182658684, Average active actions = 1.0833333333333333
Episode n°2737, N_min = 467.0            Max diameter = 1.0509062182658684, Average active actions = 1.0833333333333333
Episode n°2738, N_min = 467.0            Max diameter = 1.0498825309292446, Average active actions = 1.0833333333333333
Episode n°2739, N_min = 467.0            Max diameter = 1.0498825309292446, Average active actions = 1.0833333333333333
Episode n°2740, N_min = 467.0            Max diameter = 1.0498825309292446, Average active actions = 1.0833333333333333
Episode n°2741, N_min = 467.0            Max diameter = 1.0498825309292446, Average active actions = 1.0833333333333333
Episode n°2742, N_min = 468.0           

Episode n°2961, N_min = 511.0            Max diameter = 1.0077393815369986, Average active actions = 1.0833333333333333
Episode n°2962, N_min = 512.0            Max diameter = 1.0077393815369986, Average active actions = 1.0833333333333333
Episode n°2963, N_min = 512.0            Max diameter = 1.00684263264978, Average active actions = 1.0833333333333333
Episode n°2964, N_min = 512.0            Max diameter = 1.00684263264978, Average active actions = 1.0833333333333333
Episode n°2965, N_min = 512.0            Max diameter = 1.00684263264978, Average active actions = 1.0833333333333333
Episode n°2966, N_min = 512.0            Max diameter = 1.00684263264978, Average active actions = 1.0833333333333333
Episode n°2967, N_min = 513.0            Max diameter = 1.00684263264978, Average active actions = 1.0833333333333333
Episode n°2968, N_min = 513.0            Max diameter = 1.0059483990369151, Average active actions = 1.0833333333333333
Episode n°2969, N_min = 513.0            Max diame

Episode n°3204, N_min = 560.0            Max diameter = 0.9665529788743643, Average active actions = 1.0833333333333333
Episode n°3205, N_min = 560.0            Max diameter = 0.9665529788743643, Average active actions = 1.0833333333333333
Episode n°3206, N_min = 560.0            Max diameter = 0.9665529788743643, Average active actions = 1.0833333333333333
Episode n°3207, N_min = 561.0            Max diameter = 0.9665529788743643, Average active actions = 1.0833333333333333
Episode n°3208, N_min = 561.0            Max diameter = 0.9657668118942502, Average active actions = 1.0833333333333333
Episode n°3209, N_min = 561.0            Max diameter = 0.9657668118942502, Average active actions = 1.0833333333333333
Episode n°3210, N_min = 561.0            Max diameter = 0.9657668118942502, Average active actions = 1.0833333333333333
Episode n°3211, N_min = 561.0            Max diameter = 0.9657668118942502, Average active actions = 1.0833333333333333
Episode n°3212, N_min = 562.0           

[INFO] [EPRL[worker: -1]] | max_global_step = 3385 | max_diameter = 0.9394607410448919 | dw_time_elapsed = 3.000398915995902 | avg_active_actions = 1.0833333333333333 |  


Episode n°3245, N_min = 568.0            Max diameter = 0.9603195565494383, Average active actions = 1.0833333333333333
Episode n°3246, N_min = 568.0            Max diameter = 0.9603195565494383, Average active actions = 1.0833333333333333
Episode n°3247, N_min = 569.0            Max diameter = 0.9603195565494383, Average active actions = 1.0833333333333333
Episode n°3248, N_min = 569.0            Max diameter = 0.9595492623218395, Average active actions = 1.0833333333333333
Episode n°3249, N_min = 569.0            Max diameter = 0.9595492623218395, Average active actions = 1.0833333333333333
Episode n°3250, N_min = 569.0            Max diameter = 0.9595492623218395, Average active actions = 1.0833333333333333
Episode n°3251, N_min = 569.0            Max diameter = 0.9595492623218395, Average active actions = 1.0833333333333333
Episode n°3252, N_min = 570.0            Max diameter = 0.9595492623218395, Average active actions = 1.0833333333333333
Episode n°3253, N_min = 570.0           

Episode n°3429, N_min = 605.0            Max diameter = 0.9330534373059948, Average active actions = 1.0833333333333333
Episode n°3430, N_min = 605.0            Max diameter = 0.9330534373059948, Average active actions = 1.0833333333333333
Episode n°3431, N_min = 605.0            Max diameter = 0.9330534373059948, Average active actions = 1.0833333333333333
Episode n°3432, N_min = 606.0            Max diameter = 0.9330534373059948, Average active actions = 1.0833333333333333
Episode n°3433, N_min = 606.0            Max diameter = 0.9323499875160142, Average active actions = 1.0833333333333333
Episode n°3434, N_min = 606.0            Max diameter = 0.9323499875160142, Average active actions = 1.0833333333333333
Episode n°3435, N_min = 606.0            Max diameter = 0.9323499875160142, Average active actions = 1.0833333333333333
Episode n°3436, N_min = 606.0            Max diameter = 0.9323499875160142, Average active actions = 1.0833333333333333
Episode n°3437, N_min = 607.0           

Episode n°3655, N_min = 650.0            Max diameter = 0.9029616438227805, Average active actions = 1.0833333333333333
Episode n°3656, N_min = 650.0            Max diameter = 0.9029616438227805, Average active actions = 1.0833333333333333
Episode n°3657, N_min = 651.0            Max diameter = 0.9029616438227805, Average active actions = 1.0833333333333333
Episode n°3658, N_min = 651.0            Max diameter = 0.9023272104361872, Average active actions = 1.0833333333333333
Episode n°3659, N_min = 651.0            Max diameter = 0.9023272104361872, Average active actions = 1.0833333333333333
Episode n°3660, N_min = 651.0            Max diameter = 0.9023272104361872, Average active actions = 1.0833333333333333
Episode n°3661, N_min = 651.0            Max diameter = 0.9023272104361872, Average active actions = 1.0833333333333333
Episode n°3662, N_min = 652.0            Max diameter = 0.9023272104361872, Average active actions = 1.0833333333333333
Episode n°3663, N_min = 652.0           

Episode n°3864, N_min = 692.0            Max diameter = 0.877466098661224, Average active actions = 1.0833333333333333
Episode n°3865, N_min = 692.0            Max diameter = 0.877466098661224, Average active actions = 1.0833333333333333
Episode n°3866, N_min = 692.0            Max diameter = 0.877466098661224, Average active actions = 1.0833333333333333
Episode n°3867, N_min = 693.0            Max diameter = 0.877466098661224, Average active actions = 1.0833333333333333
Episode n°3868, N_min = 693.0            Max diameter = 0.8768863743507044, Average active actions = 1.0833333333333333
Episode n°3869, N_min = 693.0            Max diameter = 0.8768863743507044, Average active actions = 1.0833333333333333
Episode n°3870, N_min = 693.0            Max diameter = 0.8768863743507044, Average active actions = 1.0833333333333333
Episode n°3871, N_min = 693.0            Max diameter = 0.8768863743507044, Average active actions = 1.0833333333333333
Episode n°3872, N_min = 694.0            Max

Episode n°4098, N_min = 739.0            Max diameter = 0.8514565661255433, Average active actions = 1.0833333333333333
Episode n°4099, N_min = 739.0            Max diameter = 0.8514565661255433, Average active actions = 1.0833333333333333
Episode n°4100, N_min = 739.0            Max diameter = 0.8514565661255433, Average active actions = 1.0833333333333333
Episode n°4101, N_min = 739.0            Max diameter = 0.8514565661255433, Average active actions = 1.0833333333333333
Episode n°4102, N_min = 740.0            Max diameter = 0.8514565661255433, Average active actions = 1.0833333333333333
Episode n°4103, N_min = 740.0            Max diameter = 0.8509292223018351, Average active actions = 1.0833333333333333
Episode n°4104, N_min = 740.0            Max diameter = 0.8509292223018351, Average active actions = 1.0833333333333333
Episode n°4105, N_min = 740.0            Max diameter = 0.8509292223018351, Average active actions = 1.0833333333333333
Episode n°4106, N_min = 740.0           

Episode n°4314, N_min = 782.0            Max diameter = 0.8296684238740473, Average active actions = 1.0833333333333333
Episode n°4315, N_min = 782.0            Max diameter = 0.8296684238740473, Average active actions = 1.0833333333333333
Episode n°4316, N_min = 782.0            Max diameter = 0.8296684238740473, Average active actions = 1.0833333333333333
Episode n°4317, N_min = 783.0            Max diameter = 0.8296684238740473, Average active actions = 1.0833333333333333
Episode n°4318, N_min = 783.0            Max diameter = 0.8291823799214939, Average active actions = 1.0833333333333333
Episode n°4319, N_min = 783.0            Max diameter = 0.8291823799214939, Average active actions = 1.0833333333333333
Episode n°4320, N_min = 783.0            Max diameter = 0.8291823799214939, Average active actions = 1.0833333333333333
Episode n°4321, N_min = 783.0            Max diameter = 0.8291823799214939, Average active actions = 1.0833333333333333
Episode n°4322, N_min = 784.0           

Episode n°4524, N_min = 824.0            Max diameter = 0.8099946783635668, Average active actions = 1.0833333333333333
Episode n°4525, N_min = 824.0            Max diameter = 0.8099946783635668, Average active actions = 1.0833333333333333
Episode n°4526, N_min = 824.0            Max diameter = 0.8099946783635668, Average active actions = 1.0833333333333333
Episode n°4527, N_min = 825.0            Max diameter = 0.8099946783635668, Average active actions = 1.0833333333333333
Episode n°4528, N_min = 825.0            Max diameter = 0.8095439694683432, Average active actions = 1.0833333333333333
Episode n°4529, N_min = 825.0            Max diameter = 0.8095439694683432, Average active actions = 1.0833333333333333
Episode n°4530, N_min = 825.0            Max diameter = 0.8095439694683432, Average active actions = 1.0833333333333333
Episode n°4531, N_min = 825.0            Max diameter = 0.8095439694683432, Average active actions = 1.0833333333333333
Episode n°4532, N_min = 826.0           

Episode n°4759, N_min = 871.0            Max diameter = 0.7896273689962601, Average active actions = 1.0833333333333333
Episode n°4760, N_min = 871.0            Max diameter = 0.7896273689962601, Average active actions = 1.0833333333333333
Episode n°4761, N_min = 871.0            Max diameter = 0.7896273689962601, Average active actions = 1.0833333333333333
Episode n°4762, N_min = 872.0            Max diameter = 0.7896273689962601, Average active actions = 1.0833333333333333
Episode n°4763, N_min = 872.0            Max diameter = 0.7892113418183336, Average active actions = 1.0833333333333333
Episode n°4764, N_min = 872.0            Max diameter = 0.7892113418183336, Average active actions = 1.0833333333333333
Episode n°4765, N_min = 872.0            Max diameter = 0.7892113418183336, Average active actions = 1.0833333333333333
Episode n°4766, N_min = 872.0            Max diameter = 0.7892113418183336, Average active actions = 1.0833333333333333
Episode n°4767, N_min = 873.0           

Episode n°4980, N_min = 915.0            Max diameter = 0.7719492178529233, Average active actions = 1.0833333333333333
Episode n°4981, N_min = 915.0            Max diameter = 0.7719492178529233, Average active actions = 1.0833333333333333
Episode n°4982, N_min = 916.0            Max diameter = 0.7719492178529233, Average active actions = 1.0833333333333333
Episode n°4983, N_min = 916.0            Max diameter = 0.771561771055867, Average active actions = 1.0833333333333333
Episode n°4984, N_min = 916.0            Max diameter = 0.771561771055867, Average active actions = 1.0833333333333333
Episode n°4985, N_min = 916.0            Max diameter = 0.771561771055867, Average active actions = 1.0833333333333333
Episode n°4986, N_min = 916.0            Max diameter = 0.771561771055867, Average active actions = 1.0833333333333333
Episode n°4987, N_min = 917.0            Max diameter = 0.771561771055867, Average active actions = 1.0833333333333333
Episode n°4988, N_min = 917.0            Max 

Episode n°5207, N_min = 961.0            Max diameter = 0.7550967073122306, Average active actions = 1.0833333333333333
Episode n°5208, N_min = 961.0            Max diameter = 0.7547352249378734, Average active actions = 1.0833333333333333
Episode n°5209, N_min = 961.0            Max diameter = 0.7547352249378734, Average active actions = 1.0833333333333333
Episode n°5210, N_min = 961.0            Max diameter = 0.7547352249378734, Average active actions = 1.0833333333333333
Episode n°5211, N_min = 961.0            Max diameter = 0.7547352249378734, Average active actions = 1.0833333333333333
Episode n°5212, N_min = 962.0            Max diameter = 0.7547352249378734, Average active actions = 1.0833333333333333
Episode n°5213, N_min = 962.0            Max diameter = 0.7543742860378948, Average active actions = 1.0833333333333333
Episode n°5214, N_min = 962.0            Max diameter = 0.7543742860378948, Average active actions = 1.0833333333333333
Episode n°5215, N_min = 962.0           

Episode n°5454, N_min = 1010.0            Max diameter = 0.7376624603700649, Average active actions = 1.0833333333333333
Episode n°5455, N_min = 1010.0            Max diameter = 0.7376624603700649, Average active actions = 1.0833333333333333
Episode n°5456, N_min = 1010.0            Max diameter = 0.7376624603700649, Average active actions = 1.0833333333333333
Episode n°5457, N_min = 1011.0            Max diameter = 0.7376624603700649, Average active actions = 1.0833333333333333
Episode n°5458, N_min = 1011.0            Max diameter = 0.7373265573815173, Average active actions = 1.0833333333333333
Episode n°5459, N_min = 1011.0            Max diameter = 0.7373265573815173, Average active actions = 1.0833333333333333
Episode n°5460, N_min = 1011.0            Max diameter = 0.7373265573815173, Average active actions = 1.0833333333333333
Episode n°5461, N_min = 1011.0            Max diameter = 0.7373265573815173, Average active actions = 1.0833333333333333
Episode n°5462, N_min = 1012.0  

Episode n°5697, N_min = 1059.0            Max diameter = 0.7220615969857862, Average active actions = 1.0833333333333333
Episode n°5698, N_min = 1059.0            Max diameter = 0.7217475072895572, Average active actions = 1.0833333333333333
Episode n°5699, N_min = 1059.0            Max diameter = 0.7217475072895572, Average active actions = 1.0833333333333333
Episode n°5700, N_min = 1059.0            Max diameter = 0.7217475072895572, Average active actions = 1.0833333333333333
Episode n°5701, N_min = 1059.0            Max diameter = 0.7217475072895572, Average active actions = 1.0833333333333333
Episode n°5702, N_min = 1060.0            Max diameter = 0.7217475072895572, Average active actions = 1.0833333333333333
Episode n°5703, N_min = 1060.0            Max diameter = 0.7214338465031819, Average active actions = 1.0833333333333333
Episode n°5704, N_min = 1060.0            Max diameter = 0.7214338465031819, Average active actions = 1.0833333333333333
Episode n°5705, N_min = 1060.0  

Episode n°5953, N_min = 1110.0            Max diameter = 0.7062767048237281, Average active actions = 1.0833333333333333
Episode n°5954, N_min = 1110.0            Max diameter = 0.7062767048237281, Average active actions = 1.0833333333333333
Episode n°5955, N_min = 1110.0            Max diameter = 0.7062767048237281, Average active actions = 1.0833333333333333
Episode n°5956, N_min = 1110.0            Max diameter = 0.7062767048237281, Average active actions = 1.0833333333333333
Episode n°5957, N_min = 1111.0            Max diameter = 0.7062767048237281, Average active actions = 1.0833333333333333
Episode n°5958, N_min = 1111.0            Max diameter = 0.7059836779289276, Average active actions = 1.0833333333333333
Episode n°5959, N_min = 1111.0            Max diameter = 0.7059836779289276, Average active actions = 1.0833333333333333
Episode n°5960, N_min = 1111.0            Max diameter = 0.7059836779289276, Average active actions = 1.0833333333333333
Episode n°5961, N_min = 1111.0  

In [16]:
# Display the environment's R array for inspection. In the recorded output it is a
# (5, 3, 3) float array; presumably rewards indexed as (stage, state, action) —
# TODO confirm against FiniteDMDP.
env.R

array([[[2.31426581, 3.05189284, 3.77593981],
        [4.18906457, 4.22974016, 1.81154989],
        [2.85284899, 0.69458005, 0.77685269]],

       [[4.22090351, 3.5755985 , 3.04354327],
        [2.59941104, 3.38132414, 1.14024523],
        [0.42770628, 3.11406324, 1.9335355 ]],

       [[3.15987656, 2.73089348, 3.70908804],
        [2.24172982, 0.13550049, 0.44939998],
        [4.7286501 , 1.21674737, 2.50362218]],

       [[3.26349671, 2.65378138, 0.05939658],
        [2.37605557, 0.58122708, 0.62496351],
        [4.19244264, 1.37521627, 3.33148151]],

       [[0.87377627, 4.49693418, 0.64739882],
        [0.08429628, 1.17384139, 0.63380822],
        [2.65193002, 0.65218786, 3.4755173 ]]])

In [9]:
# Display the environment's P array for inspection. In the recorded output it is a
# (4, 3, 3, 3) array whose innermost rows are all one-hot (a single 1., rest 0.).
# Presumably transitions indexed as (stage, state, action, next_state) — TODO confirm
# against FiniteDMDP.
env.P

array([[[[0., 0., 1.],
         [1., 0., 0.],
         [0., 0., 1.]],

        [[0., 0., 1.],
         [0., 0., 1.],
         [0., 1., 0.]],

        [[1., 0., 0.],
         [0., 1., 0.],
         [0., 0., 1.]]],


       [[[0., 0., 1.],
         [0., 0., 1.],
         [1., 0., 0.]],

        [[0., 1., 0.],
         [1., 0., 0.],
         [0., 1., 0.]],

        [[0., 0., 1.],
         [0., 1., 0.],
         [0., 0., 1.]]],


       [[[0., 1., 0.],
         [1., 0., 0.],
         [0., 1., 0.]],

        [[0., 0., 1.],
         [1., 0., 0.],
         [1., 0., 0.]],

        [[1., 0., 0.],
         [1., 0., 0.],
         [0., 1., 0.]]],


       [[[0., 0., 1.],
         [0., 0., 1.],
         [1., 0., 0.]],

        [[0., 0., 1.],
         [0., 1., 0.],
         [1., 0., 0.]],

        [[1., 0., 0.],
         [1., 0., 0.],
         [1., 0., 0.]]]])

In [11]:
# Display `pi`, which is defined in a cell outside this section. In the recorded output
# it is a (5, 3) integer array with entries in {0, 1, 2}; presumably the action chosen
# for each (stage, state) — TODO confirm where `pi` is computed.
pi

array([[2, 0, 2],
       [2, 1, 1],
       [1, 2, 2],
       [0, 0, 1],
       [2, 1, 0]])