# Environment Base

> Base class for CRLD environments

In [None]:
#| default_exp Environments/Base

In [None]:
#| hide
# Imports for the nbdev development environment
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from fastcore.utils import *
import numpy as np

In [None]:
#| export
class ebase(object):
    """Base environment. All environments should inherit from this one.

    On construction the model is built from the core methods and checked for
    consistency. The checks imply the following layouts:

    * `T` -- transition tensor, indexed `[s, a_1, ..., a_N, s']`
    * `R` -- reward tensor, indexed `[i, s, a_1, ..., a_N, s']`
    * `O` -- observation tensor, indexed `[i, s, o]`
    * `F` -- final-state array, indexed `[s]`
    * `Aset`, `Sset`, `Oset` -- labels of actions, states and observations
    """

    def __init__(self):
        """Build tensors and label sets, then assert that they are consistent.

        Raises `AssertionError` if the number of agents, actions, states or
        observations disagrees between the tensors and the label sets, or if
        the transition probabilities do not sum up to one.
        """
        # The observation tensor is built before the observation labels on
        # purpose: `observations` may depend on what `ObservationTensor` set up.
        self.T = self.TransitionTensor()
        self.F = np.array(self.FinalStates())
        self.R = self.RewardTensor()
        self.O = self.ObservationTensor()

        self.Aset = self.actions()
        self.Sset = self.states()
        self.Oset = self.observations()

        # CHECKS
        R, T, O = self.R, self.T, self.O

        # number of agents
        N = R.shape[0]
        assert O.shape[0] == N, "Inconsistent number of agents"
        assert len(T.shape[1:-1]) == N, "Inconsistent number of agents"
        assert len(R.shape[2:-1]) == N, "Inconsistent number of agents"

        # number of actions for each agent
        M = T.shape[1]
        assert np.allclose(T.shape[1:-1], M), 'Inconsistent number of actions'
        assert np.allclose(R.shape[2:-1], M), 'Inconsistent number of actions'
        # Compare plain lists rather than a list with an array: NumPy would
        # broadcast a single action list against all N agents (and accept it),
        # or fail with a shape error instead of this assertion.
        assert [len(acts) for acts in self.Aset] == [M] * N,\
            'Inconsistent number of actions'

        # number of states
        Z = T.shape[0]
        assert T.shape[-1] == Z, 'Inconsistent number of states'
        assert R.shape[-1] == Z, 'Inconsistent number of states'
        assert R.shape[1] == Z, 'Inconsistent number of states'
        assert O.shape[1] == Z, 'Inconsistent number of states'
        assert len(self.F) == Z, 'Inconsistent number of states'
        assert len(self.Sset) == Z, 'Inconsistent number of states'

        # number of observations (list comparison for the same reason as above)
        Q = O.shape[-1]
        assert [len(obs) for obs in self.Oset] == [Q] * N,\
            'Inconsistent number of observations'

        assert np.allclose(T.sum(-1), 1), 'Transition model wrong'
        # TODO: Observation probabilities do not have to sum up to 1 anymore,
        # because we might have observations = 0.5 for partially observable
        # agents. Make sure this is correct: should such observations be zero,
        # or should they always sum up to 1? (The original note doubts that
        # normalisation is always possible.) Until this is resolved the
        # following check stays disabled:
        # assert np.allclose(O.sum(-1), 1), 'Observation model wrong'


The `ebase` class `__init__` mostly contains consistency checks.

## Core methods

These need to be implemented by a concrete environment.

The transition tensor `Tsjas'` gives the probability that the environment transitions to state `s'`, given that it was in state `s` and the agents chose the joint action `ja`.

In [None]:
#| export
@patch
def TransitionTensor(self:ebase):
    """Transition tensor `Tsjas'`; must be implemented by a concrete environment."""
    # The base class has no transition model of its own.
    raise NotImplementedError()

In [None]:
class slf: pass  # dummy self for demonstration only
# `args` is unpacked into positional arguments, so the dummy self has to be
# wrapped in a list; passing the bare class fails before the method is called.
test_fail(ebase.TransitionTensor, args=[slf])

raises `NotImplementedError`.

The reward tensor `Risjas'` gives the reward agent `i` receives when the environment is in state `s`, all agents choose the joint action `ja`, and the environment transitions to state `s'`.

In [None]:
#| export
@patch
def RewardTensor(self:ebase):
    """Reward tensor `Risjas'`; must be implemented by a concrete environment."""
    # The base class has no reward model of its own.
    raise NotImplementedError()

In [None]:
class slf: pass  # dummy self for demonstration only
# `args` is unpacked into positional arguments, so the dummy self has to be
# wrapped in a list; passing the bare class fails before the method is called.
test_fail(ebase.RewardTensor, args=[slf])

raises `NotImplementedError`.

The following two "core" methods are optional. If the concrete environment class does not implement them, they default to the following:

The observation tensor `Oiso` gives the probability that agent `i` observes observation `o` when the environment is in state `s`. The default observation tensor assumes perfect observation and sets the number of observations `Q` to the number of states `Z`.

In [None]:
#| export
@patch
def ObservationTensor(self:ebase):
    """Default observation tensor `Oiso`: every agent observes the state perfectly."""
    # Marker that the default tensor is in use
    self.defaultObsTensUsed = True
    # Perfect observation: exactly one observation per state
    self.Q = self.Z
    identity = np.eye(self.Q)
    # One copy of the identity matrix per agent, stacked along the agent axis
    return np.repeat(identity[np.newaxis, :, :], self.N, axis=0)

In [None]:
# Z = 2 states and N = 3 agents: the result holds one 2x2 identity per agent
class slf: Z = 2; N = 3  # dummy self for demonstration only
ebase.ObservationTensor(slf)

array([[[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]]])

Final states `Fs` indicate which states of the environment cause the end of an episode. Their meaning and use within CRLD are not fully resolved yet. If an environment does not implement `FinalStates` they default to no final states.

In [None]:
#| export
@patch
def FinalStates(self:ebase):
    """Default final-state indicator `Fs`: no state ends an episode."""
    n_states = self.Z
    # integer zero for every state, i.e. no state is marked as final
    return np.zeros(n_states, dtype=int)

In [None]:
# Z = 7 states: the default marks none of them as final
class slf: Z = 7 # dummy self for demonstration only
ebase.FinalStates(slf)

array([0, 0, 0, 0, 0, 0, 0])

## Default string representations
String representations of actions, states and observations help with interpreting the results of simulation runs. Ideally, an environment class will implement these methods with descriptive values.

To show these methods here we create a dummy "self" of 2 environmental states, containing 3 agents with 4 actions and 5 observations of the environmental states.

In [None]:
# dummy self: 2 environmental states (Z), 3 agents (N), 4 actions (M), 5 observations (Q)
class slf: Z = 2; N = 3; M=4; Q=5

In [None]:
#| export
@patch
def actions(self:ebase):
    """Default action labels `act_im`: the strings '0' .. 'M-1' for each of the N agents."""
    # every agent gets its own, freshly built list of labels
    return [list(map(str, range(self.M))) for _ in range(self.N)]

In [None]:
# default action labels for the dummy self
ebase.actions(slf)

[['0', '1', '2', '3'], ['0', '1', '2', '3'], ['0', '1', '2', '3']]

In [None]:
#| export
@patch
def states(self:ebase):
    """Default state labels `state_s`: the strings '0' .. 'Z-1'."""
    state_indices = range(self.Z)
    return list(map(str, state_indices))

In [None]:
# default state labels for the dummy self
ebase.states(slf)

['0', '1']

In [None]:
#| export
@patch
def observations(self:ebase):
    """Default observation labels `obs_io`, one list per agent.

    If the `defaultObsTensUsed` marker is present, the state labels are reused
    as observation labels; otherwise the labels are the strings '0' .. 'Q-1'.
    """
    if not hasattr(self, 'defaultObsTensUsed'):
        # custom observation tensor: fall back to numbered observations
        return [list(map(str, range(self.Q))) for _ in range(self.N)]
    # marker present: observations are named after the states
    return [list(map(str, self.states())) for _ in range(self.N)]


In [None]:
# the dummy self has no `defaultObsTensUsed` marker, so Q numbered labels are used
ebase.observations(slf)

[['0', '1', '2', '3', '4'],
 ['0', '1', '2', '3', '4'],
 ['0', '1', '2', '3', '4']]

In [None]:
#| export
@patch
def id(self:ebase):
    """
    Returns id string of environment.

    By default this is the name of the environment's class.
    """
    env_class = self.__class__
    return env_class.__name__

@patch
def __str__(self:ebase):
    """String form of the environment: its id string."""
    env_id = self.id()
    return env_id

@patch
def __repr__(self:ebase):
    """Representation of the environment: its id string."""
    env_id = self.id()
    return env_id

## Interactive use
Environments can also be used interactively, e.g., with iterative learning algorithms. For this purpose we provide the [OpenAI Gym `step` Interface](https://github.com/openai/gym#api).

In [None]:
#| export
@patch
def step(self:ebase, 
         jA:Iterable # joint actions
        ) -> tuple:  # (observations_i, rewards_i, done, info)
    """
    Iterate the environment one step forward.

    Reads the current state from `self.state`, samples the next state from the
    transition tensor `T` and stores it back in `self.state`. The info dict
    reports the new state under the key 'state'.
    """
    # Materialise the joint action once. It is needed for two index lookups,
    # and a one-shot iterable would already be exhausted at the second one.
    joint_action = tuple(jA)

    # choose a next state according to transition tensor T
    tps = self.T[(self.state, *joint_action)].astype(float)
    next_state = np.random.choice(range(len(tps)), p=tps)

    # rewards of all agents for (current state, joint action, next state)
    rewards = self.R[(slice(self.N), self.state, *joint_action, next_state)]

    # advance the state, then draw the observations of the new state
    self.state = next_state
    obs = self.observation()

    # if state is a final state the episode is done
    done = self.state in np.where(self.F==1)[0]

    # report the true state in the info dict
    info = {'state': self.state}

    return obs, rewards.astype(float), done, info

In [None]:
#| export
@patch
def observation(self:ebase
               ) -> np.ndarray:  # observations_i
    """
    Draw one observation index per agent for the current state.

    Each agent's observation is sampled with the probabilities stored in the
    observation tensor `O` for that agent and `self.state`.
    """
    def draw(agent):
        # observation probabilities of this agent in the current state
        probs = self.O[agent, self.state]
        return np.random.choice(range(len(probs)), p=probs)

    # agents are sampled in index order
    return np.array([draw(agent) for agent in range(self.N)], dtype=int)

In [None]:
#| hide
# run nbdev's export step for this project
import nbdev; nbdev.nbdev_export()