In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# imports

In [2]:
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.util import check
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import re
import numpy as np

# forest management

https://pymdptoolbox.readthedocs.io/en/latest/api/example.html  
A forest is managed by two actions: ‘Wait’ and ‘Cut’. An action is chosen each year with two objectives: first, to maintain an old forest for wildlife, and second, to make money by selling cut wood. Each year there is a probability p that a fire burns the forest.  
Here is how the problem is modelled. Let {0, 1 . . . S-1 } be the states of the forest, with S-1 being the oldest. Let ‘Wait’ be action 0 and ‘Cut’ be action 1. After a fire, the forest is in the youngest state, that is state 0. 

In [3]:
'''
transition probability (A × S × S) array P
           | p 1-p 0.......0  |
           | .  0 1-p 0....0  |
P[0,:,:] = | .  .  0  .       |
           | .  .        .    |
           | .  .         1-p |
           | p  0  0....0 1-p |

           | 1 0..........0 |
           | . .          . |
P[1,:,:] = | . .          . |
           | . .          . |
           | . .          . |
           | 1 0..........0 |
reward (S × A) matrix R
         |  0  |
         |  .  |
R[:,0] = |  .  |
         |  .  |
         |  0  |
         |  r1 |

         |  0  |
         |  1  |
R[:,1] = |  .  |
         |  .  |
         |  1  |
         |  r2 |
'''
# print() returns None, so the cell does not echo the diagram string above as
# its output (presumably the reason this call is here).
print()




In [4]:
# Small forest MDP: S=10 forest-age states, fire probability p=0.1 per year.
# r1 / r2 are the rewards in the oldest state for 'Wait' / 'Cut' (see diagram above).
P_forest_small, R_forest_small = forest(S=10, r1=10, r2=2, p=0.1)
# Sanity-check the arrays; presumably raises if P and R do not describe a
# valid MDP -- confirm against the hiive.mdptoolbox.util.check docs.
check(P_forest_small, R_forest_small)

In [5]:
# Large forest MDP: S=500 forest-age states, same fire probability p=0.1,
# with larger oldest-state rewards (r1=40 for 'Wait', r2=20 for 'Cut').
P_forest_large, R_forest_large = forest(S=500, r1=40, r2=20, p=0.1)
# Sanity-check the arrays; presumably raises if P and R do not describe a
# valid MDP -- confirm against the hiive.mdptoolbox.util.check docs.
check(P_forest_large, R_forest_large)

# frozen lake

https://gymnasium.farama.org/environments/toy_text/frozen_lake/  
Frozen lake involves crossing a frozen lake from Start(S) to Goal(G) without falling into any Holes(H) by walking over the Frozen(F) lake. The agent may not always move in the intended direction due to the slippery nature of the frozen lake.  
Action Space  
The agent takes a 1-element vector for actions. The action space is (dir), where dir decides direction to move in which can be:

0: LEFT

1: DOWN

2: RIGHT

3: UP

Observation Space  
The observation is a value representing the agent’s current position as current_row * ncols + current_col (where both the row and col start at 0). For example, the goal position in the 4x4 map can be calculated as follows: 3 * 4 + 3 = 15. The number of possible observations is dependent on the size of the map. For example, the 4x4 map has 16 possible observations.

Rewards  
Reward schedule:

Reach goal(G): +1

Reach hole(H): 0

Reach frozen(F): 0

Slippery world  
The agent moves in the intended direction with probability 1/3; otherwise it slips into one of the two perpendicular directions, each with probability 1/3.

For example, if action is left and is_slippery is True, then:
- P(move left)=1/3
- P(move up)=1/3
- P(move down)=1/3

In [6]:
# Sample a random 4x4 layout. No seed is passed, so the map can differ on every run.
generate_random_map(size=4, p=0.8) # p: probability that a tile is frozen

['SFFF', 'FFFF', 'HFFF', 'HFFG']

In [7]:
# Sample a random 15x15 layout (p=0.8: probability that a tile is frozen); unseeded.
generate_random_map(size=15, p=0.8)

['SFFHFHFFFHFFFFH',
 'FFFFFFHFFFFFHHF',
 'FFFFHFFHFHHFFFF',
 'FFFFFFFFFFFFHHF',
 'FFFFFFFFFFFFHFH',
 'FFFHFFHHHFFFFFH',
 'FFFFFFFHFFHFFFF',
 'FFFFFHFFFFFFFFF',
 'FHFFFFHHFFFFFFF',
 'FHFFFFFFFHHHFHF',
 'FHFFFFFFFFHFFFF',
 'FFFHFFFFFFFFFFF',
 'HFFFFFFFFHFFFFF',
 'FFFHFFFFFFFFFHH',
 'FFHFFFFFFFFFFFG']

In [8]:
# Sample a random 20x20 layout with fewer holes (p=0.9: probability that a tile is frozen); unseeded.
generate_random_map(size=20, p=0.9)

['SFFFFFFFFFFFFFFFFFFH',
 'FHFFFFFFFFFHFFFFFFFF',
 'FFHFFFFFFFFFFFFFFHFH',
 'FFFFFFFFFFFFFFFFFFFF',
 'FFHFFFFFFHFFHFFFFFFF',
 'FFFFFFFFFFFFFFFFFFFF',
 'FFFFFFFFFFFFFFHFFFHF',
 'FFFFFFFFFFFFFFFFFFFF',
 'FFHFFFFFFHFHFFFFFFFF',
 'FFFFFFFFFFFFHHFFFFFF',
 'FFFFFFFFHFFFFFFFFHFF',
 'FFFFFFFFHFFFFFFFFFHF',
 'FFFFFFFFFFFFFFFFFFFF',
 'FFFFFFFFFFFFFFHFFFFF',
 'FFFHFFFFFFFFFFFFFFFF',
 'FFHFFFFFFFFFFFFFFFFF',
 'FFFFFFFFFFFFFHFFFHFF',
 'FFFFFHFFFFFFFHFFFFFF',
 'FFFFFFFFFFFFFFFFFFFF',
 'FFHFFFFFFFFFFFFFFFHG']

In [9]:
# Fixed map layouts (S: start, F: frozen, H: hole, G: goal), one string per row.
# Hard-coded, presumably so the environments stay identical between runs,
# unlike the unseeded generate_random_map calls above.
map4 = ['SFFH', 'FFFF', 'FFFF', 'FFHG']
map15 = ['SFFFFFHHHFFHFFF', 'FFFFFFFFFHFFFFF', 'HFFFFFFFFFFFHFH', 'FFFHFFFFFFFFFFF', 
         'HFFFFHFFFFFFFFF', 'FFFFFFFFFHFHFFF', 'FFHFFFFFFFFFFFF', 'FFFFFFFFFHFFFHF', 
         'HFFFFFFFFFFFFHH', 'FHFFFFFHHFFHHFF', 'FFFHFFFFFFFFFFF', 'HHHHFFHFHFHFFHF', 
         'FHFFFFFFFFFFFFF', 'FHFFFFFFFHHFFFF', 'FHFFFFFFHFFFFFG']

In [10]:
# 4x4 FrozenLake built from the fixed map4 layout, with slippery (stochastic) moves.
lake_small = gym.make('FrozenLake-v1', desc=map4, is_slippery=True, render_mode="rgb_array")

In [11]:
# Show the grid layout. `desc` is an attribute of the base environment, so read
# it through `.unwrapped` instead of relying on wrapper attribute forwarding
# (deprecated / removed in newer gymnasium releases).
print(lake_small.unwrapped.desc.astype('str'))

[['S' 'F' 'F' 'H']
 ['F' 'F' 'F' 'F']
 ['F' 'F' 'F' 'F']
 ['F' 'F' 'H' 'G']]


In [12]:
# Number of discrete states: one per tile of the 4x4 grid.
lake_small.observation_space.n

16

In [13]:
# Number of discrete actions (LEFT, DOWN, RIGHT, UP).
lake_small.action_space.n

4

In [14]:
# The transition table has one entry per state. `P` is an attribute of the base
# environment, so read it through `.unwrapped` instead of relying on wrapper
# attribute forwarding (deprecated / removed in newer gymnasium releases).
len(lake_small.unwrapped.P)

16

In [15]:
def convert_gym(env):
    '''
    Converts the transition table of a discrete gymnasium environment to
    MDPToolbox-compatible P and R arrays.
    Modified from https://github.com/hiive/hiivemdptoolbox/blob/master/hiive/mdptoolbox/openai.py

    gymnasium env.P format:
    {state1: {action1: [(prob_to_newstate1, newstate1, reward, terminated (boolean)),
                        (prob_to_newstate2, newstate2, reward, terminated (boolean)),
                        ...],
             action2: [(prob_to_newstate1, newstate1, reward, terminated (boolean)),
                        (prob_to_newstate2, newstate2, reward, terminated (boolean)),
                        ...],
             ...
            },
     state2: ...
    }

    Parameters
    ----------
    env : object
        Environment providing ``reset()`` and, on ``env.unwrapped`` (or on
        itself when it has no ``unwrapped`` attribute), a transition table
        ``P`` in the format above plus ``action_space`` and
        ``observation_space`` objects exposing an integer ``n``.

    Returns
    -------
    P : numpy.ndarray, shape (A, S, S)
        P[a, s, s'] is the probability of moving from s to s' under action a.
    R : numpy.ndarray, shape (S, A)
        R[s, a] is the expected immediate reward of taking action a in state s.
    '''
    # Kept from the original implementation so callers observe the same side effect.
    env.reset()

    # The transition table belongs to the base environment. gymnasium wrappers
    # may not forward custom attributes such as `P`, so read everything from the
    # unwrapped env; objects without an `unwrapped` attribute are used as-is.
    base_env = getattr(env, 'unwrapped', env)
    transitions = base_env.P

    # Read the sizes from the spaces themselves rather than parsing digits out
    # of their string representation.
    n_actions = int(base_env.action_space.n)
    n_states = int(base_env.observation_space.n)

    P = np.zeros((n_actions, n_states, n_states))
    R = np.zeros((n_states, n_actions))

    for state in range(n_states):
        for action in range(n_actions):
            for prob, next_state, reward, *_ in transitions[state][action]:
                # Accumulate: several outcomes may share the same next state.
                P[action, state, next_state] += prob
                # Expected reward = sum over outcomes of probability * reward.
                R[state, action] += prob * reward
    return P, R

In [16]:
# Tiny deterministic 2x2 lake used to eyeball the output of convert_gym.
# NOTE(review): this rebinds `lake_small`, which an earlier cell bound to the
# 4x4 slippery environment built from map4 -- later cells see this 2x2 env.
lake_small = gym.make('FrozenLake-v1', desc=[['S', 'H'],
                                             ['F', 'G']], 
                      is_slippery=False, render_mode="rgb_array")
P, R = convert_gym(lake_small)

In [17]:
# Transition array indexed as P[action, state, next_state].
P

array([[[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]],

       [[0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]]])

In [18]:
# Expected immediate reward indexed as R[state, action].
R

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.]])