In [11]:
# import gym
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import numpy as np
from mdptoolbox import mdp

# Generate a 16x16 lake layout; the fixed seed makes the map reproducible
# and p is the per-tile probability of being frozen.
lake_map = generate_random_map(size=16, p=0.85, seed=26)

# Slippery FrozenLake on that map, configured to render into RGB arrays.
env = gym.make(
    "FrozenLake-v1",
    desc=lake_map,
    is_slippery=True,
    render_mode="rgb_array",
)

# Put the environment into its initial state before using it.
env.reset()

# Ask for a frame of the current state. In "rgb_array" mode the frame is
# the return value of render(); it is not stored or displayed here.
env.render()

# Build the tabular MDP model in the layout passed to mdptoolbox below:
#   P[a, s, s'] - probability of reaching s' from s under action a
#   R[s, a]     - expected immediate reward for taking action a in s
n_states = env.observation_space.n
n_actions = env.action_space.n
P = np.zeros([n_actions, n_states, n_states])
R = np.zeros([n_states, n_actions])

# gym.make() returns the environment wrapped in helper wrappers. The
# transition table is an attribute of the base environment, and recent
# Gymnasium releases no longer forward unknown attributes through wrappers,
# so read it from `unwrapped` instead of `env.P`.
transition_table = env.unwrapped.P

for state in range(n_states):
    for action in range(n_actions):
        outcomes = transition_table[state][action]
        for trans_prob, next_state, reward, _done in outcomes:
            # `+=` because more than one outcome may share a next_state.
            P[action, state, next_state] += trans_prob
            # Every outcome contributes to the expected reward, terminal
            # or not; zero-reward outcomes simply add nothing.
            R[state, action] += reward * trans_prob

# Rescale each (action, state) row so it sums to exactly 1 and is not
# affected by floating-point drift from the accumulation above.
P /= P.sum(axis=2, keepdims=True)


# Solve the MDP with value iteration, using a discount factor of 0.9.
vi = mdp.ValueIteration(P, R, discount=0.9)
vi.run()

# After run(), the solver exposes the resulting policy as `vi.policy`
# (presumably one action index per state - confirm against mdptoolbox docs).
optimal_policy = vi.policy
print(optimal_policy)


(1, 2, 2, 2, 1, 2, 1, 1, 3, 2, 2, 1, 0, 2, 2, 0)
