In [6]:
import gym
import numpy as np
import copy

In [7]:
seed = 0
np.random.seed(seed)
env = gym.make('FrozenLake-v1', desc=None, map_name="8x8", is_slippery=False,render_mode='human')
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample())
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())
print("Reward range", env.reward_range)
print('Meta Data',env.metadata)
print(env.P[1][2])
# p, s`, r, done
print(env.P[1])
# a: p, s`, r, done

_____OBSERVATION SPACE_____ 

Observation Space Discrete(64)
Sample observation 59

 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 0
Reward range (0, 1)
Meta Data {'render_modes': ['human', 'ansi', 'rgb_array'], 'render_fps': 4}
[(1.0, 2, 0.0, False)]
{0: [(1.0, 0, 0.0, False)], 1: [(1.0, 9, 0.0, False)], 2: [(1.0, 2, 0.0, False)], 3: [(1.0, 1, 0.0, False)]}


POLICY ITERATION

In [5]:
def policy_evaluation(policy, gamma,theta, init_V = None):
    if init_V is None:
        init_V = np.zeros(env.observation_space.n)
    V = np.copy(init_V)
    delta = np.inf
    while delta > theta:
        delta =0
        for state in range(env.observation_space.n):
            v = V[state]
            action = policy[state]
            V[state] = sum([trans_prob * (reward_prob + gamma * V[next_state]) 
                        for trans_prob, next_state, reward_prob, _ in env.P[state][action]])
            delta = max(delta,abs(v-V[state]))
    return V

def policy_improvement(policy,V, gamma):
    policy_stable = True
    policy = np.copy(policy)
    for state in range(env.observation_space.n):
        old_action = policy[state]
        Q_table = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]: 
                trans_prob, next_state, reward_prob, _ = next_sr 
                Q_table[action] += (trans_prob * (reward_prob + gamma * V[next_state]))
        new_action = np.argmax(Q_table)
        policy[state] = new_action
        if old_action != new_action:
            policy_stable = False
    return policy,policy_stable

def policy_iteration(init_policy = None,gamma = 0.9,theta = 1e-5):
    if init_policy is None:
        init_policy = np.zeros(env.observation_space.n)
    policy = np.copy(init_policy)
    policy_stable = False
    i = 0
    while not policy_stable:
        print('Loop -->',i)
        i+=1
        V = policy_evaluation(policy, gamma, theta)
        policy,policy_stable = policy_improvement(policy, V, gamma)
    print(f'Policy converged in {i} steps')
    policy = policy.astype(int)
    return policy, V


In [6]:
policy, V = policy_iteration()

observation,_ = env.reset()
for _ in range(1000):
    action = policy[observation]
    # env.render()
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation,_ = env.reset()
env.close()

Loop --> 0
Loop --> 1
Loop --> 2
Loop --> 3
Loop --> 4
Loop --> 5
Loop --> 6
Loop --> 7
Loop --> 8
Loop --> 9
Loop --> 10
Loop --> 11
Loop --> 12
Loop --> 13
Loop --> 14
Policy converged in 15 steps


AttributeError: module 'numpy' has no attribute 'bool8'

VALUE ITERATION

In [None]:
def value_iteration(gamma = 0.9,theta = 1e-5,max_iterations=1000,init_policy = None,init_V = None):
    if init_V is None:
        init_V = np.zeros(env.observation_space.n)
    V = np.copy(init_V)
    if init_policy is None:
        init_policy = np.zeros(env.observation_space.n)
    policy = np.copy(init_policy)
    delta = np.inf
    i=0
    while delta > theta and i<max_iterations:
        print('Loop -->',i)
        i+=1
        delta =0
        for state in range(env.observation_space.n):
            v = V[state]
            Q_table = np.zeros(env.action_space.n)
            for action in range(env.action_space.n):
                for next_sr in env.P[state][action]: 
                    trans_prob, next_state, reward_prob, _ = next_sr 
                    Q_table[action] += (trans_prob * (reward_prob + gamma * V[next_state]))
            new_action = np.argmax(Q_table)
            V[state] = Q_table[new_action]
            policy[state] = new_action
            delta = max(delta,abs(v-V[state]))
    print(f'Policy converged in {i} steps')
    policy = policy.astype(int)
    return  policy, V


policy, V = value_iteration()


observation,_ = env.reset()
for _ in range(1000):
    action = policy[observation]
    # env.render()
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation,_ = env.reset()
env.close()

Loop --> 0
Loop --> 1
Loop --> 2
Loop --> 3
Loop --> 4
Loop --> 5
Loop --> 6
Loop --> 7
Loop --> 8
Loop --> 9
Loop --> 10
Loop --> 11
Loop --> 12
Loop --> 13
Loop --> 14
Policy converged in 15 steps


AttributeError: module 'numpy' has no attribute 'bool8'

: 