In [12]:
import gym
import numpy as np

def policy_iteration(env):
    # Initialize a random policy
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    policy = np.zeros((num_states, num_actions))
    for state in range(num_states):
        best_action = np.random.choice(np.arange(num_actions))
        policy[state, best_action] = 1.0


    while True:
        # Policy evaluation
        prev_V = np.zeros(num_states)  # Initialize prev_V
        while True:
            V = np.zeros(num_states)
            for state in range(num_states):
                for action, action_prob in enumerate(policy[state]):
                    for transition_prob, next_state, reward, _ in env.P[state][action]:
                        V[state] += action_prob * transition_prob * (reward + gamma * prev_V[next_state])
            # Check for convergence
            if np.max(np.abs(V - prev_V)) < 1e-6:
                break
            prev_V = np.copy(V)

        # Policy improvement
        policy_stable = True
        for state in range(num_states):
            old_action = np.argmax(policy[state])
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for transition_prob, next_state, reward, _ in env.P[state][action]:
                    q_values[action] += transition_prob * (reward + gamma * prev_V[next_state])
            best_action = np.argmax(q_values)
            if old_action != best_action:
                policy_stable = False
            policy[state] = np.eye(num_actions)[best_action]

        # Check for convergence
        if policy_stable:
            break

    return policy

# Create the environment
env = gym.make('FrozenLake-v1')
gamma = 0.9  # discount factor

# Run policy iteration
optimal_policy = policy_iteration(env)
print("Optimal Policy:")
print(optimal_policy)


Optimal Policy:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


### Import Packages

In [1]:
import numpy as np
import gym

### Create enviornment

In [2]:
env=gym.make("FrozenLake-v1")
obs=env.observation_space.n
act=env.action_space.n

### Policy Intialization

In [4]:
V=np.zeros(obs)
num_states=env.observation_space.n
num_actions=env.action_space.n
policy=np.ones((num_states,num_actions))
for state in range(num_states):
    best_action=np.random.choice(np.arange(num_actions))
    policy[state,best_action]=1
policy
    

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

### Policy Evaluation

In [7]:
prev_v=np.zeros(num_states)
V=np.zeros(num_states)
for state in range(num_states):
    for action,action_prob in enumerate(policy[state]):
        for trans_prob,nxt_state,reward,done in env.P[state][action]:
            V[state]+=trans_prob*action_prob*(reward+gamma*prev_v[nxt_state])
                    
    if np.max(np.abs(V-prev_v))<1e-6:
        break
    prev_v=np.copy(V)
            

### Policy Improvement

In [13]:
policy_stable=True
for state in range(num_states):
    old_action=np.argmax(policy[state])
    q_values=np.zeros(num_actions)
    for action in range(num_actions):
        for trans_prob,nxt_state,reward,done in env.P[state][action]:
            q_values[action] += trans_prob * (reward + gamma * prev_v[nxt_state])
    best_action=np.argmax(q_values)   
    if old_action != best_action:
        policy_stable = False
    policy[state] = np.eye(num_actions)[best_action]
    
    
    if policy_stable:
        break
    
print(policy)

    

[[1. 0. 0. 0.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
