In [2]:
import numpy as np

from system import (
    mdp,
    reward,
    states,
    num_states,
    num_actions,
    to_idx,
    get_valid_actions,
)
from model import Model
from policy import Policy, PolicyInit


In [3]:
def EvaluatePolicy(
    policy: Policy,
    valf: np.ndarray,
    model: Model,
    thresh=0.01,
    gamma=1.0,
    max_sweeps=1000,
):
    """Iteratively evaluate ``policy``, updating ``valf`` in place.

    Repeatedly sweeps over every state in the module-level ``states`` and
    replaces its value with the one-step backup under the policy's action:
    ``model.prob(s, a) @ (model.get_reward(s) + gamma * valf)``.
    Because ``valf`` is written during the sweep, later states in the same
    sweep already see the refreshed values of earlier states.

    Args:
        policy: Supplies the action for each state via ``get_action(s)``.
        valf: 1-D array of state values indexed by ``to_idx(s)``.
            It is MODIFIED IN PLACE; pass a copy to keep the input intact.
        model: Supplies ``prob(s, a)`` and ``get_reward(s)``.
            NOTE(review): ``prob(s, a)`` is presumably a probability vector
            over next states with the same length as ``valf`` - confirm.
        thresh: Sweeping stops once the largest absolute value change in a
            sweep is no greater than this threshold.
        gamma: Discount factor applied to ``valf`` in the backup.
        max_sweeps: Upper bound on the number of sweeps, so the loop ends
            even if the values never settle below ``thresh``.

    Returns:
        The same ``valf`` array object that was passed in.
    """
    delta = thresh + 1  # guarantees the first sweep is attempted
    sweeps = 0
    while delta > thresh and sweeps < max_sweeps:
        delta = 0
        for s in states:
            idx = to_idx(s)  # computed once per state and reused below
            old_value = valf[idx]
            a = policy.get_action(s)

            valf[idx] = model.prob(s, a) @ (model.get_reward(s) + gamma * valf)

            # Track the largest change seen in this sweep.
            delta = max(delta, abs(old_value - valf[idx]))
        sweeps += 1
    return valf

In [4]:
def UpdatePolicy(policy: Policy, valf: np.ndarray, model: Model, gamma=1.0):
    """Greedily improve ``policy`` with respect to ``valf``.

    For every state in the module-level ``states``, scores each valid action
    with ``model.prob(s, a) @ (model.get_reward(s) + gamma * valf)`` and
    stores the best-scoring action in ``policy`` via ``set_action``.
    Ties go to the earliest action in the order returned by
    ``get_valid_actions(s, idx=True)``, since ``np.argmax`` returns the first
    maximum.

    Args:
        policy: Policy object; mutated in place through ``set_action``.
        valf: Array of state values used in the one-step lookahead.
        model: Supplies ``prob(s, a)`` and ``get_reward(s)``.
        gamma: Discount factor applied to ``valf``.

    Returns:
        True if no state's action changed, False otherwise.

    Side effects:
        Prints how many states had their action changed.
    """
    policy_stable = True
    changed = 0
    for s in states:
        old_action = policy.get_action(s)

        # Fetch the valid actions once; the list is needed both for scoring
        # and for mapping the argmax position back to an action.
        valid_actions = get_valid_actions(s, idx=True)

        # This term does not depend on the action, so build it once per state.
        backup = model.get_reward(s) + gamma * valf
        action_values = [model.prob(s, a) @ backup for a in valid_actions]

        best_action = valid_actions[np.argmax(action_values)]
        policy.set_action(s, best_action)

        # Compare against what the policy reports after the update.
        if old_action != policy.get_action(s):
            changed += 1
            policy_stable = False
    print(f"Policy changed for {changed} states")
    return policy_stable



In [5]:
# Initialize model
# Built from the `mdp` and `reward` objects imported from `system`.
# NOTE(review): presumably `mdp` carries the transition dynamics and `reward`
# the reward definition - confirm against `model.Model`.
model = Model(mdp, reward)

# Initialize policy
# Sized by `num_states` and `num_actions` and created with the RANDOM
# initialisation mode of `PolicyInit`.
policy = Policy(num_states, num_actions, PolicyInit.RANDOM)

In [6]:
# Assign every state an action produced by the policy's own generator.
# NOTE(review): `gen_action_idx` presumably returns an action index that is
# valid for the given state - confirm in `policy.Policy`.
for state in states:
    initial_action = policy.gen_action_idx(state)
    policy.set_action(state, initial_action)

In [7]:
# Initialize a random state-value function
valf = np.random.uniform(0, 2, size=num_states)

# History of value functions, one snapshot per policy-evaluation round.
# EvaluatePolicy updates `valf` in place and returns the same array, so each
# entry must be a copy; otherwise every entry would alias one array and the
# whole history would show only the final values.
valf_store = [valf.copy()]

thresh = 0.01
gamma = 0.9
max_iterations = 1000  # safety cap in case the policy never stabilises

policy_stable = False
i = 0
while (not policy_stable) and i < max_iterations:
    print(f"Policy Iteration {i + 1}")

    # Policy Evaluation
    print("Evaluating Policy")
    valf = EvaluatePolicy(policy, valf, model, thresh, gamma)
    valf_store.append(valf.copy())

    # Policy Improvement
    print("Updating Policy")
    policy_stable = UpdatePolicy(policy, valf, model, gamma)
    i += 1


Policy Iteration 1
Evaluating Policy
Updating Policy
Policy changed for 59 states
Policy Iteration 2
Evaluating Policy
Updating Policy
Policy changed for 39 states
Policy Iteration 3
Evaluating Policy
Updating Policy
Policy changed for 29 states
Policy Iteration 4
Evaluating Policy
Updating Policy
Policy changed for 18 states
Policy Iteration 5
Evaluating Policy
Updating Policy
Policy changed for 18 states
Policy Iteration 6
Evaluating Policy
Updating Policy
Policy changed for 12 states
Policy Iteration 7
Evaluating Policy
Updating Policy
Policy changed for 8 states
Policy Iteration 8
Evaluating Policy
Updating Policy
Policy changed for 4 states
Policy Iteration 9
Evaluating Policy
Updating Policy
Policy changed for 1 states
Policy Iteration 10
Evaluating Policy
Updating Policy
Policy changed for 0 states


In [11]:
# Run agent
# Roll out the current policy from a fixed start state, printing every
# transition and accumulating the rewards into `score`.
max_steps = 1000  # bound the rollout so a cycling policy cannot loop forever
score = 0
steps = 0  # number of non-terminal transitions taken so far
s = np.array([3, 5])

path = [s]

while steps < max_steps:
    a = policy.get_action(s)
    print(f"State: {s} Action: {a}")

    s_ = model.gen_next(s, a)
    path.append(s_)
    print(f"Next State: {s_}")

    # Use a dedicated name: rebinding `reward` would clobber the `reward`
    # object imported from `system`, which the model-construction cell needs
    # if it is ever re-run.
    step_reward = model.get_reward(s, s_)
    score += step_reward
    print(f"Reward: {step_reward}")

    # Any reward other than the per-step -1 ends the episode.
    if step_reward != -1:
        print(f"Game Over - Score: {score}")
        break

    s = s_
    steps += 1
else:
    # Reached only when the step budget runs out without a terminal reward.
    print(f"Stopped after {max_steps} steps without finishing - Score: {score}")

State: [3 5] Action: 7
Next State: [3 6]
Reward: -1
State: [3 6] Action: 7
Next State: [0 7]
Reward: -1
State: [0 7] Action: 7
Next State: [1 8]
Reward: -1
State: [1 8] Action: 7
Next State: [2 9]
Reward: -1
State: [2 9] Action: 6
Next State: [3 9]
Reward: -1
State: [3 9] Action: 6
Next State: [4 9]
Reward: -1
State: [4 9] Action: 6
Next State: [5 9]
Reward: -1
State: [5 9] Action: 5
Next State: [5 8]
Reward: -1
State: [5 8] Action: 5
Next State: [3 7]
Reward: 10
Game Over - Score: 2
