In [1]:
import numpy as np

In [2]:
valid_actions = np.array([[0,1,0],[1,0,1]])

policy = np.array([[0,1,0],[0.5,0,0.5]])

rewards = np.array([[0,0,0],[0,0,10]])

In [6]:
def calculate_state_values(policy, rewards, gamma=0.9, theta=0.01):
  """Evaluate values of states for given policy and reward mappings.

  Args:
      policy (np.array): Policy giving the probability of taking each action from each state.
      rewards (np.array): Rewards corresponding to reaching the next state from each state (so r(s,s') here rather than the usual r(s,a)).
      gamma (float, optional): Discount factor. Defaults to 0.9.
      theta (float, optional): Minimum value change threshold to terminate iteration. Defaults to 0.01.

  Returns:
      values (np.array): State values for given policy and reward mappings.
  """    

  num_states, num_actions = policy.shape # 2, 3
  #values = np.zeros((num_states+1)) # Include terminal state
  values = np.array([10*gamma**3, 0.5*(10*gamma**2+10), 0])

  print(f'Beginning policy evaluation for given policy and MDP...\n')
  iteration = 1

  while True:
    print(f'Iteration: {iteration} \t Current Values: {values}')
    delta=0
    initial_values = values
    values = np.zeros_like(values)
    for state in range(num_states):
      for action in range(num_actions):
        # With probability 0.9, next state = action with corresponding reward. With probability 0.1, next state = state with no reward.
        next_state_probabilities = {action:1.0, state:0.0}
        # Often we leave this next state computation (transition dynamics) to the environment. But to use complete dynamic programming we must know 
        # the transition probablities.
        for next_state in next_state_probabilities.keys():
          # Note the expectation here is over both the policy and next state probabilities (environment dynamics)
          values[state] += policy[state][action]*next_state_probabilities[next_state]*(rewards[state][next_state]+gamma*initial_values[next_state])

      delta = max(delta, abs(initial_values[state]-values[state]))

    if delta < theta:
      print(f'\nMax difference in state value from previous iteration = {delta} which is less than threshold {theta}. Policy Evaluation terminating...\n')
      break

    iteration+=1

  print(f'Final policy state values: {values}')
  return values

In [5]:
values = calculate_state_values(policy, rewards, gamma=0.9, theta=0.01)

Beginning policy evaluation for given policy and MDP...

Iteration: 1 	 Current Values: [0. 0. 0.]
Iteration: 2 	 Current Values: [0. 5. 0.]
Iteration: 3 	 Current Values: [4.5 5.  0. ]
Iteration: 4 	 Current Values: [4.5   7.025 0.   ]
Iteration: 5 	 Current Values: [6.3225 7.025  0.    ]
Iteration: 6 	 Current Values: [6.3225   7.845125 0.      ]
Iteration: 7 	 Current Values: [7.0606125 7.845125  0.       ]
Iteration: 8 	 Current Values: [7.0606125  8.17727563 0.        ]
Iteration: 9 	 Current Values: [7.35954806 8.17727563 0.        ]
Iteration: 10 	 Current Values: [7.35954806 8.31179663 0.        ]
Iteration: 11 	 Current Values: [7.48061697 8.31179663 0.        ]
Iteration: 12 	 Current Values: [7.48061697 8.36627763 0.        ]
Iteration: 13 	 Current Values: [7.52964987 8.36627763 0.        ]
Iteration: 14 	 Current Values: [7.52964987 8.38834244 0.        ]
Iteration: 15 	 Current Values: [7.5495082  8.38834244 0.        ]

Max difference in state value from previous iterati

In [7]:
values = calculate_state_values(policy, rewards, gamma=0.9, theta=0.01)

Beginning policy evaluation for given policy and MDP...

Iteration: 1 	 Current Values: [7.29 9.05 0.  ]
Iteration: 2 	 Current Values: [8.145  8.2805 0.    ]
Iteration: 3 	 Current Values: [7.45245 8.66525 0.     ]
Iteration: 4 	 Current Values: [7.798725  8.3536025 0.       ]
Iteration: 5 	 Current Values: [7.51824225 8.50942625 0.        ]
Iteration: 6 	 Current Values: [7.65848363 8.38320901 0.        ]
Iteration: 7 	 Current Values: [7.54488811 8.44631763 0.        ]
Iteration: 8 	 Current Values: [7.60168587 8.39519965 0.        ]
Iteration: 9 	 Current Values: [7.55567969 8.42075864 0.        ]
Iteration: 10 	 Current Values: [7.57868278 8.40005586 0.        ]
Iteration: 11 	 Current Values: [7.56005027 8.41040725 0.        ]

Max difference in state value from previous iteration = 0.009316252071421616 which is less than threshold 0.01. Policy Evaluation terminating...

Final policy state values: [7.56936652 8.40202262 0.        ]


In [8]:
from datetime import datetime
datetime.now().strftime("%Y-%m-%d-%H:%M:%S")

'2023-04-11-10:54:41'