### MS&E 346 Assignment 3
#### January 18

In [32]:
import numpy as np
from scipy.linalg import eig
import copy
import math

#### Question 1:
Write code for Policy Evaluation (tabular) algorithm
#### Answer:

Problem: evaluate a given policy ${\pi}$  
Solution: iterative application of the Bellman expectation backup  
Using synchronous backups:
* At each iteration k + 1
* For all states ${s \in S}$, update ${v_{k+1}(s)}$ from ${v_{k}(s')}$
* where ${s'}$ is a successor state of s

In matrix form: $${v_{k+1} = R^{\pi} + \gamma P^{\pi} v_{k}}$$


In [8]:
# When the tabular states/policies are represented as matrices, the update can be written as follows.
# Another implementation of policy evaluation, based on the MDP definition, is used later.
class MRP:
    """Tabular Markov reward process evaluated by iterative Bellman backups.

    Attributes:
        reward: per-state reward; expected to be a column vector so that
            ``transition @ value`` has the same shape.
        transition: square state-to-state transition probability matrix.
        value: current state-value estimate (overwritten by
            ``iterative_policy_evaluation``).
        gamma: discount factor.
    """

    def __init__(self, reward, transition, value, gamma):
        """
        In the Markovian reward process, the reward is a function of state
        """
        self.reward = reward
        self.transition = transition
        self.value = value
        self.gamma = gamma

    def iterative_policy_evaluation(self, tol=1e-8):
        """Iterative policy evaluation.

        Starts from an all-zero value function and repeatedly applies the
        synchronous backup ``v <- reward + gamma * transition @ v`` until the
        L1 distance between two successive estimates drops below ``tol``.
        The result is stored in ``self.value``.

        Args:
            tol: convergence threshold on the L1 norm of the update.

        Note:
            The loop only stops once the update is small enough, so
            ``gamma`` should be below 1 for the iteration to converge.
        """
        reward = self.reward
        transition = self.transition
        gamma = self.gamma
        # The evaluation always restarts from zeros; the value passed to the
        # constructor is not used as a starting point.
        value = np.zeros(reward.shape)

        while True:
            # Each sweep builds a new array, so no defensive copy is needed.
            updated = reward + gamma * np.matmul(transition, value)
            converged = np.sum(np.abs(updated - value)) < tol
            value = updated
            if converged:
                break
        self.value = value

A toy Mars-Rover example: the reward vector is $${[1, 0, 0, 0, 0, 0, 10]}$$ From every interior state the rover moves to the left neighbour with 50% probability and to the right neighbour with 50% probability; the two end states are absorbing. The discount factor gamma is 0.95. The Markov reward process can be defined as follows:

In [11]:
# Per-state reward as a 7x1 column vector.
reward = np.matrix([[1, 0, 0, 0, 0, 0, 10]]).T
# Row i holds the transition probabilities out of state i: the first and last
# states loop back to themselves, every other state moves to either neighbour
# with probability 0.5.
transition = np.matrix([[1, 0, 0, 0, 0, 0, 0],
                        [0.5, 0, 0.5, 0, 0, 0, 0], 
                        [0, 0.5, 0, 0.5, 0, 0, 0], 
                        [0, 0, 0.5, 0, 0.5, 0, 0], 
                        [0, 0, 0, 0.5, 0, 0.5, 0], 
                        [0, 0, 0, 0, 0.5, 0, 0.5], 
                        [0, 0, 0, 0, 0, 0, 1]])
# Random initial value estimate with the same shape as `reward`
# (iterative_policy_evaluation restarts from zeros regardless).
value = np.random.rand(reward.shape[0], reward.shape[1])
gamma = 0.95  # discount factor
mrp_process = MRP(reward, transition, value, gamma)

In [12]:
# Run the evaluation, then show the resulting per-state values.
mrp_process.iterative_policy_evaluation()
print(mrp_process.value)

[[ 20.        ]
 [ 33.5282314 ]
 [ 50.58575033]
 [ 72.96808508]
 [103.0312709 ]
 [143.93985365]
 [199.99999995]]


#### Question 2:
Write code for Policy Iteration (tabular) algorithm
#### Answer:

Policy iteration alternates policy evaluation (Question 1) with policy improvement.  
Policy improvement:
Consider a deterministic policy: ${a = \pi(s)}$, we can improve the policy by acting greedily, $${\pi'(s) = \arg\max_{a \in A} q_\pi (s, a) }$$  
This improves the value from any state s over one time step.

In [70]:
# Consider a Markov Decision Process where the policy and reward are deterministic
class MDP:
    
    """Iterative policy evaluation"""
    def iterative_policy_evaluation(self):
        # initiate the value of the state to zeros
        reward = self.reward
        transition = self.transition
        value = self.value
        gamma = self.gamma
        value = np.zeros(self.reward.shape)
        
        while True:
            # iteratively update the value function of each state until the value of state no longer changes
            old_v = copy.deepcopy(value)
            value = reward + gamma * np.matmul(transition, value)
            if np.sum(np.abs(value - old_v)) < 1e-8:
                # use L1 norm to measure the distance
                break
        self.value = value
        
    def construct_policy_set(self, policy):
        # mapping a policy to a integer and process it later
        policy_set = {}
        for key, value in policy.items():
            policy_set[key] = value
        return policy_set
    
    def construct_policy_matrix(self):
        self.policy_matrix = np.random.randint(len(self.policy_set), size = self.reward.shape)
    
    def argmax_policy(self, policy_matrix, i):
        # This function is left abstract and when initiate a MDP object, this should be overwritten by an implementation
        best_policy = -1
        best_v = - math.inf
        for key, value in self.policy_set.items():
            curt_value = 0
            if key == 0:
                if i > 0:
                    curt_value = self.reward[i - 1] + self.value[i - 1]
                else:
                    # absorbing state
                    curt_value = self.reward[i] + self.value[i]
            elif key == 1:
                if i < policy_matrix.shape[0] - 1:
                    curt_value = self.reward[i + 1] + self.value[i + 1]
                else:
                    curt_value = self.reward[i] + self.value[i]
            else:
                if i != 0 and i < policy_matrix.shape[0] - 1:
                    curt_value = 0.5 * (self.reward[i - 1] + self.value[i - 1]) + 0.5 * (self.reward[i + 1] + self.value[i + 1])
                else:
                    curt_value = self.reward[i] + self.value[i]
            if best_v <= curt_value:
                best_v = curt_value
                best_policy = key
        if best_v != -math.inf:
            self.value[i] = best_v
            policy_matrix[i] = best_policy
        return policy_matrix[i]
    
    """Policy Iteration"""
    def policy_iteration(self):
        # initialize the policy matrix with a random policy
        policy_matrix = np.random.randint(len(self.policy_set), size = self.reward.shape)
        print('policy matrix', policy_matrix)
        value = np.zeros(self.reward.shape)
        while True:
            old_policy = copy.deepcopy(policy_matrix)
            for i in range(policy_matrix.shape[0]):
                policy_matrix[i] = self.argmax_policy(policy_matrix, i)
            # compare the result from the update
            if (np.sum(np.abs(old_policy - policy_matrix)) == 0):
                break
        self.policy_matrix = copy.deepcopy(policy_matrix)
            
        
    def __init__(self, reward, transition, value, gamma, policy):
        self.reward = reward
        self.transition = transition
        self.value = value
        self.gamma = gamma
        self.policy = policy
        self.policy_set = self.construct_policy_set(policy)
        # initialize the policy randomly
        self.policy_matrix = self.construct_policy_matrix()

Consider a Mars-Rover example similar to the one above.
In this toy example the reward vector is $${[1, -1, -1, -1, -1, -1, 10]}$$ The baseline transition matrix again gives each state a 50% chance of going to the left state and a 50% chance of going to the right state, and gamma is 0.95. This time, we have 3 policies to choose from in each state, namely: walk to the left for sure, walk to the right for sure, or walk to the left/right with 50% probability each.

The Markov decision process can be defined as follows:

In [71]:
# Per-state reward as a 7x1 column vector.
reward = np.matrix([[1, -1, -1, -1, -1, -1, 10]]).T
# Row i holds the transition probabilities out of state i: the first and last
# states loop back to themselves, every other state moves to either neighbour
# with probability 0.5.
transition = np.matrix([[1, 0, 0, 0, 0, 0, 0],
                        [0.5, 0, 0.5, 0, 0, 0, 0], 
                        [0, 0.5, 0, 0.5, 0, 0, 0], 
                        [0, 0, 0.5, 0, 0.5, 0, 0], 
                        [0, 0, 0, 0.5, 0, 0.5, 0], 
                        [0, 0, 0, 0, 0.5, 0, 0.5], 
                        [0, 0, 0, 0, 0, 0, 1]])
# Random initial value estimate with the same shape as `reward`.
value = np.random.rand(reward.shape[0], reward.shape[1])
gamma = 0.95  # discount factor
# Integer action key -> human-readable description.
policy = {0:'To left', 1:'To right', 2:'50% go to left, 50% go to right'}
mdp_process = MDP(reward, transition, value, gamma, policy)

In [72]:
# Show the action-key -> description mapping stored on the MDP.
mdp_process.policy_set

{0: 'To left', 1: 'To right', 2: '50% go to left, 50% go to right'}

In [73]:
# Run policy iteration; the method prints the random starting policy.
mdp_process.policy_iteration()

policy matrix [[1]
 [1]
 [2]
 [1]
 [2]
 [0]
 [2]]


In [74]:
# Show the action key selected for each state by policy iteration.
mdp_process.policy_matrix

array([[2],
       [0],
       [1],
       [1],
       [1],
       [1],
       [2]])

#### Question 3:
Write code for Value Iteration (tabular) algorithm
#### Answer:

Problem: find the optimal policy ${\pi}$  
Solution: iterative application of the Bellman optimality backup  
Using synchronous backups:
* At each iteration k + 1, for all states ${s \in S}$
* Update ${v_{k+1}(s)}$ from ${v_k(s')}$
$${v_{k + 1} = \max_{a \in A}(R^a + \gamma P^a v_{k})}$$

In [89]:
# Consider a Markov Decision Process where the policy and reward are deterministic
class MDP:
    """Tabular MDP supporting policy evaluation, policy iteration and
    value iteration.

    Two data layouts are used by the methods of this class:

    * ``value_iteration`` expects the layout described in ``__init__``:
      ``reward`` is (n states, m actions) and ``transition`` is (n, m, n)
      with ``transition[s, a, s']`` = p(s'|s, a).
    * ``iterative_policy_evaluation``, ``argmax_policy`` and
      ``policy_iteration`` expect a one-dimensional chain: ``reward`` is a
      column vector (n, 1), ``transition`` is (n, n), and the actions are
      hard-coded by key (0 = left, 1 = right, any other key = left/right
      with probability 0.5 each; moves off the chain stay in place).
    """

    def __init__(self, reward, transition, value, gamma, policy):
        """
        By this definition, the reward is a n * m matrix with one state and one action
        the transition matrix is a n * m * n matrix, and each value of the matrix is indicating the probability
        of p(s'|s, a)
        """
        self.reward = reward
        self.transition = transition
        self.value = value
        self.gamma = gamma
        self.policy = policy
        self.policy_set = self.construct_policy_set(policy)
        # initialize the policy randomly
        self.policy_matrix = self.construct_policy_matrix()

    def iterative_policy_evaluation(self, tol=1e-8):
        """Iterative policy evaluation of the fixed chain ``self.transition``.

        Starts from zeros and applies ``v <- reward + gamma * transition @ v``
        until the L1 distance between successive estimates is below ``tol``.
        The result is stored in ``self.value``.
        """
        reward = self.reward
        transition = self.transition
        gamma = self.gamma
        value = np.zeros(reward.shape)

        while True:
            updated = reward + gamma * np.matmul(transition, value)
            converged = np.sum(np.abs(updated - value)) < tol
            value = updated
            if converged:
                break
        self.value = value

    def construct_policy_set(self, policy):
        """Return a shallow copy of the action-key -> description mapping."""
        return dict(policy)

    def construct_policy_matrix(self):
        """Draw a random action key for every entry of ``reward``, store it
        and return it.

        Keys are drawn from ``range(len(self.policy_set))``, so the policy
        keys are assumed to be the integers 0..len-1.
        """
        self.policy_matrix = np.random.randint(len(self.policy_set), size=self.reward.shape)
        # Returning the matrix lets ``__init__`` assign the result directly.
        return self.policy_matrix

    def _successors(self, action, i):
        """List the (probability, next_state) pairs of ``action`` in chain state ``i``."""
        last = self.reward.shape[0] - 1
        if action == 0:
            return [(1.0, i - 1 if i > 0 else i)]
        if action == 1:
            return [(1.0, i + 1 if i < last else i)]
        if 0 < i < last:
            return [(0.5, i - 1), (0.5, i + 1)]
        # the mixed action cannot move at either end of the chain
        return [(1.0, i)]

    def _evaluate_policy(self, policy_matrix, tol=1e-8):
        """Return the chain state values of ``policy_matrix`` as a flat array.

        Solves ``v = P_pi (reward + gamma * v)`` by synchronous sweeps from
        zeros, stopping when the L1 change drops below ``tol``.
        """
        n_states = self.reward.shape[0]
        rewards = np.asarray(self.reward, dtype=float).ravel()
        actions = np.asarray(policy_matrix).ravel()
        p_pi = np.zeros((n_states, n_states))
        for i in range(n_states):
            for prob, j in self._successors(int(actions[i]), i):
                p_pi[i, j] += prob
        expected_reward = p_pi @ rewards
        values = np.zeros(n_states)
        while True:
            updated = expected_reward + self.gamma * (p_pi @ values)
            if np.sum(np.abs(updated - values)) < tol:
                return updated
            values = updated

    def argmax_policy(self, policy_matrix, i):
        """Greedy policy improvement for chain state ``i``.

        Computes, for every action key, the expected value of
        ``reward[s'] + gamma * value[s']`` over the successor states ``s'``
        and writes the best key into ``policy_matrix[i]``.  Ties go to the
        key that appears last in ``self.policy_set``.  ``self.value`` is
        only read, never modified.

        Returns:
            ``policy_matrix[i]`` after the update.
        """
        rewards = np.asarray(self.reward, dtype=float).ravel()
        values = np.asarray(self.value, dtype=float).ravel()
        best_policy = -1
        best_v = -math.inf
        for key in self.policy_set:
            curt_value = sum(prob * (rewards[j] + self.gamma * values[j])
                             for prob, j in self._successors(key, i))
            if best_v <= curt_value:
                best_v = curt_value
                best_policy = key
        if best_v != -math.inf:
            policy_matrix[i] = best_policy
        return policy_matrix[i]

    def policy_iteration(self):
        """Policy Iteration on the chain layout.

        Starts from a random policy and alternates full policy evaluation
        with a greedy improvement sweep until the policy stops changing.
        On return ``self.policy_matrix`` holds the final policy and
        ``self.value`` the values computed by the last evaluation.
        """
        # initialize the policy matrix with a random policy
        policy_matrix = np.random.randint(len(self.policy_set), size=self.reward.shape)
        print('policy matrix', policy_matrix)
        seen = set()
        while True:
            seen.add(tuple(policy_matrix.ravel().tolist()))
            self.value = self._evaluate_policy(policy_matrix).reshape(self.reward.shape)
            for i in range(policy_matrix.shape[0]):
                policy_matrix[i] = self.argmax_policy(policy_matrix, i)
            # An unchanged policy is a fixed point; any other repeat would be
            # a cycle caused by numerical ties, so stop in both cases.
            if tuple(policy_matrix.ravel().tolist()) in seen:
                break
        self.policy_matrix = policy_matrix.copy()

    def value_iteration(self, tol=1e-8):
        """Value Iteration on the (n, m) reward / (n, m, n) transition layout.

        Starts from zeros and applies the synchronous Bellman optimality
        backup ``v[s] <- max_a (reward[s, a] + gamma * sum_s' p(s'|s, a) v[s'])``
        until the L1 distance between successive estimates is below ``tol``.
        The result is stored in ``self.value`` as an (n, 1) column vector.

        Args:
            tol: convergence threshold on the L1 norm of the update.
        """
        # Read everything from the instance so the method does not depend on
        # notebook-level globals.
        rewards = np.asarray(self.reward, dtype=float)
        transition = np.asarray(self.transition, dtype=float)
        values = np.zeros(rewards.shape[0])
        while True:
            # (n, m, n) @ (n,) -> (n, m): expected next-state value per (s, a)
            q_values = rewards + self.gamma * (transition @ values)
            updated = q_values.max(axis=1)
            converged = np.sum(np.abs(updated - values)) < tol
            values = updated
            if converged:
                break
        self.value = values.reshape(-1, 1)

In [90]:
# 2x2 reward matrix; per the MDP constructor docstring rows are states and
# columns are actions.
reward = np.matrix([[1, -1],[-1, 1]])
# transition[s, a, s'] = p(s'|s, a), following the MDP constructor docstring.
transition = np.zeros((2,2,2))
transition[0, 0, 0] = 0.7
transition[0, 0, 1] = 0.3
transition[0, 1, 0] = 0.3
transition[0, 1, 1] = 0.7
transition[1, 1, 0] = 0.4
transition[1, 1, 1] = 0.6
transition[1, 0, 0] = 0.6
transition[1, 0, 1] = 0.4
# Random initial value estimate with the same shape as `reward`.
value = np.random.rand(reward.shape[0], reward.shape[1])
gamma = 0.9  # discount factor
# NOTE(review): three policy labels are listed although `reward` has only two
# columns -- looks carried over from the chain example; confirm.
policy = {0:'To left', 1:'To right', 2:'50% go to left, 50% go to right'}
mdp_process = MDP(reward, transition, value, gamma, policy)