In [3]:
import numpy as np

# Discount factor and convergence threshold used by value iteration
gamma = 0.9
convergent = 0.0001

# Moves available in every cell as (row, column) offsets:
# up, down, left, right
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

# Grid world with 3 rows and 4 columns
grid_size = (3, 4)

# Reward grid: +1 at cell (0, 3), -1 at cell (1, 3), zero everywhere else
rewards = np.zeros(grid_size)
rewards[[0, 1], [3, 3]] = [1, -1]

# Value estimates start at zero for every cell
value_function = np.zeros(grid_size)

# Defining the terminal positions
def terminalPositions(state):
    """Return True when ``state`` is one of the two terminal cells.

    The terminal cells are (0, 3) and (1, 3); any other ``(row, col)``
    tuple yields False.
    """
    terminal_cells = ((0, 3), (1, 3))
    return state in terminal_cells

# Function to get next state
def get_next_state(state, action, shape=None):
    """Return the cell reached by applying ``action`` to ``state``.

    A move that would leave the grid is ignored, so the agent stays in the
    same cell.

    Args:
        state: Current ``(row, col)`` position.
        action: ``(d_row, d_col)`` offset to add to ``state``.
        shape: Optional ``(n_rows, n_cols)`` grid dimensions. When omitted,
            the module-level ``grid_size`` is used, so existing two-argument
            calls behave exactly as before.

    Returns:
        The new ``(row, col)`` tuple if it lies inside the grid, otherwise
        ``state`` unchanged.
    """
    # Resolve the default at call time (not definition time) so the function
    # always sees the current value of grid_size.
    dims = grid_size if shape is None else shape

    row = state[0] + action[0]
    col = state[1] + action[1]

    # Check if the next state is within the grid boundaries
    if 0 <= row < dims[0] and 0 <= col < dims[1]:
        return (row, col)

    # Out of bounds: stay in the same state
    return state

# Value iteration: repeatedly back up every non-terminal cell until the
# largest change in a single sweep drops below `convergent`.
while True:
    # Work on a copy so every backup in this sweep reads the values from the
    # previous sweep; terminal cells are skipped and keep their value.
    new_value_function = value_function.copy()

    # Visit every (row, col) cell of the grid in row-major order
    for state in np.ndindex(*grid_size):
        if terminalPositions(state):
            continue

        # Bellman backup: for each move take the reward of the cell reached
        # plus its discounted value, and keep the best of the four moves.
        new_value_function[state] = max(
            rewards[target] + gamma * value_function[target]
            for target in (get_next_state(state, move) for move in actions)
        )

    # Largest absolute change of any cell during this sweep
    delta = np.abs(new_value_function - value_function).max()

    # Updating the value function
    value_function = new_value_function

    if delta < convergent:
        break

# Printing the final value function
print("Final Value Function:" )
print(value_function)


Final Value Function:
[[0.81   0.9    1.     0.    ]
 [0.729  0.81   0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]
