In [1]:
#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 4
# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTION_PROB = 0.25


def is_terminal(state):
    x, y = state
    return (x == 0 and y == 0) or (x == WORLD_SIZE - 1 and y == WORLD_SIZE - 1)


def step(state, action):
    if is_terminal(state):
        return state, 0

    next_state = (np.array(state) + action).tolist()
    x, y = next_state

    if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
        next_state = state

    reward = -1
    return next_state, reward


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

        # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')
    ax.add_table(tb)


def compute_state_value(in_place=True, discount=1.0):
    new_state_values = np.zeros((WORLD_SIZE, WORLD_SIZE))
    iteration = 0
    while True:
        if in_place:
            state_values = new_state_values
        else:
            state_values = new_state_values.copy()
        old_state_values = state_values.copy()

        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                value = 0
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    value += ACTION_PROB * (reward + discount * state_values[next_i, next_j])
                new_state_values[i, j] = value

        max_delta_value = abs(old_state_values - new_state_values).max()
        if max_delta_value < 1e-4:
            break

        iteration += 1

    return new_state_values, iteration


def figure_4_1():
    # While the author suggests using in-place iterative policy evaluation,
    # Figure 4.1 actually uses out-of-place version.
    _, asycn_iteration = compute_state_value(in_place=True)
    values, sync_iteration = compute_state_value(in_place=False)
    draw_image(np.round(values, decimals=2))
    print('In-place: {} iterations'.format(asycn_iteration))
    print('Synchronous: {} iterations'.format(sync_iteration))

    plt.savefig('figure_4_1.png')
    plt.close()


if __name__ == '__main__':
    figure_4_1()

In-place: 113 iterations
Synchronous: 172 iterations


In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_HEIGHT = 5  # Tăng chiều cao lên 5 để thêm state 15
WORLD_WIDTH = 4
ACTIONS = [np.array([0, -1]),  # left
           np.array([-1, 0]),  # up
           np.array([0, 1]),   # right
           np.array([1, 0])]   # down
ACTION_PROB = 0.25

# State 15 nằm ở vị trí (4,1)
# Terminal states: (0,0) và (3,3)
def is_terminal(state):
    x, y = state
    return (x == 0 and y == 0) or (x == 3 and y == 3)

# Hàm step mở rộng cho Bài tập 4.2
def step(state, action, scenario='a'):
    if is_terminal(state):
        return state, 0

    x, y = state

    # Xử lý đặc biệt cho state 13 (3,1)
    if scenario == 'b' and (x, y) == (3, 1) and np.array_equal(action, ACTIONS[3]):  # down
        return (4, 1), -1

    # Xử lý đặc biệt cho state 15 (4,1)
    if (x, y) == (4, 1):
        if np.array_equal(action, ACTIONS[0]):  # left
            next_state = (3, 0)
        elif np.array_equal(action, ACTIONS[1]):  # up
            next_state = (3, 1)
        elif np.array_equal(action, ACTIONS[2]):  # right
            next_state = (3, 2)
        elif np.array_equal(action, ACTIONS[3]):  # down
            next_state = (4, 1)
        return next_state, -1

    # Xử lý các state khác
    next_state = (np.array(state) + action).tolist()
    x, y = next_state

    # Kiểm tra biên
    if x < 0 or x >= WORLD_HEIGHT or y < 0 or y >= WORLD_WIDTH:
        next_state = state

    reward = -1
    return next_state, reward

def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')
    ax.add_table(tb)

def compute_state_value(scenario='a', discount=1.0):
    state_values = np.zeros((WORLD_HEIGHT, WORLD_WIDTH))
    iteration = 0
    while True:
        new_state_values = state_values.copy()
        delta = 0

        for i in range(WORLD_HEIGHT):
            for j in range(WORLD_WIDTH):
                if is_terminal((i, j)):
                    continue

                value = 0
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action, scenario)
                    value += ACTION_PROB * (reward + discount * state_values[next_i, next_j])

                new_state_values[i, j] = value
                delta = max(delta, abs(value - state_values[i, j]))

        state_values = new_state_values
        if delta < 1e-4:
            break

        iteration += 1

    return state_values, iteration

def exercise_4_2():
    # Phần (a): Không thay đổi dynamics của state 13
    print("Part (a): Original dynamics unchanged for state 13")
    values_a, iter_a = compute_state_value(scenario='a')
    print(f"Converged in {iter_a} iterations")
    print(f"v_π(15) = {values_a[4, 1]:.2f}\n")

    # Phần (b): Thay đổi dynamics của state 13
    print("Part (b): Changed dynamics for state 13")
    values_b, iter_b = compute_state_value(scenario='b')
    print(f"Converged in {iter_b} iterations")
    print(f"v_π(15) = {values_b[4, 1]:.2f}")

    # Vẽ kết quả cho phần (b)
    draw_image(np.round(values_b, decimals=2))
    plt.savefig('gridworld_4_2.png')
    plt.close()

if __name__ == '__main__':
    exercise_4_2()

Part (a): Original dynamics unchanged for state 13
Converged in 183 iterations
v_π(15) = -21.45

Part (b): Changed dynamics for state 13
Converged in 183 iterations
v_π(15) = -21.45
