# In [None]:
# 3.11 Actor-Critic Algorithm- HoaDNt@fe.edu.vn
import numpy as np

class GridWorld:
    """Deterministic 3x3 grid environment with one rewarding goal cell.

    States are ``(row, col)`` tuples. Actions are integer codes:
    0 = up, 1 = down, 2 = left, 3 = right.
    """

    def __init__(self):
        """Set the fixed layout: grid shape, action count, start and goal cells."""
        self.grid_size = (3, 3)
        self.num_actions = 4  # Up, Down, Left, Right
        self.start_state = (0, 0)
        self.goal_state = (2, 2)

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        A move that would leave the grid is clamped at the border in the
        direction of travel. An action code outside 0-3 leaves the position
        unchanged. The reward is 1 when the resulting cell equals the goal
        cell and 0 otherwise.
        """
        r, c = state

        # Build the landing cell directly; only the axis being moved is clamped.
        if action == 0:  # Up
            landed = (max(0, r - 1), c)
        elif action == 1:  # Down
            landed = (min(self.grid_size[0] - 1, r + 1), c)
        elif action == 2:  # Left
            landed = (r, max(0, c - 1))
        elif action == 3:  # Right
            landed = (r, min(self.grid_size[1] - 1, c + 1))
        else:  # Unrecognised action code: stay in place
            landed = (r, c)

        # +1 only for ending the step on the goal cell.
        payoff = 1 if landed == self.goal_state else 0
        return landed, payoff

class ActorCritic:
    """Tabular one-step actor-critic agent over ``(row, col)`` grid states.

    The actor stores one preference per ``(row, col, action)`` and samples
    actions from a softmax over the preferences of the current cell. The
    critic stores one value estimate per ``(row, col)``. Both tables start
    at zero.
    """

    def __init__(self, num_actions, alpha_actor, alpha_critic, gamma, grid_size=(3, 3)):
        """Create the agent and allocate its tables.

        Args:
            num_actions: Number of discrete actions available in every state.
            alpha_actor: Step size for actor preference updates.
            alpha_critic: Step size for critic value updates.
            gamma: Discount factor used in the TD error.
            grid_size: ``(rows, cols)`` of the state space. Defaults to
                ``(3, 3)``, the shape that used to be hard-coded.
        """
        rows, cols = grid_size
        self.num_actions = num_actions
        self.alpha_actor = alpha_actor
        self.alpha_critic = alpha_critic
        self.gamma = gamma
        self.actor_params = np.zeros((rows, cols, num_actions))  # Tabular actor parameters
        self.critic_values = np.zeros((rows, cols))  # Tabular critic values

    def select_action(self, state):
        """Sample an action index for ``state`` from the softmax policy.

        Args:
            state: ``(row, col)`` pair; any two-element sequence is accepted.

        Returns:
            The sampled action index as returned by ``np.random.choice``.
        """
        # Unpack explicitly so a list state is not treated as a fancy index.
        row, col = state
        action_probs = self.softmax(self.actor_params[row, col])
        return np.random.choice(self.num_actions, p=action_probs)

    def update(self, state, action, reward, next_state):
        """Apply one TD(0) update to the critic and the actor.

        The TD error ``reward + gamma * V(next_state) - V(state)`` is computed
        from the critic values *before* either table is modified. It then
        moves ``V(state)`` by ``alpha_critic * td_error`` and the preference
        of the taken action by ``alpha_actor * td_error``. Only the taken
        action's preference is adjusted. The method takes no terminal flag,
        so ``V(next_state)`` is always bootstrapped.

        Args:
            state: ``(row, col)`` the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: ``(row, col)`` reached after the action.
        """
        row, col = state
        next_row, next_col = next_state

        # Compute TD error (advantage)
        td_error = (
            reward
            + self.gamma * self.critic_values[next_row, next_col]
            - self.critic_values[row, col]
        )

        # Update critic values
        self.critic_values[row, col] += self.alpha_critic * td_error

        # Update actor parameters (single-element write, always in place)
        self.actor_params[row, col, action] += self.alpha_actor * td_error

    def softmax(self, x):
        """Return the softmax of ``x``, normalised along axis 0.

        The maximum of ``x`` is subtracted before exponentiation so large
        preferences do not overflow ``np.exp``; this does not change the
        result.
        """
        x = np.asarray(x, dtype=float)
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

# Environment the agent interacts with
grid_world = GridWorld()

# Agent hyperparameters
num_actions = 4  # Up, Down, Left, Right
gamma = 0.9
alpha_actor = 0.1
alpha_critic = 0.1

# Actor-Critic agent built from the hyperparameters above
actor_critic_agent = ActorCritic(
    num_actions=num_actions,
    alpha_actor=alpha_actor,
    alpha_critic=alpha_critic,
    gamma=gamma,
)

# Training: every episode starts at the start cell and runs until the goal
# cell is reached, updating the agent after each transition.
num_episodes = 1000
for _ in range(num_episodes):
    state = grid_world.start_state
    while state != grid_world.goal_state:
        action = actor_critic_agent.select_action(state)
        next_state, reward = grid_world.step(state, action)
        actor_critic_agent.update(state, action, reward, next_state)
        state = next_state

# Evaluation: roll out one episode with the learned (still sampled) policy,
# without further updates, and accumulate the rewards collected on the way.
state = grid_world.start_state
total_reward = 0
while state != grid_world.goal_state:
    action = actor_critic_agent.select_action(state)
    next_state, reward = grid_world.step(state, action)
    total_reward += reward
    state = next_state

print("Total reward obtained by learned policy:", total_reward)
