# In [4]:
import numpy as np
import matplotlib.pyplot as plt

# Grid World environment definition
# S: start state
# G: goal state
# #: obstacle
# -: free path
# A: action (any cell other than '#' is treated as walkable)
#
# NOTE(review): 'G' at (row 5, col 8) is enclosed by '#' on all four sides,
# so it cannot be reached from 'S' with up/down/left/right moves. Confirm the
# intended layout; the map is left exactly as authored.
grid_world = np.array([
    ['#', '#', '#', '#', '#', '#', '#', '#', '#', '#'],
    ['#', 'S', '#', '#', '#', '#', '#', '#', '#', '#'],
    ['#', '-', '-', '-', '-', '-', '-', '#', '#', '#'],
    ['#', '#', '#', '#', '#', '#', '-', '#', '#', '#'],
    ['#', '#', '#', '#', '#', '#', 'A', '#', '#', '#'],
    ['#', '#', '#', '#', '#', '#', '#', '#', 'G', '#'],
    ['#', '#', '#', '#', '#', '#', '#', '#', '#', '#'],
])

# Parameters
num_episodes = 500
gamma = 0.9  # discount factor
alpha = 0.1  # learning rate

# Coordinates of the S, G and A cells as plain (row, col) tuples of ints.
# Tuples (rather than the ndarrays returned by np.argwhere) compare cleanly
# with == / `in`, and index exactly one cell of a 2-D array; an ndarray index
# such as arr[np.array([r, c])] would select two whole rows instead.
start_state = tuple(int(i) for i in np.argwhere(grid_world == 'S')[0])
goal_state = tuple(int(i) for i in np.argwhere(grid_world == 'G')[0])
action_state = tuple(int(i) for i in np.argwhere(grid_world == 'A')[0])

# Pick a random action (movement)
def choose_action():
    """Sample one of the four movement names with equal probability.

    Returns:
        One of 'up', 'down', 'left' or 'right', as drawn by
        ``np.random.choice`` from NumPy's global random state.
    """
    moves = ['up', 'down', 'left', 'right']
    return np.random.choice(moves)

# Run a single episode
def run_episode(max_steps=1000):
    """Simulate one episode from the start state under the random policy.

    The walk stops when the goal is reached or after ``max_steps`` moves,
    whichever comes first. The cap guarantees termination even when the goal
    cannot be reached from the start on the current map; a capped episode is
    returned as-is (truncated).

    Args:
        max_steps: Maximum number of moves to simulate.

    Returns:
        A list of ``(state, action, reward)`` triples in the order visited.
        ``state`` is the (row, col) tuple the action was taken from and
        ``reward`` is the reward for the resulting state.
    """
    goal = tuple(int(i) for i in goal_state)
    # Normalise to a plain tuple so every state stored in the episode has the
    # same type and can be compared with == or used as a dict key / index.
    state = tuple(int(i) for i in start_state)
    episode = []

    for _ in range(max_steps):
        if state == goal:
            break
        action = choose_action()
        next_state = tuple(int(i) for i in take_action(state, action))
        reward = get_reward(next_state)
        episode.append((state, action, reward))
        state = next_state

    return episode

# Apply an action to a state
def take_action(state, action):
    """Return the cell reached by applying ``action`` from ``state``.

    A move is applied only if the target cell lies inside the grid and is not
    an obstacle ('#'); otherwise the agent stays where it is. Unknown action
    names also leave the position unchanged.

    Args:
        state: Current position as an indexable (row, col) pair.
        action: One of 'up', 'down', 'left', 'right'.

    Returns:
        The resulting position, always as a ``(row, col)`` tuple of ints, so
        callers never get back the input object (e.g. an ndarray) on a
        blocked move.
    """
    deltas = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }
    row, col = int(state[0]), int(state[1])
    # str() lets NumPy string scalars (as produced by np.random.choice) be
    # looked up like ordinary strings.
    d_row, d_col = deltas.get(str(action), (0, 0))
    new_row, new_col = row + d_row, col + d_col

    n_rows, n_cols = grid_world.shape
    inside = 0 <= new_row < n_rows and 0 <= new_col < n_cols
    if inside and grid_world[new_row, new_col] != '#':
        return new_row, new_col
    return row, col

# Reward lookup
def get_reward(state):
    """Return the reward for ``state``: 1 at the goal cell, 0 anywhere else.

    Args:
        state: Position as an iterable (row, col) pair.
    """
    reached_goal = tuple(state) == tuple(goal_state)
    return 1 if reached_goal else 0

# Main Monte Carlo routine
def monte_carlo():
    """Estimate state values with first-visit Monte Carlo evaluation.

    Runs ``num_episodes`` episodes via ``run_episode()``. For each episode the
    discounted return ``G`` is accumulated backwards, and the value of every
    state is updated with the return observed at its first visit, using an
    incremental mean (step size ``1 / visit count``).

    Returns:
        A float array with the same shape as ``grid_world`` holding the
        estimated value of each cell (0 for cells never visited).
    """
    state_values = np.zeros_like(grid_world, dtype=float)
    state_counts = np.zeros_like(grid_world, dtype=int)

    for _ in range(num_episodes):
        episode = run_episode()

        # Normalise every state to an (int, int) tuple: tuples index a single
        # cell and are hashable, whereas an ndarray state would index whole
        # rows and cannot be compared with `in` / ==.
        states = [tuple(int(i) for i in step[0]) for step in episode]

        # Index of the earliest occurrence of each state, computed once so the
        # first-visit test below is O(1) instead of rescanning the episode.
        first_visit = {}
        for t, state in enumerate(states):
            first_visit.setdefault(state, t)

        G = 0.0  # accumulated discounted return
        for t in range(len(episode) - 1, -1, -1):
            reward = episode[t][2]
            G = gamma * G + reward

            state = states[t]
            if first_visit[state] == t:
                state_counts[state] += 1
                state_values[state] += (G - state_values[state]) / state_counts[state]

    return state_values

# Run the Monte Carlo evaluation to obtain the state-value grid
values = monte_carlo()

# Show the value grid as a heat map with a colour scale
fig, ax = plt.subplots()
heatmap = ax.imshow(values, cmap='viridis', origin='upper')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Valores de Estado (Monte Carlo)')
plt.show()
