In [None]:
import numpy as np
import random

class PuzzleEnvironment:
    """Grid puzzle in which every action toggles one cell between 0 and 1.

    The board has ``rows`` x ``cols`` cells and starts with every cell at 0.
    The puzzle is solved when the board equals ``goal_state`` (a frame of
    ones surrounding a block of zeros).

    Attributes:
        rows: Number of board rows.
        cols: Number of board columns.
        state: Current board, an integer array of shape ``(rows, cols)``.
        goal_state: Target board, same shape as ``state``.
    """

    def __init__(self):
        self.rows = 4
        self.cols = 5
        self.state = np.zeros((self.rows, self.cols), dtype=int)  # Initial state
        self.goal_state = np.array([[1, 1, 1, 1, 1],
                                    [1, 0, 0, 0, 1],
                                    [1, 0, 0, 0, 1],
                                    [1, 1, 1, 1, 1]])  # Goal state

    def reset(self):
        """Clear the board and return a snapshot of it.

        Returns:
            A copy of the all-zero board.
        """
        self.state = np.zeros((self.rows, self.cols), dtype=int)
        # Hand out a copy, exactly like step() does: otherwise the caller's
        # "initial observation" would silently change on the next step() and
        # writes to it would corrupt the environment.
        return self.state.copy()

    def step(self, action):
        """Toggle the cell addressed by ``action`` and report the outcome.

        Args:
            action: Flat, row-major cell index in ``[0, rows * cols)``.

        Returns:
            A ``(state, reward, done)`` tuple: a copy of the board after the
            toggle, ``1`` if the board now matches the goal and ``-1``
            otherwise, and whether the goal has been reached.

        Raises:
            IndexError: If ``action`` is outside ``[0, rows * cols)``.
        """
        n_actions = self.rows * self.cols
        # Negative values would otherwise wrap around through divmod() and
        # negative indexing and toggle an unrelated cell without any error.
        if not 0 <= action < n_actions:
            raise IndexError(
                f"action {action!r} is out of range for {n_actions} cells"
            )

        row, col = divmod(action, self.cols)
        # Flip the selected cell (0 -> 1 or 1 -> 0)
        self.state[row, col] = 1 - self.state[row, col]

        # Check whether the goal state has been reached
        done = self.is_solved()

        # Every move that does not complete the puzzle is penalised
        reward = 1 if done else -1

        return self.state.copy(), reward, done

    def render(self):
        """Print the board, one row per line ('*' for 1, 'O' for anything else)."""
        for row in self.state:
            print(' '.join('*' if cell == 1 else 'O' for cell in row))

    def is_solved(self):
        """Return True when the current board equals the goal board."""
        return np.array_equal(self.state, self.goal_state)

class QLearningAgent:
    """Tabular, epsilon-greedy Q-learning agent for a cell-toggling grid.

    The environment must expose ``rows`` and ``cols`` (one action per cell,
    numbered row-major), ``reset()`` returning the initial board,
    ``step(action)`` returning ``(next_state, reward, done)`` and
    ``render()``.

    Attributes:
        env: The environment being learned.
        learning_rate: Step size of each Q-value update.
        discount_factor: Weight of the bootstrapped future value.
        epsilon: Probability of taking a uniformly random action.
        n_actions: Number of actions (``env.rows * env.cols``).
        q_table: Dict mapping a board key (see ``_state_key``) to an array
            of ``n_actions`` Q-values. Only boards from which an action was
            actually taken are stored; a missing board counts as all zeros.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.n_actions = env.rows * env.cols
        # Sparse table: a dense array over every possible board would need
        # 2 ** (rows * cols) rows, so entries are created on first update.
        self.q_table = {}

    @staticmethod
    def _state_key(state):
        """Return a hashable key for ``state``, independent of dtype/layout."""
        return np.ascontiguousarray(state, dtype=np.int64).tobytes()

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Current board.

        Returns:
            A flat cell index in ``[0, n_actions)``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)  # Random action

        q_values = self.q_table.get(self._state_key(state))
        if q_values is None:
            # Unseen board: every action is tied at zero.
            return random.randrange(self.n_actions)

        # Break ties at random instead of always favouring the lowest index.
        best_actions = np.flatnonzero(q_values == q_values.max())
        return int(random.choice(best_actions.tolist()))

    def train(self, episodes=1000, max_steps=100):
        """Run Q-learning, then print the environment's final board.

        Args:
            episodes: Number of episodes to run.
            max_steps: Upper bound on the steps of a single episode, so an
                episode that never reaches the goal still ends. ``None``
                removes the bound.
        """
        for _ in range(episodes):
            state = self.env.reset()
            done = False
            steps = 0

            while not done and (max_steps is None or steps < max_steps):
                # Take the key before stepping: the environment may mutate
                # the very array it handed out as ``state``.
                key = self._state_key(state)
                action = self.select_action(state)
                next_state, reward, done = self.env.step(action)

                # Q-learning target: r + gamma * max_a' Q(s', a'), with no
                # bootstrapping from a terminal state.
                target = reward
                if not done:
                    next_q = self.q_table.get(self._state_key(next_state))
                    if next_q is not None:
                        target = reward + self.discount_factor * next_q.max()

                q_values = self.q_table.get(key)
                if q_values is None:
                    q_values = np.zeros(self.n_actions)
                    self.q_table[key] = q_values
                q_values[action] += self.learning_rate * (target - q_values[action])

                state = next_state
                steps += 1

        # Shows the board as the last episode left it (not the policy itself)
        print("Política óptima aprendida:")
        self.env.render()


if __name__ == "__main__":
    # Script entry point: build the puzzle, attach an agent with its default
    # hyper-parameters, and train it for 1000 episodes.
    puzzle_env = PuzzleEnvironment()
    agent = QLearningAgent(env=puzzle_env)
    agent.train(1000)
