In [4]:
import gym
import numpy as np

class CliffWalkerAgent:
    """Tabular Q-learning agent for a discrete-state, discrete-action environment.

    The environment must expose ``observation_space.n`` and
    ``action_space.n``, a ``reset()`` returning ``(state, info)`` and a
    ``step(action)`` returning
    ``(next_state, reward, terminated, truncated, info)``.
    """

    def __init__(self, env):
        """Store the environment and allocate a zero-initialised Q-table."""
        self.env = env
        # One row per state, one column per action. For CliffWalking-v0 this
        # is 48 states (4x12 grid) by 4 actions, where the actions are
        # encoded as 0 = up, 1 = right, 2 = down, 3 = left.
        self.Q = np.zeros([env.observation_space.n, env.action_space.n])

    def train(self, num_episodes=1000, learning_rate=0.1, discount_factor=0.9,
              epsilon=0.1, verbose=True):
        """Run epsilon-greedy Q-learning and update ``self.Q`` in place.

        Args:
            num_episodes: Number of episodes to play.
            learning_rate: Step size applied to each temporal-difference error.
            discount_factor: Weight given to the estimated future return.
            epsilon: Probability of taking a random action instead of the
                greedy one.
            verbose: When true (the default), print the start state of every
                episode and every transition. Pass ``False`` to silence the
                per-step trace.
        """
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            if verbose:
                print(f"Episode {episode+1}: Starting state = {state}")  # Debug: print the state
            terminated = truncated = False

            # An episode ends either because the environment reached a
            # terminal state or because it was cut short (truncated).
            while not (terminated or truncated):
                if np.random.rand() < epsilon:
                    action = self.env.action_space.sample()  # exploration
                else:
                    action = int(np.argmax(self.Q[state, :]))  # exploitation

                next_state, reward, terminated, truncated, _ = self.env.step(action)

                if verbose:
                    print(f"Current state = {state}, Action = {action}, Next state = {next_state}, Reward = {reward}")

                # No future return exists beyond a true terminal state, so the
                # bootstrap term is dropped there. A truncated episode still
                # bootstraps, because the cut-off is not part of the task.
                future_value = 0.0 if terminated else np.max(self.Q[next_state, :])
                td_target = reward + discount_factor * future_value
                self.Q[state, action] += learning_rate * (td_target - self.Q[state, action])

                state = next_state

    def test(self, max_steps=1000):
        """Play one greedy episode, print the result and return the total reward.

        Args:
            max_steps: Upper bound on the number of steps. A greedy policy
                read from a Q-table that has not converged can cycle between
                states forever, so the rollout is abandoned after this many
                steps.

        Returns:
            The sum of the rewards collected during the episode.
        """
        state, _ = self.env.reset()
        total_reward = 0

        for _ in range(max_steps):
            action = int(np.argmax(self.Q[state, :]))
            state, reward, terminated, truncated, _ = self.env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
        else:
            # The loop ran out of steps without the episode ending.
            print(f"Stopped after {max_steps} steps without finishing the episode")

        print(f"Total reward after training: {total_reward}")
        print("Learned Q-table:")
        print(self.Q)
        return total_reward


# Driver: build the CliffWalking-v0 environment, wrap it in a Q-learning
# agent, train for 1000 episodes, then run a single greedy evaluation episode.
env = gym.make("CliffWalking-v0")
agent = CliffWalkerAgent(env=env)

agent.train(1000)
agent.test()


Episode 1: Starting state = 36
Current state = 36, Action = 1, Next state = 36, Reward = -100
Current state = 36, Action = 0, Next state = 24, Reward = -1
Current state = 24, Action = 0, Next state = 12, Reward = -1
Current state = 12, Action = 0, Next state = 0, Reward = -1
Current state = 0, Action = 0, Next state = 0, Reward = -1
Current state = 0, Action = 2, Next state = 12, Reward = -1
Current state = 12, Action = 1, Next state = 13, Reward = -1
Current state = 13, Action = 0, Next state = 1, Reward = -1
Current state = 1, Action = 0, Next state = 1, Reward = -1
Current state = 1, Action = 1, Next state = 2, Reward = -1
Current state = 2, Action = 0, Next state = 2, Reward = -1
Current state = 2, Action = 1, Next state = 3, Reward = -1
Current state = 3, Action = 0, Next state = 3, Reward = -1
Current state = 3, Action = 1, Next state = 4, Reward = -1
Current state = 4, Action = 0, Next state = 4, Reward = -1
Current state = 4, Action = 1, Next state = 5, Reward = -1
Current stat

  if not isinstance(terminated, (bool, np.bool8)):


Current state = 1, Action = 2, Next state = 13, Reward = -1
Current state = 13, Action = 1, Next state = 14, Reward = -1
Current state = 14, Action = 0, Next state = 2, Reward = -1
Current state = 2, Action = 2, Next state = 14, Reward = -1
Current state = 14, Action = 1, Next state = 15, Reward = -1
Current state = 15, Action = 1, Next state = 16, Reward = -1
Current state = 16, Action = 0, Next state = 4, Reward = -1
Current state = 4, Action = 3, Next state = 3, Reward = -1
Current state = 3, Action = 0, Next state = 3, Reward = -1
Current state = 3, Action = 1, Next state = 4, Reward = -1
Current state = 4, Action = 0, Next state = 4, Reward = -1
Current state = 4, Action = 2, Next state = 16, Reward = -1
Current state = 16, Action = 1, Next state = 17, Reward = -1
Current state = 17, Action = 1, Next state = 18, Reward = -1
Current state = 18, Action = 1, Next state = 19, Reward = -1
Current state = 19, Action = 3, Next state = 18, Reward = -1
Current state = 18, Action = 2, Next 