In [1]:
import numpy as np
import random
import time
from gyms.frozen_lake import FrozenLakeEnvJP


class QLearning:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The agent keeps one Q-value per (state, action) pair in
    ``state_action_matrix`` and refines it from observed transitions.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.99):
        """Create an agent for ``env``.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            learning_rate: Step size (alpha) applied to each TD error.
            discount_factor: Discount (gamma) applied to the successor value.
            exploration_rate: Initial probability (epsilon) of acting randomly.
            exploration_decay: Factor epsilon is multiplied by after each
                transition flagged as ``done``.
        """
        self.env = env
        # Q-table: one row per state, one column per action, all zeros at start
        self.state_action_matrix = np.zeros((env.observation_space.n, env.action_space.n))
        self.learning_rate = learning_rate  # Alpha
        self.discount_factor = discount_factor  # Gamma
        self.exploration_rate = exploration_rate  # Epsilon (for exploration)
        self.exploration_decay = exploration_decay  # Epsilon decay

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``exploration_rate`` a random action is sampled from
        the environment's action space; otherwise the greedy action is used.
        """
        if random.uniform(0, 1) < self.exploration_rate:
            return self.env.action_space.sample()  # Explore
        return self.predict(state)  # Exploit

    def predict(self, state):
        """Return the index of the highest-valued action for ``state``.

        Ties resolve to the lowest action index (``np.argmax`` behaviour).
        """
        return np.argmax(self.state_action_matrix[state, :])

    def step(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for an observed transition.

        Args:
            state: State the action was taken in.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: Whether the transition ended the episode.
        """
        if done:
            # No future return exists after a terminal transition, so the
            # target is the immediate reward alone.
            td_target = reward
        else:
            # Bootstrap from the best action available in the *successor*
            # state; using the greedy action of the current state here would
            # index the successor row with an unrelated action.
            next_action = self.predict(next_state)
            td_target = reward + self.discount_factor * self.state_action_matrix[next_state, next_action]

        td_delta = td_target - self.state_action_matrix[state, action]
        self.state_action_matrix[state, action] += self.learning_rate * td_delta

        # Decay exploration rate once per finished episode
        if done:
            self.exploration_rate *= self.exploration_decay



  "Gym minimally supports python 3.6 as the python foundation not longer supports the version, please update your version to 3.7+"


In [2]:
# Instantiate the project's FrozenLake environment with its default settings
env = FrozenLakeEnvJP()

# One Episode

In [2]:



# Fresh agent with default hyper-parameters
strategy = QLearning(env)

# reset() returns a sequence whose first element is the starting state
state = env.reset()[0]

# Show the board before any move is made
env.render(step_number=0)

# Follow the epsilon-greedy policy for at most 100 moves
for step in range(100):
    action = strategy.act(state)
    next_state, reward, done, truncated, info = env.step(action)

    # Draw the board as it looks after this move
    env.render(step_number=step + 1)

    # Learn from the transition, then advance to the new state
    strategy.step(state, action, reward, next_state, done)
    state = next_state

    # Keep playing until the environment signals the end of the episode
    if not (done or truncated):
        continue

    print("Game Over!")
    time.sleep(1)
    break

# Release the environment
env.close()


Game Over!


# Multiple Episodes

In [23]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors, patches
from PIL import Image
from IPython.display import display, clear_output
import time
import os
from matplotlib.patches import FancyArrowPatch

In [68]:
class FrozenLakeEnvJP2(FrozenLakeEnvJP):
    """FrozenLake environment whose ``render`` also visualises a Q-table."""

    def render(self, state_action_matrix, **kwargs):
        """Draw the map next to a heat-map of ``max_a Q(s, a)`` with greedy-action arrows.

        Args:
            state_action_matrix: 2-D array with one row per state and one
                column per action. Rows are laid out on the grid in row-major
                order, so the row count must equal the number of map cells.
            **kwargs: Optional ``episode_number`` and ``step_number`` shown in
                the figure title (``None`` is displayed when absent).

        The figure is displayed in the notebook output area, the output is
        marked for replacement by the next display, and the figure is closed.
        """
        # Grid layout as a 2-D array of single-character tile codes
        grid_array = np.array(self.desc, dtype='str')
        # Use both dimensions so that non-square maps are laid out correctly
        n_rows, n_cols = grid_array.shape

        # self.s is the agent's flat state index; split it row-major
        player_row, player_col = divmod(self.s, n_cols)

        # Left panel: environment map, right panel: state values
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

        def cell_bounds(row, col):
            # Inset-axes rectangle for a grid cell; row 0 is drawn at the top
            return [col / n_cols, (n_rows - 1 - row) / n_rows, 1 / n_cols, 1 / n_rows]

        # Draw every tile in its own inset axes
        for row in range(n_rows):
            for col in range(n_cols):
                ax = ax1.inset_axes(cell_bounds(row, col))

                tile = grid_array[row, col]
                if tile in ('S', 'F'):  # Start and frozen tiles: blank white plate
                    ax.imshow(np.ones((40, 40, 3)))
                elif tile == 'H':  # Hole
                    ax.imshow(self.hole_img)
                elif tile == 'G':  # Goal
                    ax.imshow(self.goal_img)

                # Hide axis ticks for each grid cell
                ax.set_xticks([])
                ax.set_yticks([])

        # Overlay the player image on the player's cell
        player_ax = ax1.inset_axes(cell_bounds(player_row, player_col))
        player_ax.imshow(self.player_img)
        player_ax.set_xticks([])
        player_ax.set_yticks([])

        # State value of each cell is the best Q-value available in that state
        state_values = np.max(state_action_matrix, axis=1).reshape(n_rows, n_cols)

        cax = ax2.imshow(state_values, cmap='viridis', interpolation='none')
        fig.colorbar(cax, ax=ax2)  # Colour bar indicating the value scale

        ax2.set_title("State Value Matrix (max(Q(s,a)))")
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Unit direction (dx, dy) per action in image coordinates (y grows downwards)
        arrow_directions = {
            0: (-1, 0),  # Left
            1: (0, 1),   # Down
            2: (1, 0),   # Right
            3: (0, -1),  # Up
        }
        half_length = 0.2

        # One arrow per state, centred on the cell and pointing along the greedy action
        for state in range(state_action_matrix.shape[0]):
            row, col = divmod(state, n_cols)
            best_action = np.argmax(state_action_matrix[state, :])

            direction = arrow_directions.get(int(best_action))
            if direction is None:
                continue  # Action index without a known direction: draw nothing
            dx, dy = direction
            tail = (col - dx * half_length, row - dy * half_length)
            head = (col + dx * half_length, row + dy * half_length)
            ax2.add_patch(FancyArrowPatch(tail, head, mutation_scale=15, color='white', lw=2))

        # Set the title for the overall plot
        fig.suptitle(f"Episode: {kwargs.get('episode_number', None)}, Step: {kwargs.get('step_number', None)}", fontsize=16)

        # Show this figure, let the next display replace it, and free it explicitly
        display(fig)
        clear_output(wait=True)
        plt.close(fig)

In [69]:
# Switch to the subclass whose render() accepts the Q-table
env = FrozenLakeEnvJP2()

In [70]:

# New agent: large learning rate (0.7) and mostly greedy behaviour (epsilon = 0.2)
strategy = QLearning(env, learning_rate=0.7, exploration_rate=0.2)

In [None]:
# Training budget
num_episodes = 30
max_steps_per_episode = 100

# Shaped return collected in each episode
episode_rewards = []

for episode in range(num_episodes):
    # Every episode starts from a freshly reset environment;
    # the first element of reset()'s result is the state
    state = env.reset()[0]

    total_reward = 0  # Shaped reward accumulated in this episode

    for step in range(max_steps_per_episode):
        action = strategy.act(state)
        next_state, reward, done, truncated, info = env.step(action)

        # Reward shaping: non-terminal moves earn the index of the state
        # reached, a terminal move with reward 1 earns 200, and any other
        # terminal move earns nothing
        if not done:
            reward = next_state
        elif reward == 1:
            reward = 200
        else:
            reward = 0

        # Learn from the shaped transition
        strategy.step(state, action, reward, next_state, done)

        # Redraw the board together with the current Q-table
        env.render(strategy.state_action_matrix, step_number=step+1, episode_number=episode+1)

        # Advance and keep score
        state = next_state
        total_reward += reward

        # Leave the episode on termination or truncation; termination wins
        # when both flags are set
        if done or truncated:
            print(
                f"Episode {episode + 1} done in {step + 1} steps with total reward: {total_reward}"
                if done else
                f"Episode {episode + 1} finished in {step + 1} steps with total reward: {total_reward}"
            )
            time.sleep(1)
            break

    # Record this episode's shaped return
    episode_rewards.append(total_reward)

# Release the environment once training is over
env.close()



Episode 18 done in 20 steps with total reward: 77


In [None]:
# Optional: Analyze performance over episodes
import matplotlib.pyplot as plt

# Learning curve: accumulated reward of each episode, in training order
fig, ax = plt.subplots()
ax.plot(episode_rewards)
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Total Reward per Episode in FrozenLake')
plt.show()