In [1]:
print("hi")

hi


In [2]:
import numpy as np
import gym
import random

# Initialize environment
env = gym.make("FrozenLake-v1", is_slippery=True)


üî¢ Step 2: Q-Table Initialization
üîπ What is a Q-Table?

A Q-Table is a matrix that stores Q-values (expected rewards) for each (state, action) pair.

For FrozenLake-v1:

    It has 16 states (4√ó4 grid = 16 positions)

    It has 4 possible actions: LEFT, DOWN, RIGHT, UP ‚Üí action space = 4

In [3]:
# Number of states and actions
state_space = env.observation_space.n   # 16 for FrozenLake-v1
action_space = env.action_space.n       # 4 actions

# Initialize Q-table with zeros
q_table = np.zeros((state_space, action_space))

# Display Q-table shape
print("Q-table shape:", q_table.shape)
print(q_table)


Q-table shape: (16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


‚öôÔ∏è Step 3: Define Hyperparameters

These control how our Q-learning agent behaves and learns:
Hyperparameter	Meaning
alpha (Œ±)	Learning rate ‚Äì how much new info overrides old
gamma (Œ≥)	Discount factor ‚Äì how much we value future rewards
epsilon (Œµ)	Exploration rate ‚Äì how often to explore random actions
epsilon_decay	How fast epsilon decreases (less exploration over time)
min_epsilon	Minimum value epsilon can decay to
episodes	Total training episodes
max_steps	Max steps per episode (prevent infinite loops)

In [4]:
# Q-learning parameters
alpha = 0.8           # Learning rate
gamma = 0.95          # Discount factor
epsilon = 1.0         # Initial exploration rate
epsilon_decay = 0.995 # Decay rate for epsilon
min_epsilon = 0.01    # Minimum exploration rate

# Training control
episodes = 2000       # Total training episodes
max_steps = 100       # Max steps per episode


üîÑ Step 4 & 5: Epsilon-Greedy Policy + Q-Value Update

We‚Äôll write the training loop where:

    For each episode:

        Start from initial state

        At each step:

            Select an action using epsilon-greedy policy

            Perform action and receive next state & reward

            Update Q-value using the Bellman equation

    After each episode, decay epsilon

In [5]:
rewards = []

for episode in range(episodes):
    state = env.reset()[0]     # Reset environment, get initial state
    total_rewards = 0

    for step in range(max_steps):
        # Step 4: Epsilon-Greedy Action Selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit best known action

        # Take the action ‚Üí get next state, reward, done
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Step 5: Q-value Update using Bellman Equation
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Q-learning formula
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value

        # Update state and reward
        state = next_state
        total_rewards += reward

        if done:
            break

    # Decay exploration rate
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    rewards.append(total_rewards)

    # Progress logging
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode + 1}, Success Rate: {np.mean(rewards[-100:]):.2f}, Epsilon: {epsilon:.3f}")


AttributeError: module 'numpy' has no attribute 'bool8'

In [None]:
# Disable exploration
test_episodes = 10
test_max_steps = 100
successes = 0

print("\nTesting trained agent...\n")

for episode in range(test_episodes):
    state = env.reset()[0]
    done = False
    print(f"Episode {episode + 1}:")
    
    for step in range(test_max_steps):
        env.render()  # Optional: shows the environment in text
        action = np.argmax(q_table[state])  # Always pick best action
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = next_state

        if done:
            if reward == 1:
                print("‚úÖ Reached the goal!")
                successes += 1
            else:
                print("‚ùå Fell into a hole.")
            break

print(f"\nAgent succeeded in {successes}/{test_episodes} episodes.")



Testing trained agent...

Episode 1:
‚úÖ Reached the goal!
Episode 2:
‚ùå Fell into a hole.
Episode 3:
‚úÖ Reached the goal!
Episode 4:
‚úÖ Reached the goal!
Episode 5:
‚úÖ Reached the goal!
Episode 6:
‚ùå Fell into a hole.
Episode 7:
‚ùå Fell into a hole.
Episode 8:
‚úÖ Reached the goal!
Episode 9:
‚úÖ Reached the goal!
Episode 10:
‚úÖ Reached the goal!

Agent succeeded in 7/10 episodes.


  logger.warn(


In [None]:
import time
import gym

# Recreate environment with visual rendering
env = gym.make("FrozenLake-v1", render_mode="human", is_slippery=True)

test_episodes = 10
test_max_steps = 100
successes = 0

print("\nüéÆ Testing Trained Agent with Rendering...\n")

for episode in range(test_episodes):
    state = env.reset()[0]
    done = False
    print(f"Episode {episode + 1}:")

    for step in range(test_max_steps):
        time.sleep(0.5)  # Slow down to view steps
        action = np.argmax(q_table[state])  # Use learned best action
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = next_state

        if done:
            if reward == 1:
                print("‚úÖ Reached the goal!")
                successes += 1
            else:
                print("‚ùå Fell into a hole.")
            time.sleep(1.5)  # Pause before next episode
            break

print(f"\nüèÅ Final Result: Agent succeeded in {successes}/{test_episodes} episodes.")



üéÆ Testing Trained Agent with Rendering...

Episode 1:


AttributeError: module 'numpy' has no attribute 'bool8'

: 