### Build your own environment ###

Mining Explorer

In [None]:
import gymnasium as gym
from gymnasium import spaces

import numpy as np
import random

# Seed Python's built-in `random` module so that draws made through it are
# repeatable from run to run.
random.seed(42)

In [None]:
# Build the MineralExplorer environment

class MineralExplorerEnv(gym.Env):
    """One-dimensional corridor of six cells explored with left/right moves.

    States are the integers 0..5. States 0 and 5 are terminal: entering
    either one ends the episode. Every transition yields a reward of 0;
    the environment itself encodes no payoff for reaching a terminal cell.

    Actions:
        0 -- move one cell to the left (clamped at state 0)
        1 -- move one cell to the right (clamped at state 5)

    Args:
        render_mode: Default mode used by ``render()`` when it is called
            without an explicit ``mode``. ``"human"`` prints the corridor,
            ``"ansi"`` returns it as a string. ``None`` behaves like
            ``"human"``.
    """

    metadata = {"render_modes": ["human", "ansi"]}

    # Corridor layout: cells 0..N_STATES-1, with both ends terminal.
    N_STATES = 6
    TERMINAL_STATES = (0, N_STATES - 1)

    def __init__(self, render_mode=None):
        super().__init__()

        # Default mode used by render() when no explicit mode is passed
        self.render_mode = render_mode

        # Action space: 0 = move left, 1 = move right
        self.action_space = spaces.Discrete(2)

        # Observation space: one discrete value per corridor cell (0 to 5)
        self.observation_space = spaces.Discrete(self.N_STATES)

        # Current cell of the explorer; stays None until reset() is called
        self.state = None

    def reset(self, seed=None, options=None):
        """Start a new episode and return the initial observation.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to seed
                ``self.np_random``; passing the same seed reproduces the
                same random start state.
            options: Optional dict. If it contains ``'start_state'``, the
                episode starts in that state; otherwise a non-terminal
                state (1..4) is drawn uniformly at random.

        Returns:
            A ``(state, info)`` tuple where ``info`` is an empty dict.

        Raises:
            ValueError: If ``options['start_state']`` is outside 0..5.
        """
        super().reset(seed=seed)

        if options is not None and "start_state" in options:
            start = int(options["start_state"])
            if not 0 <= start < self.N_STATES:
                raise ValueError(
                    f"start_state must be in [0, {self.N_STATES - 1}], got {start}"
                )
            self.state = start
        else:
            # Draw from the environment's own generator (not the global
            # `random` module) so that `seed` actually controls the result.
            # `integers` excludes the upper bound, giving a state in 1..4.
            self.state = int(self.np_random.integers(1, self.N_STATES - 1))

        return self.state, {}

    def step(self, action):
        """Apply one action and return the resulting transition.

        Args:
            action: 0 to move left, 1 to move right. Any other value
                leaves the state unchanged.

        Returns:
            A ``(state, reward, terminated, truncated, info)`` tuple.
            ``reward`` is always 0, ``terminated`` is True once the
            explorer is in state 0 or 5, ``truncated`` is always False and
            ``info`` is an empty dict.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        if self.state is None:
            raise RuntimeError("reset() must be called before step()")

        if action == 0:  # Move left, never below state 0
            self.state = max(0, self.state - 1)
        elif action == 1:  # Move right, never beyond the last state
            self.state = min(self.N_STATES - 1, self.state + 1)

        reward = 0  # No transition carries a reward in this environment
        terminated = self.state in self.TERMINAL_STATES
        truncated = False  # The environment imposes no time limit

        return self.state, reward, terminated, truncated, {}

    def render(self, mode=None):
        """Show the corridor with the explorer's position marked as 'E'.

        Args:
            mode: ``"human"`` prints the corridor, ``"ansi"`` returns it
                as a string. When omitted, the ``render_mode`` given to
                the constructor is used, falling back to ``"human"``.

        Returns:
            The rendered string in ``"ansi"`` mode, otherwise None.
        """
        if mode is None:
            mode = self.render_mode or "human"

        env_map = ["_"] * self.N_STATES
        if self.state is not None:  # Before reset() there is nothing to mark
            env_map[self.state] = "E"
        text = f"Environment: {' '.join(env_map)}"

        if mode == "human":
            print(text)
            return None
        if mode == "ansi":
            return text
        return None

    def close(self):
        """Release resources; nothing to clean up for this environment."""
        pass


In [None]:
# Testing the environment: create an instance with the default
# render_mode (None) for a quick manual check.
env = MineralExplorerEnv()

In [None]:
# Reset the environment and show the starting position
state, _ = env.reset()
print(state)
env.render()

# Run one episode with random actions until the environment reports the end
done = False
while not done:
    action = env.action_space.sample()  # Random action (0: left, 1: right)
    # step() returns (observation, reward, terminated, truncated, info)
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()
    print(f"Action: {action}, Reward: {reward}, Done: {done}")

In [None]:
# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.6  # Initial exploration rate
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each episode
min_epsilon = 0.01  # Lower bound for the exploration rate
episodes = 500000  # Number of training episodes

# Q-table with one row per state (0..5) and one column per action
# (0 = left, 1 = right).
# The environment returns a reward of 0 on every step, so the payoff of the
# two terminal states is injected here instead: the pre-set entries below act
# as the terminal values (100 for state 0, 40 for state 5) that the update
# rule bootstraps from when an episode ends.
q_table = np.zeros((6, 2))
q_table[0, 0] = 100  # Value attributed to terminal state 0
q_table[5, 1] = 40   # Value attributed to terminal state 5

# Initialize the environment
env = MineralExplorerEnv()


# Q-Learning algorithm
for episode in range(episodes):
    state, _ = env.reset()  # Start from a random non-terminal state
    done = False

    while not done:
        # Choose action (epsilon-greedy strategy)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = np.argmax(q_table[state])  # Exploit: best known action

        # Take the action; step() returns
        # (observation, reward, terminated, truncated, info)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Temporal-difference update. The target deliberately bootstraps from
        # the next state even when it is terminal, because that is where the
        # pre-set terminal values enter the learning signal.
        td_target = reward + gamma * np.max(q_table[next_state])
        td_error = td_target - q_table[state, action]
        q_table[state, action] = q_table[state, action] + alpha * td_error

        # Transition to the next state
        state = next_state

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Set the Q-values for invalid actions in terminal states to 0
q_table[0, 1] = 0  # Can't move right from state 0
q_table[5, 0] = 0  # Can't move left from state 5

# Print the learned Q-Table
print("Learned Q-Table:")
print(q_table)

In [None]:
# Testing the learned policy: follow the greedy action from the Q-table
# until the environment reports the end of the episode.
state, _ = env.reset()
done = False
env.render()

while not done:
    action = np.argmax(q_table[state])  # Always pick the highest-valued action
    # step() returns (observation, reward, terminated, truncated, info)
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()
    print(f"Action: {action}, Reward: {reward}, Done: {done}")
