In [1]:
import gymnasium as gym
from gymnasium import spaces

import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import statistics

In [21]:
class DynamicTargetEnv(gym.Env):
    """Toy environment in which the agent must steer a small integer state onto a target.

    The state holds one integer per entry of ``feature_ranges``. Each action
    increments or decrements a single feature by one, clipped to that feature's
    inclusive range. Matching the current target yields a reward of ``100.0``
    and ends the episode, after which a new target is drawn for later
    episodes. Every other step yields ``-0.1``.

    The target is not part of the observation; it is only exposed through
    ``target_state`` and ``render()``.
    """

    def __init__(self):
        super().__init__()

        # State constraints: inclusive (min, max) for x1, x2, x3
        self.feature_ranges = [(0, 5), (0, 1), (0, 5)]

        # Two actions per feature: even action = increment, odd action = decrement
        self.action_space = spaces.Discrete(2 * len(self.feature_ranges))

        # One discrete dimension per feature, sized to the number of values in its range
        self.observation_space = spaces.MultiDiscrete([r[1] - r[0] + 1 for r in self.feature_ranges])

        self.state = None
        self.steps = 0
        self.max_steps = 1000  # Prevent infinite loops

        # Initialize the target state
        self.target_state = self._generate_target_state()

    def reset(self, seed=None, options=None):
        """Start a new episode at the minimum value of every feature.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that the
                environment's own random generator (used for drawing targets)
                becomes reproducible.
            options: Unused; accepted for Gymnasium API compatibility.

        Returns:
            ``(observation, info)`` where ``observation`` is a copy of the
            state and ``info`` is an empty dict. The current target is kept;
            it only changes after it has been reached.
        """
        # Seeds self.np_random when a seed is supplied
        super().reset(seed=seed)

        self.state = np.array([r[0] for r in self.feature_ranges], dtype=np.int32)
        self.steps = 0
        # Return a copy: step() mutates self.state in place
        return self.state.copy(), {}

    def step(self, action):
        """Apply one increment/decrement action.

        Args:
            action: Integer in ``[0, action_space.n)``. ``action // 2`` selects
                the feature; even values increment it, odd values decrement it.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``observation`` is a copy of the state. ``truncated`` is True when
            the step budget ran out without reaching the target.

        Raises:
            RuntimeError: If called before ``reset()``.
            ValueError: If ``action`` is outside the action space.
        """
        if self.state is None:
            raise RuntimeError("reset() must be called before step()")

        # Reject bad actions explicitly; a negative value would otherwise
        # silently index feature_ranges from the end.
        action = int(action)
        if not 0 <= action < self.action_space.n:
            raise ValueError(
                f"Invalid action {action}; expected an integer in [0, {self.action_space.n})"
            )

        self.steps += 1

        # Map action to feature and operation (increment or decrement)
        feature_index, is_decrement = divmod(action, 2)
        increment = -1 if is_decrement else 1

        # Update the state within bounds
        feature_min, feature_max = self.feature_ranges[feature_index]
        self.state[feature_index] = np.clip(
            self.state[feature_index] + increment, feature_min, feature_max
        )

        truncated = False
        # Check if the current state matches the target state
        if np.array_equal(self.state, self.target_state):
            reward = 100.0  # Large reward for success
            done = True
            print(f"Success! Target state achieved: {self.target_state}")
            self.target_state = self._generate_target_state()  # Generate a new target state
            print(f"New target state: {self.target_state}")
        else:
            reward = -0.1  # Small penalty for each step
            done = False
            # End the episode if max steps are reached
            if self.steps >= self.max_steps:
                truncated = True
                # NOTE(review): strict Gymnasium semantics would leave the
                # terminated flag False on a time limit. It stays True here so
                # loops that test only this flag still stop.
                done = True

        # Return a copy so observations already held by the caller (e.g. in a
        # replay buffer) are not changed by later in-place updates.
        return self.state.copy(), reward, done, truncated, {}

    def render(self):
        """Print the current state and the current target."""
        print(f"State: {self.state}, Target: {self.target_state}")

    def close(self):
        """No resources to release."""
        pass

    def _generate_target_state(self):
        """
        Randomly generates a new target state within the feature ranges.

        Draws from the environment's own generator (``self.np_random``) so that
        ``reset(seed=...)`` controls the targets drawn afterwards.
        """
        return np.array(
            [self.np_random.integers(low, high + 1) for low, high in self.feature_ranges],
            dtype=np.int32,  # same dtype as the state
        )


In [23]:
# Smoke test: drive the environment with a random policy until the episode
# ends or the demo's step budget runs out.
MAX_DEMO_STEPS = 200  # upper bound on random actions tried in this demo

env = DynamicTargetEnv()

# Named `info` rather than `_` so IPython's last-output variable is not shadowed
state, info = env.reset()
print(f"Initial State: {state}")

for step_index in range(MAX_DEMO_STEPS):  # Simulate up to MAX_DEMO_STEPS steps
    action = env.action_space.sample()  # Random action
    state, reward, done, truncated, info = env.step(action)
    print(f"Action: {action}, State: {state}, Reward: {reward}")
    # Either flag means the episode is over and reset() is required
    if done or truncated:
        print("Terminal state reached!")
        break

env.close()


Initial State: [0 0 0]
Action: 5, State: [0 0 0], Reward: -0.1
Action: 1, State: [0 0 0], Reward: -0.1
Action: 1, State: [0 0 0], Reward: -0.1
Action: 4, State: [0 0 1], Reward: -0.1
Action: 1, State: [0 0 1], Reward: -0.1
Action: 1, State: [0 0 1], Reward: -0.1
Action: 3, State: [0 0 1], Reward: -0.1
Action: 1, State: [0 0 1], Reward: -0.1
Action: 0, State: [1 0 1], Reward: -0.1
Action: 3, State: [1 0 1], Reward: -0.1
Action: 1, State: [0 0 1], Reward: -0.1
Action: 4, State: [0 0 2], Reward: -0.1
Action: 1, State: [0 0 2], Reward: -0.1
Action: 0, State: [1 0 2], Reward: -0.1
Success! Target state achieved: [1 0 3]
New target state: [5 0 2]
Action: 4, State: [1 0 3], Reward: 100.0
Terminal state reached!
