Add known/approximate pareto front to Minecart and Resource Gathering #45

Merged · 2 commits · Feb 28, 2023
121 changes: 58 additions & 63 deletions mo_gymnasium/envs/minecart/minecart.py
@@ -4,7 +4,7 @@
from copy import deepcopy
from math import ceil
from pathlib import Path
from typing import Optional
from typing import List, Optional

import gymnasium as gym
import numpy as np
@@ -122,6 +122,13 @@ class Minecart(gym.Env, EzPickle):
## Episode Termination
The episode ends when the cart returns to the base.

## Arguments
- render_mode: The render mode to use. Can be "rgb_array" or "human".
- image_observation: If True, the observation is an RGB image of the environment.
- frame_skip: How many times each action is repeated. Default: 4
- incremental_frame_skip: If True, repeated actions are performed in succession; if False, they are applied simultaneously (e.g., 4 accelerations are applied and then the cart moves). Default: True
- config: Path to the .json configuration file. See the default configuration file for more information: https://github.com/Farama-Foundation/MO-Gymnasium/blob/main/mo_gymnasium/envs/minecart/mine_config.json

## Credits
The code was refactored from [Axel Abels' source](https://github.com/axelabels/DynMORL).
"""
@@ -131,15 +138,20 @@ class Minecart(gym.Env, EzPickle):
def __init__(
self,
render_mode: Optional[str] = None,
image_observation=False,
image_observation: bool = False,
frame_skip: int = 4,
incremental_frame_skip: bool = True,
config=str(Path(__file__).parent.absolute()) + "/mine_config.json",
):
EzPickle.__init__(self, render_mode, image_observation, config)
EzPickle.__init__(self, render_mode, image_observation, frame_skip, incremental_frame_skip, config)

self.render_mode = render_mode
self.screen = None
self.last_render_mode_used = None
self.config = config
self.frame_skip = frame_skip
assert self.frame_skip > 0, "Frame skip must be greater than 0."
self.incremental_frame_skip = incremental_frame_skip

with open(self.config) as f:
data = json.load(f)
@@ -185,33 +197,29 @@ def __init__(
def obj_cnt(self):
return self.ore_cnt + 1

def convex_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
def convex_coverage_set(self, gamma: float, symmetric: bool = True) -> List[np.ndarray]:
"""
Computes an approximate convex coverage set
Computes an approximate convex coverage set (CCS).

Keyword Arguments:
frame_skip {int} -- How many times each action is repeated (default: {1})
discount {float} -- Discount factor to apply to rewards (default: {1})
incremental_frame_skip {bool} -- Whether actions are repeated incrementally (default: {1})
symmetric {bool} -- If true, we assume the pattern of accelerations from the base to the mine is the same as from the mine to the base (default: {True})
Args:
gamma (float): Discount factor to apply to rewards.
symmetric (bool): If true, we assume the pattern of accelerations from the base to the mine is the same as from the mine to the base. Default: True

Returns:
The convex coverage set
"""
policies = self.pareto_coverage_set(frame_skip, discount, incremental_frame_skip, symmetric)
policies = self.pareto_front(gamma, symmetric)
origin = np.min(policies, axis=0)
extended_policies = [origin] + policies
return [policies[idx - 1] for idx in ConvexHull(extended_policies).vertices if idx != 0]
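To make the hull step concrete, a self-contained sketch on a toy two-objective front: prepending an origin that every policy dominates keeps the hull bounded, and Pareto-optimal points that are not hull vertices (i.e., optimal for no linear scalarization) are filtered out.

```python
import numpy as np
from scipy.spatial import ConvexHull

front = [np.array([1.0, 0.0]), np.array([0.4, 0.4]), np.array([0.0, 1.0])]
origin = np.min(front, axis=0)  # [0., 0.], dominated by every point on the front
extended = [origin] + front
ccs = [front[i - 1] for i in ConvexHull(extended).vertices if i != 0]
print(ccs)  # [0.4, 0.4] is Pareto-optimal but interior to the hull, so it is dropped
```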

def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
def pareto_front(self, gamma: float, symmetric: bool = True) -> List[np.ndarray]:
"""
Computes an approximate pareto coverage set
Computes an approximate pareto front.

Keyword Arguments:
frame_skip {int} -- How many times each action is repeated (default: {1})
discount {float} -- Discount factor to apply to rewards (default: {1})
incremental_frame_skip {bool} -- Whether actions are repeated incrementally (default: {1})
symmetric {bool} -- If true, we assume the pattern of accelerations from the base to the mine is the same as from the mine to the base (default: {True})
Args:
gamma (float): Discount factor to apply to rewards.
symmetric (bool): If true, we assume the pattern of accelerations from the base to the mine is the same as from the mine to the base. Default: True

Returns:
The pareto coverage set
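For orientation, the intended call pattern for the renamed method (the `minecart-v0` id and the `unwrapped` access are assumptions, not part of the diff):

```python
import mo_gymnasium as mo_gym

env = mo_gym.make("minecart-v0")
front = env.unwrapped.pareto_front(gamma=0.98)       # list of vector returns
ccs = env.unwrapped.convex_coverage_set(gamma=0.98)  # convex subset of the front
```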
@@ -231,16 +239,16 @@

# Number of rotations required to face the mine
angle = compute_angle(mine.pos, HOME_POS, [1, 1])
rotations = int(ceil(abs(angle) / (ROTATION * frame_skip)))
rotations = int(ceil(abs(angle) / (ROTATION * self.frame_skip)))

# Build pattern of accelerations/nops to reach the mine
# initialize with single acceleration
queue = [
{
"speed": ACCELERATION * frame_skip,
"dist": mine_distance - frame_skip * (frame_skip + 1) / 2 * ACCELERATION
if incremental_frame_skip
else mine_distance - ACCELERATION * frame_skip * frame_skip,
"speed": ACCELERATION * self.frame_skip,
"dist": mine_distance - self.frame_skip * (self.frame_skip + 1) / 2 * ACCELERATION
if self.incremental_frame_skip
else mine_distance - ACCELERATION * self.frame_skip * self.frame_skip,
"seq": [ACT_ACCEL],
}
]
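The distance terms in the two branches above come from the frame-skip kinematics; a small self-contained check (the ACCELERATION value is illustrative, the real one lives in mine_config.json):

```python
ACCELERATION, frame_skip = 0.5, 4  # illustrative values, not from mine_config.json

# Incremental frame skip: speed grows by ACCELERATION on each of the frame_skip
# sub-steps, so one accelerate action covers (1 + 2 + ... + k) * ACCELERATION.
incremental = sum(ACCELERATION * i for i in range(1, frame_skip + 1))
assert incremental == frame_skip * (frame_skip + 1) / 2 * ACCELERATION

# Non-incremental: all repeats are applied at once, then the cart moves
# frame_skip sub-steps at the resulting speed.
simultaneous = (ACCELERATION * frame_skip) * frame_skip
assert simultaneous == ACCELERATION * frame_skip * frame_skip
```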
@@ -249,11 +257,11 @@
while len(queue) > 0:
seq = queue.pop()
# accelerate
new_speed = seq["speed"] + ACCELERATION * frame_skip
new_speed = seq["speed"] + ACCELERATION * self.frame_skip
accelerations = new_speed / ACCELERATION
movement = (
accelerations * (accelerations + 1) / 2 * ACCELERATION
- (accelerations - frame_skip) * ((accelerations - frame_skip) + 1) / 2 * ACCELERATION
- (accelerations - self.frame_skip) * ((accelerations - self.frame_skip) + 1) / 2 * ACCELERATION
)
dist = seq["dist"] - movement
speed = new_speed
@@ -262,7 +270,7 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
else:
queue.append({"speed": speed, "dist": dist, "seq": seq["seq"] + [ACT_ACCEL]})
# idle
dist = seq["dist"] - seq["speed"] * frame_skip
dist = seq["dist"] - seq["speed"] * self.frame_skip

if dist <= 0:
trimmed_sequences.append(seq["seq"] + [ACT_NONE])
@@ -276,7 +284,7 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
)

# Build rational mining sequences
mine_means = mine.distribution_means() * frame_skip
mine_means = mine.distribution_means() * self.frame_skip
mn_sum = np.sum(mine_means)
# on average it takes up to this many actions to fill cart
max_mine_actions = 0 if mn_sum == 0 else int(ceil(self.capacity / mn_sum))
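A toy instance of the bound just computed (the numbers are invented; the real means come from each mine's ore distributions):

```python
import numpy as np
from math import ceil

capacity = 1.5                     # assumed cart capacity
mine_means = np.array([0.3, 0.2])  # assumed expected ore per Mine action (already scaled by frame_skip)
mn_sum = float(np.sum(mine_means))
max_mine_actions = 0 if mn_sum == 0 else int(ceil(capacity / mn_sum))
print(max_mine_actions)  # ceil(1.5 / 0.5) == 3 Mine actions to fill the cart on average
```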
@@ -296,7 +304,7 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
itertools.product(
[[ACT_LEFT] * rotations],
trimmed_sequences,
[[ACT_BRAKE] + [ACT_LEFT] * (180 // (ROTATION * frame_skip))],
[[ACT_BRAKE] + [ACT_LEFT] * (180 // (ROTATION * self.frame_skip))],
mine_sequences,
trimmed_sequences,
),
@@ -312,7 +320,7 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
itertools.product(
[[ACT_LEFT] * rotations],
trimmed_sequences,
[[ACT_BRAKE] + [ACT_LEFT] * (180 // (ROTATION * frame_skip))],
[[ACT_BRAKE] + [ACT_LEFT] * (180 // (ROTATION * self.frame_skip))],
mine_sequences,
),
)
@@ -332,7 +340,7 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
itertools.product(
[[ACT_LEFT] * rotations],
trimmed_sequences,
[[ACT_LEFT] * (180 // (ROTATION * frame_skip))],
[[ACT_LEFT] * (180 // (ROTATION * self.frame_skip))],
trimmed_sequences,
),
)
@@ -347,12 +355,12 @@ def pareto_coverage_set(self, frame_skip=1, discount=0.98, incremental_frame_skip=True, symmetric=True):
itertools.product(
[[ACT_LEFT] * rotations],
trimmed_sequences,
[[ACT_LEFT] * (180 // (ROTATION * frame_skip))],
[[ACT_LEFT] * (180 // (ROTATION * self.frame_skip))],
),
)
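The products above assemble full trips; schematically, with invented short patterns, each trip concatenates one choice per leg (the action ids match the env's, the patterns do not):

```python
import itertools

ACT_MINE, ACT_LEFT, ACT_ACCEL, ACT_BRAKE, ACT_NONE = 0, 1, 3, 4, 5  # ids from the env
face_mine = [ACT_LEFT, ACT_LEFT]                                 # rotate toward the mine
turn_home = [ACT_BRAKE, ACT_LEFT, ACT_LEFT]                      # brake, then rotate 180 degrees
go = [[ACT_ACCEL, ACT_NONE], [ACT_ACCEL, ACT_ACCEL, ACT_NONE]]   # invented travel patterns
dig = [[ACT_MINE], [ACT_MINE, ACT_MINE]]                         # invented mining patterns

all_sequences = [
    sum(parts, [])  # flatten one choice per leg into a single action sequence
    for parts in itertools.product([face_mine], go, [turn_home], dig, go)
]
print(len(all_sequences))  # 1 * 2 * 1 * 2 * 2 = 8 candidate trips
```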

# Compute rewards for each sequence
fuel_costs = np.array([f * frame_skip for f in FUEL_LIST])
fuel_costs = np.array([f * self.frame_skip for f in FUEL_LIST])

def maxlen(l):
if len(l) == 0:
@@ -361,9 +369,14 @@ def maxlen(l):

longest_pattern = maxlen(trimmed_sequences)
max_len = (
rotations + longest_pattern + 1 + (180 // (ROTATION * frame_skip)) + maxlen(mine_sequences) + longest_pattern
rotations
+ longest_pattern
+ 1
+ (180 // (ROTATION * self.frame_skip))
+ maxlen(mine_sequences)
+ longest_pattern
)
discount_map = discount ** np.arange(max_len)
discount_map = gamma ** np.arange(max_len)
for s in all_sequences:
reward = np.zeros((len(s), self.obj_cnt()))
reward[:, -1] = fuel_costs[s]
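The diff truncates the scoring loop here; as a sketch of the arithmetic it sets up (the fuel costs and ore payoff below are invented), every step t of a sequence contributes reward[t] * gamma**t, summed per objective:

```python
import numpy as np

gamma, n_obj = 0.98, 3
# Assumed per-action fuel costs, indexed by action id (Mine, Left, Right, Accel, Brake, None):
fuel_costs = np.array([-0.05, -0.005, -0.005, -0.025, -0.005, -0.005])

s = [3, 5, 5, 0]                           # accelerate, idle, idle, mine
reward = np.zeros((len(s), n_obj))
reward[:, -1] = fuel_costs[s]              # fuel objective: one cost per step
reward[-1, 0] = 1.0                        # say the final step banks one unit of ore

discount_map = gamma ** np.arange(len(s))  # gamma**t for each step t
discounted_return = np.sum(reward * discount_map[:, None], axis=0)
print(discounted_return)                   # one discounted value per objective
```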
@@ -417,55 +430,37 @@ def initialize_mines(self):
mine_sprite.rect.centery = (mine.pos[1] * (1 - 2 * MARGIN)) * HEIGHT + MARGIN * HEIGHT
self.mine_rects.append(mine_sprite.rect)

def step(self, action, frame_skip=4, incremental_frame_skip=True):
"""Perform the given action `frame_skip` times
["Mine", "Left", "Right", "Accelerate", "Brake", "None"]
Arguments:
action {int} -- Action to perform, ACT_MINE (0), ACT_LEFT (1), ACT_RIGHT (2), ACT_ACCEL (3), ACT_BRAKE (4) or ACT_NONE (5)

Keyword Arguments:
frame_skip {int} -- Repeat the action this many times (default: {1})
incremental_frame_skip {int} -- If True, frame_skip actions are performed in succession, otherwise the repeated actions are performed simultaneously (e.g., 4 accelerations are performed and then the cart moves).

Returns:
tuple -- (state, reward, terminal) tuple
"""
def step(self, action):
change = False # Keep track of whether the state has changed

if action < 0 or action >= ACTION_COUNT:
action = ACT_NONE

reward = np.zeros(self.ore_cnt + 1)
if frame_skip < 1:
frame_skip = 1

reward[-1] = FUEL_IDLE * frame_skip
reward[-1] = FUEL_IDLE * self.frame_skip

if action == ACT_ACCEL:
reward[-1] += FUEL_ACC * frame_skip
reward[-1] += FUEL_ACC * self.frame_skip
elif action == ACT_MINE:
reward[-1] += FUEL_MINE * frame_skip
reward[-1] += FUEL_MINE * self.frame_skip

for _ in range(frame_skip if incremental_frame_skip else 1):
for _ in range(self.frame_skip if self.incremental_frame_skip else 1):

if action == ACT_LEFT:
self.cart.rotate(-ROTATION * (1 if incremental_frame_skip else frame_skip))
self.cart.rotate(-ROTATION * (1 if self.incremental_frame_skip else self.frame_skip))
change = True
elif action == ACT_RIGHT:
self.cart.rotate(ROTATION * (1 if incremental_frame_skip else frame_skip))
self.cart.rotate(ROTATION * (1 if self.incremental_frame_skip else self.frame_skip))
change = True
elif action == ACT_ACCEL:
self.cart.accelerate(ACCELERATION * (1 if incremental_frame_skip else frame_skip))
self.cart.accelerate(ACCELERATION * (1 if self.incremental_frame_skip else self.frame_skip))
elif action == ACT_BRAKE:
self.cart.accelerate(-DECELERATION * (1 if incremental_frame_skip else frame_skip))
self.cart.accelerate(-DECELERATION * (1 if self.incremental_frame_skip else self.frame_skip))
elif action == ACT_MINE:
for _ in range(1 if incremental_frame_skip else frame_skip):
for _ in range(1 if self.incremental_frame_skip else self.frame_skip):
change = self.mine() or change

if self.end:
break

for _ in range(1 if incremental_frame_skip else frame_skip):
for _ in range(1 if self.incremental_frame_skip else self.frame_skip):
change = self.cart.step() or change

distanceFromBase = mag(self.cart.pos - HOME_POS)
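Net effect of this hunk: frame skipping is no longer passed to step() but fixed at construction time. A hedged before/after sketch (env id assumed):

```python
import mo_gymnasium as mo_gym

# Before this PR: env.step(action, frame_skip=4, incremental_frame_skip=True)
# After: the same behavior is configured once, on the constructor.
env = mo_gym.make("minecart-v0", frame_skip=4, incremental_frame_skip=True)
obs, info = env.reset(seed=0)
obs, vec_reward, terminated, truncated, info = env.step(1)  # ACT_LEFT, applied 4x incrementally
```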
69 changes: 68 additions & 1 deletion mo_gymnasium/envs/resource_gathering/resource_gathering.py
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Optional
from typing import List, Optional

import gymnasium as gym
import numpy as np
@@ -78,6 +78,73 @@ def __init__(self, render_mode: Optional[str] = None):
# reward space:
self.reward_space = Box(low=-1, high=1, shape=(3,), dtype=np.float32)

def pareto_front(self, gamma: float) -> List[np.ndarray]:
"""This function returns the pareto front of the resource gathering environment.

Args:
gamma (float): The discount factor.

Returns:
The pareto front of the resource gathering environment.
"""

def get_non_dominated(candidates: List[np.ndarray]) -> List[np.ndarray]:
"""This function returns the non-dominated subset of elements.

Source: https://stackoverflow.com/questions/32791911/fast-calculation-of-pareto-front-in-python
The code provided in the Stack Overflow answers is incorrect; important changes have been made in this function.

Args:
candidates: The input set of candidate vectors.

Returns:
The non-dominated subset of this input set.
"""
candidates = np.array(candidates) # Turn the input set into a numpy array.
candidates = candidates[candidates.sum(1).argsort()[::-1]] # Sort candidates by decreasing sum of coordinates.
for i in range(candidates.shape[0]): # Process each point in turn.
n = candidates.shape[0] # Check current size of the candidates.
if i >= n: # Stop once i runs past the end of the shrinking candidate set.
break
non_dominated = np.ones(candidates.shape[0], dtype=bool) # Initialize a boolean mask for undominated points.
# find all points not dominated by i
# since points are sorted by coordinate sum
# i cannot dominate any points in 1,...,i-1
non_dominated[i + 1 :] = np.any(candidates[i + 1 :] > candidates[i], axis=1)
candidates = candidates[non_dominated] # Grab only the non-dominated vectors using the generated bitmask.

non_dominated = set()
for candidate in candidates:
non_dominated.add(tuple(candidate)) # Collect the non-dominated vectors in a set to deduplicate them.

return [np.array(point) for point in non_dominated]

# Go directly to the diamond (R2) in 10 steps
ret1 = np.array([0.0, 0.0, 1.0]) * gamma**10

# Go to both resources, through both E's
ret2 = 0.9 * 0.9 * np.array([0.0, 1.0, 1.0]) * gamma**12 # Didn't die
ret2 += 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**7 # Died to E2
ret2 += 0.9 * 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**9 # Died to E1

# Go to gold (R1), through E1 both ways
ret3 = 0.9 * 0.9 * np.array([0.0, 1.0, 0.0]) * gamma**8 # Didn't die
ret3 += 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**3 # Died to E1
ret3 += 0.9 * 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**5 # Died to E1 on the way back

# Go to both resources, dodging E1 but through E2
ret4 = 0.9 * np.array([0.0, 1.0, 1.0]) * gamma**14 # Didn't die
ret4 += 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**7 # Died to E2

# Go to gold (R1), dodging all E's in 12 steps
ret5 = np.array([0.0, 1.0, 0.0]) * gamma**12 # Didn't die

# Go to gold (R1), going through E1 only once
ret6 = 0.9 * np.array([0.0, 1.0, 0.0]) * gamma**10 # Didn't die
ret6 += 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**7 # Died to E1

return get_non_dominated([ret1, ret2, ret3, ret4, ret5, ret6])
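As a sanity check on the decomposition above, each candidate return weights every outcome branch by its probability and discounts by the step on which the payoff arrives, and the branch probabilities of each policy sum to one. A quick numeric check for ret3 with an arbitrary gamma:

```python
import numpy as np

gamma = 0.96
ret3 = 0.9 * 0.9 * np.array([0.0, 1.0, 0.0]) * gamma**8    # survived E1 both ways
ret3 += 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**3        # died to E1 on the way out
ret3 += 0.9 * 0.1 * np.array([-1.0, 0.0, 0.0]) * gamma**5  # died to E1 on the way back
assert abs(0.9 * 0.9 + 0.1 + 0.9 * 0.1 - 1.0) < 1e-12      # branches cover all outcomes
print(ret3)
```

And a standalone copy of the nested filter run on a toy set (it is not exposed as a method, so this is purely illustrative): the dominated vector is dropped while mutually incomparable ones survive.

```python
import numpy as np
from typing import List

def get_non_dominated(candidates: List[np.ndarray]) -> List[np.ndarray]:
    candidates = np.array(candidates)
    candidates = candidates[candidates.sum(1).argsort()[::-1]]  # decreasing coordinate sum
    for i in range(candidates.shape[0]):
        if i >= candidates.shape[0]:  # the array shrinks as points are eliminated
            break
        keep = np.ones(candidates.shape[0], dtype=bool)
        keep[i + 1:] = np.any(candidates[i + 1:] > candidates[i], axis=1)
        candidates = candidates[keep]
    return [np.array(p) for p in {tuple(c) for c in candidates}]

pts = [np.array([1.0, 0.0]), np.array([0.5, 0.5]), np.array([0.6, 0.6]), np.array([0.0, 1.0])]
print(get_non_dominated(pts))  # (0.5, 0.5) is dominated by (0.6, 0.6) and removed
```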

def get_map_value(self, pos):
return self.map[pos[0]][pos[1]]
