Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions predicators/approaches/bilevel_planning_approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,11 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]:
task, nsrts, preds, timeout, seed)
self._last_nsrt_plan = nsrt_plan
self._last_atoms_seq = atoms_seq
policy = utils.nsrt_plan_to_greedy_policy(nsrt_plan, task.goal,
self._rng)
policy = utils.nsrt_plan_to_greedy_policy(
nsrt_plan,
task.goal,
self._rng,
abstract_function=lambda s: utils.abstract(s, preds))
logging.debug("Current Task Plan:")
for act in nsrt_plan:
logging.debug(act)
Expand Down Expand Up @@ -110,7 +113,7 @@ def _run_sesame_plan(
self._task_planning_heuristic,
self._max_skeletons_optimized,
max_horizon=CFG.horizon,
allow_noops=CFG.sesame_allow_noops,
allow_waits=CFG.sesame_allow_waits,
use_visited_state_set=CFG.sesame_use_visited_state_set,
**kwargs)
except PlanningFailure as e:
Expand Down
44 changes: 31 additions & 13 deletions predicators/approaches/grammar_search_invention_approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
from predicators.predicate_search_score_functions import \
_PredicateSearchScoreFunction, create_score_function
from predicators.settings import CFG
from predicators.structs import Dataset, GroundAtom, GroundAtomTrajectory, \
Object, ParameterizedOption, Predicate, Segment, State, Task, Type, \
VLMPredicate
from predicators.structs import Dataset, DerivedPredicate, GroundAtom, \
GroundAtomTrajectory, Object, ParameterizedOption, Predicate, Segment, \
State, Task, Type, VLMPredicate

################################################################################
# Programmatic classifiers #
Expand All @@ -38,34 +38,44 @@ def _create_grammar(dataset: Dataset,
given_predicates: Set[Predicate]) -> _PredicateGrammar:
# We start with considering various ways to split either single or
# two feature values across our dataset.
grammar: _PredicateGrammar = _SingleFeatureInequalitiesPredicateGrammar(
dataset)
grammar: Optional[_PredicateGrammar] = None
if CFG.grammar_search_grammar_use_single_feature:
grammar = _SingleFeatureInequalitiesPredicateGrammar(dataset)
if CFG.grammar_search_grammar_use_diff_features:
diff_grammar = _FeatureDiffInequalitiesPredicateGrammar(dataset)
grammar = _ChainPredicateGrammar([grammar, diff_grammar],
alternate=True)
grammar = _ChainPredicateGrammar(
([grammar] if grammar is not None else []) + [diff_grammar],
alternate=True)
if CFG.grammar_search_grammar_use_euclidean_dist:
for (t1_f1, t1_f2, t2_f1,
t2_f2) in CFG.grammar_search_euclidean_feature_names:
euclidean_dist_grammar = _EuclideanDistancePredicateGrammar(
dataset, t1_f1, t2_f1, t1_f2, t2_f2)
grammar = _ChainPredicateGrammar([grammar, euclidean_dist_grammar],
alternate=True)
grammar = _ChainPredicateGrammar(
([grammar] if grammar is not None else []) +
[euclidean_dist_grammar],
alternate=True)
# We next optionally add in the given predicates because we want to allow
# negated and quantified versions of the given predicates, in
# addition to negated and quantified versions of new predicates.
# The chained grammar has the effect of enumerating first the
# given predicates, then the single feature inequality ones.
if CFG.grammar_search_grammar_includes_givens:
given_grammar = _GivenPredicateGrammar(given_predicates)
grammar = _ChainPredicateGrammar([given_grammar, grammar])
if grammar is not None:
grammar = _ChainPredicateGrammar([given_grammar, grammar])
else:
grammar = given_grammar
# Now, the grammar will undergo a series of transformations.
# For each predicate enumerated by the grammar, we also
# enumerate the negation of that predicate.
grammar = _NegationPredicateGrammarWrapper(grammar)
if CFG.grammar_search_grammar_includes_negation:
assert grammar is not None
grammar = _NegationPredicateGrammarWrapper(grammar)
# For each predicate enumerated, we also optionally enumerate foralls
# for that predicate, along with appropriate negations.
if CFG.grammar_search_grammar_includes_foralls:
assert grammar is not None
grammar = _ForallPredicateGrammarWrapper(grammar)
# Prune proposed predicates by checking if they are equivalent to
# any already-generated predicates with respect to the dataset.
Expand All @@ -77,17 +87,22 @@ def _create_grammar(dataset: Dataset,
# predicates.
if not CFG.grammar_search_use_handcoded_debug_grammar and \
CFG.grammar_search_prune_redundant_preds:
assert grammar is not None
grammar = _PrunedGrammar(dataset, grammar)
# We don't actually need to enumerate the given predicates
# because we already have them in the initial predicate set,
# so we just filter them out from actually being enumerated.
# But remember that we do want to enumerate their negations
# and foralls, which is why they're included originally.
grammar = _SkipGrammar(grammar, given_predicates)
if CFG.grammar_search_grammar_use_skip_grammar:
assert grammar is not None
grammar = _SkipGrammar(grammar, given_predicates)
# If we're using the DebugGrammar, filter out all other predicates.
if CFG.grammar_search_use_handcoded_debug_grammar:
assert grammar is not None
grammar = _DebugGrammar(grammar)
# We're done! Return the final grammar.
assert grammar is not None
return grammar


Expand Down Expand Up @@ -867,6 +882,9 @@ class _NegationPredicateGrammarWrapper(_PredicateGrammar):
def enumerate(self) -> Iterator[Tuple[Predicate, float]]:
for (predicate, cost) in self.base_grammar.enumerate():
yield (predicate, cost)
if isinstance(predicate, DerivedPredicate):
# Don't negate derived predicates.
continue
classifier = _NegationClassifier(predicate)
negated_predicate = Predicate(str(classifier), predicate.types,
classifier)
Expand Down Expand Up @@ -1104,7 +1122,7 @@ def rename(p: str) -> str: # pragma: no cover
score_function = create_score_function(
CFG.grammar_search_score_function,
self._initial_predicates, atom_dataset, candidates,
self._train_tasks)
self._train_tasks, None)
self._learned_predicates = \
self._select_predicates_by_score_hillclimbing(
candidates, score_function, self._initial_predicates,
Expand Down
202 changes: 202 additions & 0 deletions predicators/approaches/maple_q_process_approach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""A parameterized action reinforcement learning approach inspired by MAPLE,
(https://ut-austin-rpl.github.io/maple/) but where only a Q-function is
learned.

Base samplers and applicable actions are used to perform the argmax.
"""

from __future__ import annotations

import logging
from typing import Any, Callable, List, Optional, Set

import dill as pkl
from gym.spaces import Box

from predicators import utils
from predicators.approaches.pp_online_process_learning_approach import \
OnlineProcessLearningAndPlanningApproach
from predicators.explorers import BaseExplorer, create_explorer
from predicators.ml_models import MapleQFunction
from predicators.nsrt_learning.segmentation import segment_trajectory
from predicators.settings import CFG
from predicators.structs import Action, GroundAtom, InteractionRequest, \
LowLevelTrajectory, ParameterizedOption, Predicate, Segment, State, Task, \
Type, _GroundCausalProcess, _Option


class MapleQProcessApproach(OnlineProcessLearningAndPlanningApproach):
    """A parameterized action RL approach inspired by MAPLE.

    Only a Q-function is learned; the (oracle) base samplers and the set of
    applicable ground processes are used to perform the argmax over
    parameterized actions. The Q-function internally maintains its own
    replay buffer.
    """

    def __init__(self, initial_predicates: Set[Predicate],
                 initial_options: Set[ParameterizedOption], types: Set[Type],
                 action_space: Box, train_tasks: List[Task]) -> None:
        super().__init__(initial_predicates, initial_options, types,
                         action_space, train_tasks)

        # The current implementation assumes that NSRTs are not changing.
        assert CFG.strips_learner == "oracle"
        # The base sampler should also be unchanging and from the oracle.
        assert CFG.sampler_learner == "oracle"

        # Log all transition data.
        # Goals of past interaction requests; _update_maple_data() relies on
        # these being index-aligned with self._segmented_trajs.
        self._interaction_goals: List[Set[GroundAtom]] = []
        # Index of the last segmented trajectory already folded into the
        # replay buffer; -1 means none processed yet.
        self._last_seen_segment_traj_idx = -1
        # For Q-learning data updates (segments by option changes).
        self._segmented_trajs: List[List[Segment]] = []
        # NOTE(review): currently only written to in commented-out code in
        # _learn_processes() — confirm whether offline trajectories should
        # be retained across online cycles.
        self._offline_segmented_trajs: List[List[Segment]] = []

        # Store the Q function. Note that this implicitly
        # contains a replay buffer.
        self._q_function = MapleQFunction(
            seed=CFG.seed,
            hid_sizes=CFG.mlp_regressor_hid_sizes,
            max_train_iters=CFG.mlp_regressor_max_itr,
            clip_gradients=CFG.mlp_regressor_clip_gradients,
            clip_value=CFG.mlp_regressor_gradient_clip_value,
            learning_rate=CFG.learning_rate,
            weight_decay=CFG.weight_decay,
            use_torch_gpu=CFG.use_torch_gpu,
            train_print_every=CFG.pytorch_train_print_every,
            n_iter_no_change=CFG.active_sampler_learning_n_iter_no_change,
            num_lookahead_samples=CFG.
            active_sampler_learning_num_lookahead_samples,
            predicates=self._get_current_predicates())

    @classmethod
    def get_name(cls) -> str:
        return "maple_q_with_process"

    # pylint: disable=arguments-differ
    def _solve(self,
               task: Task,
               timeout: int,
               train_or_test: str = "") -> Callable[[State], Action]:
        """Return a policy that greedily follows the learned Q-function.

        ``train_or_test`` is forwarded to the Q-function (presumably to
        toggle exploration vs. greedy action selection — confirm against
        MapleQFunction.get_option). ``timeout`` is currently unused here.
        """

        def _option_policy(state: State) -> _Option:
            # Argmax over sampled ground options via the Q-function.
            option = self._q_function.get_option(
                state,
                task.goal,
                num_samples_per_ground_nsrt=CFG.
                active_sampler_learning_num_samples,
                train_or_test=train_or_test)
            logging.debug(f"taking option: {option}")
            return option

        return utils.option_policy_to_policy(
            _option_policy, max_option_steps=CFG.max_num_steps_option_rollout)

    def _create_explorer(self) -> BaseExplorer:
        """Create a new explorer at the beginning of each interaction cycle."""
        # Geometrically increase the length of exploration.
        b = CFG.active_sampler_learning_explore_length_base
        max_steps = b**(1 + self._online_learning_cycle)
        preds = self._get_current_predicates()
        assert CFG.explorer == "maple_q"
        explorer = create_explorer(
            CFG.explorer,
            preds,
            self._initial_options,
            self._types,
            self._action_space,
            self._train_tasks,
            # Endogenous processes are action-like
            self._get_current_endogenous_processes(),  # type: ignore[arg-type]
            self._option_model,
            max_steps_before_termination=max_steps,
            maple_q_function=self._q_function)
        return explorer

    def load(self, online_learning_cycle: Optional[int]) -> None:
        """Restore approach state saved by _learn_processes().

        The keys read here must stay in sync with the dict pickled at the
        end of _learn_processes().
        """
        super().load(online_learning_cycle)
        save_path = utils.get_approach_load_path_str()
        with open(f"{save_path}_{online_learning_cycle}.DATA", "rb") as f:
            save_dict = pkl.load(f)
        self._q_function = save_dict["q_function"]
        self._last_seen_segment_traj_idx = save_dict[
            "last_seen_segment_traj_idx"]
        self._interaction_goals = save_dict["interaction_goals"]
        # Resume the online loop just after the cycle we skipped to.
        self._online_learning_cycle = CFG.skip_until_cycle + 1

    def _learn_processes(self,
                         trajectories: List[LowLevelTrajectory],
                         online_learning_cycle: Optional[int],
                         annotations: Optional[List[Any]] = None) -> None:
        """Update the Q-function from new trajectories and save state.

        NOTE(review): despite the name, no process learning happens here —
        the superclass call is commented out below (consistent with the
        oracle-learner asserts in __init__). Confirm this is intentional.
        """
        # # Learn endogenous/exogenous processes via superclass.
        # super()._learn_processes(trajectories, online_learning_cycle,
        #                          annotations)
        # Ground current endogenous processes for Q-learning.
        all_ground_processes: Set[_GroundCausalProcess] = set()
        all_objects = {o for t in self._train_tasks for o in t.init}
        for process in self._get_current_endogenous_processes():
            all_ground_processes.update(
                utils.all_ground_nsrts(process,
                                       all_objects))  # type: ignore[arg-type]
        goals = [t.goal for t in self._train_tasks]
        self._q_function.set_grounding(
            all_objects, goals, all_ground_processes)  # type: ignore[arg-type]
        # Refresh segmentation by option changes.
        # Temporarily override the global segmenter setting; restored in
        # the finally block even if segmentation raises.
        prev_segmenter = CFG.segmenter
        try:
            CFG.segmenter = "option_changes"
            new_segments = [
                segment_trajectory(traj, self._get_current_predicates())
                for traj in trajectories
            ]
        finally:
            CFG.segmenter = prev_segmenter
        # if online_learning_cycle is None:
        #     # Offline phase: only offline trajectories are included.
        #     self._offline_segmented_trajs = new_segments
        #     self._segmented_trajs = list(self._offline_segmented_trajs)
        # else:
        #     # Online phase: input trajectories are only the online ones so far.
        #     self._segmented_trajs = list(self._offline_segmented_trajs) + \
        #         list(new_segments)
        # NOTE(review): offline-phase (cycle is None) trajectories are
        # currently ignored entirely — confirm that is the intent.
        if online_learning_cycle is not None:
            self._segmented_trajs = list(new_segments)
        # Update the data using the updated self._segmented_trajs.
        self._update_maple_data()
        # Re-learn Q function.
        self._q_function.train_q_function()
        # Save the things we need to restore this approach in load().
        # NOTE(review): the original comment claimed NSRTs were "already
        # saved in the above call to self._learn_processes()", but that
        # call is commented out — confirm NSRT persistence is handled
        # elsewhere (e.g. by the oracle learner).
        save_path = utils.get_approach_save_path_str()
        with open(f"{save_path}_{online_learning_cycle}.DATA", "wb") as f:
            pkl.dump(
                {
                    "q_function": self._q_function,
                    "last_seen_segment_traj_idx":
                    self._last_seen_segment_traj_idx,
                    "interaction_goals": self._interaction_goals,
                }, f)

    def _update_maple_data(self) -> None:
        """Push transitions from not-yet-seen trajectories into the replay
        buffer, pairing each trajectory with its interaction goal."""
        start_idx = self._last_seen_segment_traj_idx + 1
        new_trajs = self._segmented_trajs[start_idx:]

        # goal_offset exists to allow a constant shift between trajectory
        # and goal indices; currently the two lists must align exactly.
        goal_offset = 0
        assert len(self._segmented_trajs) == goal_offset + \
            len(self._interaction_goals)
        new_traj_goals = self._interaction_goals[goal_offset + start_idx:]

        for traj_i, segmented_traj in enumerate(new_trajs):
            self._last_seen_segment_traj_idx += 1
            for seg_i, segment in enumerate(segmented_traj):
                # One (s, goal, option, s', r, done) tuple per segment.
                s = segment.states[0]
                goal = new_traj_goals[traj_i]
                o = segment.get_option()
                ns = segment.states[-1]
                # Sparse reward: 1 iff the goal holds at the segment's end.
                reward = 1.0 if goal.issubset(segment.final_atoms) else 0.0
                # Terminal on goal achievement or at trajectory end.
                terminal = reward > 0 or seg_i == len(segmented_traj) - 1
                self._q_function.add_datum_to_replay_buffer(
                    (s, goal, o, ns, reward, terminal))

    def get_interaction_requests(self) -> List[InteractionRequest]:
        # Save the goals for each interaction request so we can later associate
        # states, actions, and goals.
        requests = super().get_interaction_requests()
        for request in requests:
            goal = self._train_tasks[request.train_task_idx].goal
            self._interaction_goals.append(goal)
        return requests
Loading
Loading