Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions predicators/approaches/bilevel_planning_approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,11 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]:
task, nsrts, preds, timeout, seed)
self._last_nsrt_plan = nsrt_plan
self._last_atoms_seq = atoms_seq
policy = utils.nsrt_plan_to_greedy_policy(nsrt_plan, task.goal,
self._rng)
policy = utils.nsrt_plan_to_greedy_policy(
nsrt_plan,
task.goal,
self._rng,
abstract_function=lambda s: utils.abstract(s, preds))
logging.debug("Current Task Plan:")
for act in nsrt_plan:
logging.debug(act)
Expand Down Expand Up @@ -110,7 +113,7 @@ def _run_sesame_plan(
self._task_planning_heuristic,
self._max_skeletons_optimized,
max_horizon=CFG.horizon,
allow_noops=CFG.sesame_allow_noops,
allow_waits=CFG.sesame_allow_waits,
use_visited_state_set=CFG.sesame_use_visited_state_set,
**kwargs)
except PlanningFailure as e:
Expand Down
44 changes: 31 additions & 13 deletions predicators/approaches/grammar_search_invention_approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
from predicators.predicate_search_score_functions import \
_PredicateSearchScoreFunction, create_score_function
from predicators.settings import CFG
from predicators.structs import Dataset, GroundAtom, GroundAtomTrajectory, \
Object, ParameterizedOption, Predicate, Segment, State, Task, Type, \
VLMPredicate
from predicators.structs import Dataset, DerivedPredicate, GroundAtom, \
GroundAtomTrajectory, Object, ParameterizedOption, Predicate, Segment, \
State, Task, Type, VLMPredicate

################################################################################
# Programmatic classifiers #
Expand All @@ -38,34 +38,44 @@ def _create_grammar(dataset: Dataset,
given_predicates: Set[Predicate]) -> _PredicateGrammar:
# We start with considering various ways to split either single or
# two feature values across our dataset.
grammar: _PredicateGrammar = _SingleFeatureInequalitiesPredicateGrammar(
dataset)
grammar: Optional[_PredicateGrammar] = None
if CFG.grammar_search_grammar_use_single_feature:
grammar = _SingleFeatureInequalitiesPredicateGrammar(dataset)
if CFG.grammar_search_grammar_use_diff_features:
diff_grammar = _FeatureDiffInequalitiesPredicateGrammar(dataset)
grammar = _ChainPredicateGrammar([grammar, diff_grammar],
alternate=True)
grammar = _ChainPredicateGrammar(
([grammar] if grammar is not None else []) + [diff_grammar],
alternate=True)
if CFG.grammar_search_grammar_use_euclidean_dist:
for (t1_f1, t1_f2, t2_f1,
t2_f2) in CFG.grammar_search_euclidean_feature_names:
euclidean_dist_grammar = _EuclideanDistancePredicateGrammar(
dataset, t1_f1, t2_f1, t1_f2, t2_f2)
grammar = _ChainPredicateGrammar([grammar, euclidean_dist_grammar],
alternate=True)
grammar = _ChainPredicateGrammar(
([grammar] if grammar is not None else []) +
[euclidean_dist_grammar],
alternate=True)
# We next optionally add in the given predicates because we want to allow
# negated and quantified versions of the given predicates, in
# addition to negated and quantified versions of new predicates.
# The chained grammar has the effect of enumerating first the
# given predicates, then the single feature inequality ones.
if CFG.grammar_search_grammar_includes_givens:
given_grammar = _GivenPredicateGrammar(given_predicates)
grammar = _ChainPredicateGrammar([given_grammar, grammar])
if grammar is not None:
grammar = _ChainPredicateGrammar([given_grammar, grammar])
else:
grammar = given_grammar
# Now, the grammar will undergo a series of transformations.
# For each predicate enumerated by the grammar, we also
# enumerate the negation of that predicate.
grammar = _NegationPredicateGrammarWrapper(grammar)
if CFG.grammar_search_grammar_includes_negation:
assert grammar is not None
grammar = _NegationPredicateGrammarWrapper(grammar)
# For each predicate enumerated, we also optionally enumerate foralls
# for that predicate, along with appropriate negations.
if CFG.grammar_search_grammar_includes_foralls:
assert grammar is not None
grammar = _ForallPredicateGrammarWrapper(grammar)
# Prune proposed predicates by checking if they are equivalent to
# any already-generated predicates with respect to the dataset.
Expand All @@ -77,17 +87,22 @@ def _create_grammar(dataset: Dataset,
# predicates.
if not CFG.grammar_search_use_handcoded_debug_grammar and \
CFG.grammar_search_prune_redundant_preds:
assert grammar is not None
grammar = _PrunedGrammar(dataset, grammar)
# We don't actually need to enumerate the given predicates
# because we already have them in the initial predicate set,
# so we just filter them out from actually being enumerated.
# But remember that we do want to enumerate their negations
# and foralls, which is why they're included originally.
grammar = _SkipGrammar(grammar, given_predicates)
if CFG.grammar_search_grammar_use_skip_grammar:
assert grammar is not None
grammar = _SkipGrammar(grammar, given_predicates)
# If we're using the DebugGrammar, filter out all other predicates.
if CFG.grammar_search_use_handcoded_debug_grammar:
assert grammar is not None
grammar = _DebugGrammar(grammar)
# We're done! Return the final grammar.
assert grammar is not None
return grammar


Expand Down Expand Up @@ -867,6 +882,9 @@ class _NegationPredicateGrammarWrapper(_PredicateGrammar):
def enumerate(self) -> Iterator[Tuple[Predicate, float]]:
for (predicate, cost) in self.base_grammar.enumerate():
yield (predicate, cost)
if isinstance(predicate, DerivedPredicate):
# Don't negate derived predicates.
continue
classifier = _NegationClassifier(predicate)
negated_predicate = Predicate(str(classifier), predicate.types,
classifier)
Expand Down Expand Up @@ -1104,7 +1122,7 @@ def rename(p: str) -> str: # pragma: no cover
score_function = create_score_function(
CFG.grammar_search_score_function,
self._initial_predicates, atom_dataset, candidates,
self._train_tasks)
self._train_tasks, None)
self._learned_predicates = \
self._select_predicates_by_score_hillclimbing(
candidates, score_function, self._initial_predicates,
Expand Down
202 changes: 202 additions & 0 deletions predicators/approaches/maple_q_process_approach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""A parameterized action reinforcement learning approach inspired by MAPLE,
(https://ut-austin-rpl.github.io/maple/) but where only a Q-function is
learned.

Base samplers and applicable actions are used to perform the argmax.
"""

from __future__ import annotations

import logging
from typing import Any, Callable, List, Optional, Set

import dill as pkl
from gym.spaces import Box

from predicators import utils
from predicators.approaches.pp_online_process_learning_approach import \
OnlineProcessLearningAndPlanningApproach
from predicators.explorers import BaseExplorer, create_explorer
from predicators.ml_models import MapleQFunction
from predicators.nsrt_learning.segmentation import segment_trajectory
from predicators.settings import CFG
from predicators.structs import Action, GroundAtom, InteractionRequest, \
LowLevelTrajectory, ParameterizedOption, Predicate, Segment, State, Task, \
Type, _GroundCausalProcess, _Option


class MapleQProcessApproach(OnlineProcessLearningAndPlanningApproach):
    """A parameterized action RL approach inspired by MAPLE.

    Only a Q-function is learned; the (oracle) base samplers and the set of
    applicable ground processes are used to perform the argmax over
    parameterized actions. The Q-function internally maintains its own
    replay buffer.
    """

    def __init__(self, initial_predicates: Set[Predicate],
                 initial_options: Set[ParameterizedOption], types: Set[Type],
                 action_space: Box, train_tasks: List[Task]) -> None:
        super().__init__(initial_predicates, initial_options, types,
                         action_space, train_tasks)

        # The current implementation assumes that NSRTs are not changing.
        assert CFG.strips_learner == "oracle"
        # The base sampler should also be unchanging and from the oracle.
        assert CFG.sampler_learner == "oracle"

        # Log all transition data.
        # Goals of past interaction requests; _update_maple_data() relies on
        # these being index-aligned with self._segmented_trajs.
        self._interaction_goals: List[Set[GroundAtom]] = []
        # Index of the last segmented trajectory already folded into the
        # replay buffer; -1 means none processed yet.
        self._last_seen_segment_traj_idx = -1
        # For Q-learning data updates (segments by option changes).
        self._segmented_trajs: List[List[Segment]] = []
        # NOTE(review): currently only written to in commented-out code in
        # _learn_processes() — confirm whether offline trajectories should
        # be retained across online cycles.
        self._offline_segmented_trajs: List[List[Segment]] = []

        # Store the Q function. Note that this implicitly
        # contains a replay buffer.
        self._q_function = MapleQFunction(
            seed=CFG.seed,
            hid_sizes=CFG.mlp_regressor_hid_sizes,
            max_train_iters=CFG.mlp_regressor_max_itr,
            clip_gradients=CFG.mlp_regressor_clip_gradients,
            clip_value=CFG.mlp_regressor_gradient_clip_value,
            learning_rate=CFG.learning_rate,
            weight_decay=CFG.weight_decay,
            use_torch_gpu=CFG.use_torch_gpu,
            train_print_every=CFG.pytorch_train_print_every,
            n_iter_no_change=CFG.active_sampler_learning_n_iter_no_change,
            num_lookahead_samples=CFG.
            active_sampler_learning_num_lookahead_samples,
            predicates=self._get_current_predicates())

    @classmethod
    def get_name(cls) -> str:
        return "maple_q_with_process"

    # pylint: disable=arguments-differ
    def _solve(self,
               task: Task,
               timeout: int,
               train_or_test: str = "") -> Callable[[State], Action]:
        """Return a policy that greedily follows the learned Q-function.

        ``train_or_test`` is forwarded to the Q-function (presumably to
        toggle exploration vs. greedy action selection — confirm against
        MapleQFunction.get_option). ``timeout`` is currently unused here.
        """

        def _option_policy(state: State) -> _Option:
            # Argmax over sampled ground options via the Q-function.
            option = self._q_function.get_option(
                state,
                task.goal,
                num_samples_per_ground_nsrt=CFG.
                active_sampler_learning_num_samples,
                train_or_test=train_or_test)
            logging.debug(f"taking option: {option}")
            return option

        return utils.option_policy_to_policy(
            _option_policy, max_option_steps=CFG.max_num_steps_option_rollout)

    def _create_explorer(self) -> BaseExplorer:
        """Create a new explorer at the beginning of each interaction cycle."""
        # Geometrically increase the length of exploration.
        b = CFG.active_sampler_learning_explore_length_base
        max_steps = b**(1 + self._online_learning_cycle)
        preds = self._get_current_predicates()
        assert CFG.explorer == "maple_q"
        explorer = create_explorer(
            CFG.explorer,
            preds,
            self._initial_options,
            self._types,
            self._action_space,
            self._train_tasks,
            # Endogenous processes are action-like
            self._get_current_endogenous_processes(),  # type: ignore[arg-type]
            self._option_model,
            max_steps_before_termination=max_steps,
            maple_q_function=self._q_function)
        return explorer

    def load(self, online_learning_cycle: Optional[int]) -> None:
        """Restore approach state saved by _learn_processes().

        The keys read here must stay in sync with the dict pickled at the
        end of _learn_processes().
        """
        super().load(online_learning_cycle)
        save_path = utils.get_approach_load_path_str()
        with open(f"{save_path}_{online_learning_cycle}.DATA", "rb") as f:
            save_dict = pkl.load(f)
        self._q_function = save_dict["q_function"]
        self._last_seen_segment_traj_idx = save_dict[
            "last_seen_segment_traj_idx"]
        self._interaction_goals = save_dict["interaction_goals"]
        # Resume the online loop just after the cycle we skipped to.
        self._online_learning_cycle = CFG.skip_until_cycle + 1

    def _learn_processes(self,
                         trajectories: List[LowLevelTrajectory],
                         online_learning_cycle: Optional[int],
                         annotations: Optional[List[Any]] = None) -> None:
        """Update the Q-function from new trajectories and save state.

        NOTE(review): despite the name, no process learning happens here —
        the superclass call is commented out below (consistent with the
        oracle-learner asserts in __init__). Confirm this is intentional.
        """
        # # Learn endogenous/exogenous processes via superclass.
        # super()._learn_processes(trajectories, online_learning_cycle,
        #                          annotations)
        # Ground current endogenous processes for Q-learning.
        all_ground_processes: Set[_GroundCausalProcess] = set()
        all_objects = {o for t in self._train_tasks for o in t.init}
        for process in self._get_current_endogenous_processes():
            all_ground_processes.update(
                utils.all_ground_nsrts(process,
                                       all_objects))  # type: ignore[arg-type]
        goals = [t.goal for t in self._train_tasks]
        self._q_function.set_grounding(
            all_objects, goals, all_ground_processes)  # type: ignore[arg-type]
        # Refresh segmentation by option changes.
        # Temporarily override the global segmenter setting; restored in
        # the finally block even if segmentation raises.
        prev_segmenter = CFG.segmenter
        try:
            CFG.segmenter = "option_changes"
            new_segments = [
                segment_trajectory(traj, self._get_current_predicates())
                for traj in trajectories
            ]
        finally:
            CFG.segmenter = prev_segmenter
        # if online_learning_cycle is None:
        #     # Offline phase: only offline trajectories are included.
        #     self._offline_segmented_trajs = new_segments
        #     self._segmented_trajs = list(self._offline_segmented_trajs)
        # else:
        #     # Online phase: input trajectories are only the online ones so far.
        #     self._segmented_trajs = list(self._offline_segmented_trajs) + \
        #         list(new_segments)
        # NOTE(review): offline-phase (cycle is None) trajectories are
        # currently ignored entirely — confirm that is the intent.
        if online_learning_cycle is not None:
            self._segmented_trajs = list(new_segments)
        # Update the data using the updated self._segmented_trajs.
        self._update_maple_data()
        # Re-learn Q function.
        self._q_function.train_q_function()
        # Save the things we need to restore this approach in load().
        # NOTE(review): the original comment claimed NSRTs were "already
        # saved in the above call to self._learn_processes()", but that
        # call is commented out — confirm NSRT persistence is handled
        # elsewhere (e.g. by the oracle learner).
        save_path = utils.get_approach_save_path_str()
        with open(f"{save_path}_{online_learning_cycle}.DATA", "wb") as f:
            pkl.dump(
                {
                    "q_function": self._q_function,
                    "last_seen_segment_traj_idx":
                    self._last_seen_segment_traj_idx,
                    "interaction_goals": self._interaction_goals,
                }, f)

    def _update_maple_data(self) -> None:
        """Push transitions from not-yet-seen trajectories into the replay
        buffer, pairing each trajectory with its interaction goal."""
        start_idx = self._last_seen_segment_traj_idx + 1
        new_trajs = self._segmented_trajs[start_idx:]

        # goal_offset exists to allow a constant shift between trajectory
        # and goal indices; currently the two lists must align exactly.
        goal_offset = 0
        assert len(self._segmented_trajs) == goal_offset + \
            len(self._interaction_goals)
        new_traj_goals = self._interaction_goals[goal_offset + start_idx:]

        for traj_i, segmented_traj in enumerate(new_trajs):
            self._last_seen_segment_traj_idx += 1
            for seg_i, segment in enumerate(segmented_traj):
                # One (s, goal, option, s', r, done) tuple per segment.
                s = segment.states[0]
                goal = new_traj_goals[traj_i]
                o = segment.get_option()
                ns = segment.states[-1]
                # Sparse reward: 1 iff the goal holds at the segment's end.
                reward = 1.0 if goal.issubset(segment.final_atoms) else 0.0
                # Terminal on goal achievement or at trajectory end.
                terminal = reward > 0 or seg_i == len(segmented_traj) - 1
                self._q_function.add_datum_to_replay_buffer(
                    (s, goal, o, ns, reward, terminal))

    def get_interaction_requests(self) -> List[InteractionRequest]:
        # Save the goals for each interaction request so we can later associate
        # states, actions, and goals.
        requests = super().get_interaction_requests()
        for request in requests:
            goal = self._train_tasks[request.train_task_idx].goal
            self._interaction_goals.append(goal)
        return requests
Loading
Loading