Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[PERF] enable metadata preservation across materialization points #2216

Merged
merged 2 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions daft/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ def iter_partitions(self) -> Iterator[Union[MicroPartition, "RayObjectRef"]]:
if self._result is not None:
# If the dataframe has already finished executing,
# use the precomputed results.
yield from self._result.values()
for mat_result in self._result.values():
yield mat_result.partition()

else:
# Execute the dataframe in a streaming fashion.
Expand All @@ -238,8 +239,9 @@ def _populate_preview(self) -> None:
)
if preview_partition_invalid:
preview_parts = self._result._get_preview_vpartition(self._num_preview_rows)
preview_results = LocalPartitionSet({i: part for i, part in enumerate(preview_parts)})

preview_results = LocalPartitionSet()
for i, part in enumerate(preview_parts):
preview_results.set_partition_from_table(i, part)
preview_partition = preview_results._get_merged_vpartition()
self._preview = DataFramePreview(
preview_partition=preview_partition,
Expand Down Expand Up @@ -314,7 +316,10 @@ def _from_tables(cls, *parts: MicroPartition) -> "DataFrame":
if not parts:
raise ValueError("Can't create a DataFrame from an empty list of tables.")

result_pset = LocalPartitionSet({i: part for i, part in enumerate(parts)})
result_pset = LocalPartitionSet()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have a LocalPartitionSet.from_tables()?


for i, part in enumerate(parts):
result_pset.set_partition_from_table(i, part)

context = get_context()
cache_entry = context.runner().put_partition_set_into_cache(result_pset)
Expand Down
11 changes: 3 additions & 8 deletions daft/execution/physical_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
from daft.logical.schema import Schema
from daft.runners.partitioning import (
MaterializedResult,
PartialPartitionMetadata,
PartitionT,
)
from daft.table.micropartition import MicroPartition
Expand Down Expand Up @@ -68,16 +67,12 @@ def _stage_id_counter():


def partition_read(
    materialized_results: Iterator[MaterializedResult[PartitionT]],
) -> InProgressPhysicalPlan[PartitionT]:
    """Instantiate a (no-op) physical plan from existing materialized partitions.

    Each materialized result already carries both its partition and its
    metadata, so the metadata is forwarded to the task builder instead of
    being recomputed (this is the metadata-preservation goal of this change).

    Args:
        materialized_results: iterator of already-materialized results to
            wrap as partition tasks.

    Yields:
        One PartitionTaskBuilder per input result.
    """
    yield from (
        PartitionTaskBuilder[PartitionT](inputs=[mat_result.partition()], partial_metadatas=[mat_result.metadata()])
        for mat_result in materialized_results
    )


Expand Down
3 changes: 2 additions & 1 deletion daft/io/file_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def from_glob_path(path: str, io_config: Optional[IOConfig] = None) -> DataFrame
runner_io = context.runner().runner_io()
file_infos = runner_io.glob_paths_details([path], io_config=io_config)
file_infos_table = MicroPartition._from_pytable(file_infos.to_table())
partition = LocalPartitionSet({0: file_infos_table})
partition = LocalPartitionSet()
partition.set_partition_from_table(0, file_infos_table)
cache_entry = context.runner().put_partition_set_into_cache(partition)
size_bytes = partition.size_bytes()
assert size_bytes is not None, "In-memory data should always have non-None size in bytes"
Expand Down
6 changes: 3 additions & 3 deletions daft/runners/partitioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,18 +232,18 @@ def to_arrow(self, cast_tensors_to_ray_tensor_dtype: bool = False) -> pa.Table:
merged_partition = self._get_merged_vpartition()
return merged_partition.to_arrow(cast_tensors_to_ray_tensor_dtype)

def items(self) -> list[tuple[PartID, MaterializedResult[PartitionT]]]:
    """
    Returns all (partition id, materialized result) pairs in this PartitionSet,
    ordered by partition ID.
    """
    raise NotImplementedError()

def values(self) -> list[MaterializedResult[PartitionT]]:
    """Returns all materialized results in this PartitionSet, ordered by partition ID."""
    return [value for _, value in self.items()]

@abstractmethod
def get_partition(self, idx: PartID) -> MaterializedResult[PartitionT]:
    """Return the materialized result stored under partition id ``idx``."""
    raise NotImplementedError()

@abstractmethod
Expand Down
67 changes: 45 additions & 22 deletions daft/runners/pyrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from daft.runners import runner_io
from daft.runners.partitioning import (
MaterializedResult,
PartialPartitionMetadata,
PartID,
PartitionCacheEntry,
PartitionMetadata,
Expand All @@ -28,23 +29,27 @@
logger = logging.getLogger(__name__)


class LocalPartitionSet(PartitionSet[MicroPartition]):
    """In-memory partition set for the local (Python) runner.

    Stores MaterializedResult objects rather than bare MicroPartitions so that
    partition metadata survives across materialization points.
    """

    # Maps partition id -> materialized result (partition + its metadata).
    _partitions: dict[PartID, MaterializedResult[MicroPartition]]

    def __init__(self) -> None:
        super().__init__()
        self._partitions = {}

def items(self) -> list[tuple[PartID, MaterializedResult[MicroPartition]]]:
    """Return (partition id, materialized result) pairs sorted by partition id."""
    return sorted(self._partitions.items())

def _get_merged_vpartition(self) -> MicroPartition:
    """Concatenate all partitions into a single MicroPartition.

    Requires partition ids to be contiguous and to start at 0 so that
    concatenation order matches partition order.
    """
    ids_and_partitions = self.items()
    assert ids_and_partitions[0][0] == 0
    assert ids_and_partitions[-1][0] + 1 == len(ids_and_partitions)
    return MicroPartition.concat([part.partition() for id, part in ids_and_partitions])

def _get_preview_vpartition(self, num_rows: int) -> list[MicroPartition]:
ids_and_partitions = self.items()
preview_parts = []
for _, part in ids_and_partitions:
for _, mat_result in ids_and_partitions:
part: MicroPartition = mat_result.partition()
part_len = len(part)
if part_len >= num_rows: # if this part has enough rows, take what we need and break
preview_parts.append(part.slice(0, num_rows))
Expand All @@ -54,11 +59,14 @@ def _get_preview_vpartition(self, num_rows: int) -> list[MicroPartition]:
preview_parts.append(part)
return preview_parts

def get_partition(self, idx: PartID) -> MaterializedResult[MicroPartition]:
    """Return the materialized result for partition id ``idx`` (KeyError if absent)."""
    return self._partitions[idx]

def set_partition(self, idx: PartID, part: MaterializedResult[MicroPartition]) -> None:
    """Store a materialized result whole, preserving its metadata alongside the partition."""
    self._partitions[idx] = part

def set_partition_from_table(self, idx: PartID, part: MicroPartition) -> None:
    """Wrap a raw MicroPartition in a PyMaterializedResult, computing its metadata eagerly."""
    self._partitions[idx] = PyMaterializedResult(part, PartitionMetadata.from_table(part))

def delete_partition(self, idx: PartID) -> None:
    """Remove partition id ``idx`` from this set (KeyError if absent)."""
    del self._partitions[idx]
Expand All @@ -67,10 +75,10 @@ def has_partition(self, idx: PartID) -> bool:
return idx in self._partitions

def __len__(self) -> int:
    """Total row count across all partitions.

    NOTE(review): this reaches for the concrete partition for its length;
    metadata may already carry num_rows, but for local MicroPartitions the
    direct len() is cheap and always accurate.
    """
    return sum(len(partition.partition()) for partition in self._partitions.values())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. After this PR we actually have metadata, and don't necessarily need to reach for the partition to get the length...

Would it not be possible/safe to let MaterializedResult.__len__ delegate appropriately between the metadata and the partition to get the length of the partition?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it doesn't really matter given that this is a local MicroPartition though


def size_bytes(self) -> int | None:
size_bytes_ = [partition.size_bytes() for partition in self._partitions.values()]
size_bytes_ = [partition.partition().size_bytes() for partition in self._partitions.values()]
size_bytes: list[int] = [size for size in size_bytes_ if size is not None]
if len(size_bytes) != len(size_bytes_):
return None
Expand Down Expand Up @@ -126,7 +134,7 @@ def runner_io(self) -> PyRunnerIO:
def run(self, builder: LogicalPlanBuilder) -> PartitionCacheEntry:
results = list(self.run_iter(builder))

result_pset = LocalPartitionSet({})
result_pset = LocalPartitionSet()
for i, result in enumerate(results):
result_pset.set_partition(i, result)

Expand All @@ -144,6 +152,7 @@ def run_iter(

# Optimize the logical plan.
builder = builder.optimize()

# Finalize the logical plan and get a physical plan scheduler for translating the
# physical plan to executable tasks.
plan_scheduler = builder.to_physical_plan_scheduler(daft_execution_config)
Expand Down Expand Up @@ -209,8 +218,10 @@ def _physical_plan_to_partitions(
)
):
logger.debug("Running task synchronously in main thread: %s", next_step)
partitions = self.build_partitions(next_step.instructions, *next_step.inputs)
next_step.set_result([PyMaterializedResult(partition) for partition in partitions])
materialized_results = self.build_partitions(
next_step.instructions, next_step.inputs, next_step.partial_metadatas
)
next_step.set_result(materialized_results)

else:
# Submit the task for execution.
Expand All @@ -220,7 +231,10 @@ def _physical_plan_to_partitions(
pbar.mark_task_start(next_step)

future = thread_pool.submit(
self.build_partitions, next_step.instructions, *next_step.inputs
self.build_partitions,
next_step.instructions,
next_step.inputs,
next_step.partial_metadatas,
)
# Register the inflight task and resources used.
future_to_task[future] = next_step.id()
Expand All @@ -239,12 +253,13 @@ def _physical_plan_to_partitions(
done_id = future_to_task.pop(done_future)
del inflight_tasks_resources[done_id]
done_task = inflight_tasks.pop(done_id)
partitions = done_future.result()
materialized_results = done_future.result()

pbar.mark_task_done(done_task)

logger.debug("Task completed: %s -> <%s partitions>", done_id, len(partitions))
done_task.set_result([PyMaterializedResult(partition) for partition in partitions])
logger.debug("Task completed: %s -> <%s partitions>", done_id, len(materialized_results))

done_task.set_result(materialized_results)

if next_step is None:
next_step = next(plan)
Expand Down Expand Up @@ -278,17 +293,23 @@ def _can_admit_task(self, resource_request: ResourceRequest, inflight_resources:
return all((cpus_okay, gpus_okay, memory_okay))

@staticmethod
def build_partitions(
    instruction_stack: list[Instruction],
    partitions: list[MicroPartition],
    final_metadata: list[PartialPartitionMetadata],
) -> list[MaterializedResult[MicroPartition]]:
    """Run an instruction stack over the input partitions and materialize results.

    Each output's metadata is computed from the resulting table and merged
    with the task's expected partial metadata, so downstream consumers see
    the richest metadata available.

    NOTE(review): the final ``partitions`` and ``final_metadata`` are assumed
    to be the same length -- zip() silently truncates on mismatch; a
    list[tuple[MicroPartition, PartialPartitionMetadata]] parameter would
    enforce this, as suggested in review.
    """
    for instruction in instruction_stack:
        partitions = instruction.run(partitions)
    return [
        PyMaterializedResult(part, PartitionMetadata.from_table(part).merge_with_partial(partial))
        for part, partial in zip(partitions, final_metadata)
    ]


@dataclass
class PyMaterializedResult(MaterializedResult[MicroPartition]):
    """A materialized MicroPartition plus (optionally lazy) metadata.

    Not frozen: ``metadata()`` caches its computed PartitionMetadata back
    into ``_metadata`` on first access.
    """

    _partition: MicroPartition
    _metadata: PartitionMetadata | None = None

    def partition(self) -> MicroPartition:
        """Return the underlying MicroPartition."""
        return self._partition
Expand All @@ -297,7 +318,9 @@ def vpartition(self) -> MicroPartition:
return self._partition

def metadata(self) -> PartitionMetadata:
    """Return this partition's metadata, computing and caching it on first call."""
    if self._metadata is None:
        self._metadata = PartitionMetadata.from_table(self._partition)
    return self._metadata

def cancel(self) -> None:
    """No-op: a locally materialized (synchronous) result cannot be cancelled."""
    return None
Expand Down
Loading
Loading