Commit 0384b4e

Less is more 😍
1 parent fa9b3ca commit 0384b4e

3 files changed: +6, -28 lines

pyiceberg/io/pyarrow.py

Lines changed: 4 additions & 26 deletions
@@ -99,7 +99,6 @@
     HDFS_KERB_TICKET,
     HDFS_PORT,
     HDFS_USER,
-    PYARROW_USE_LARGE_TYPES_ON_READ,
     S3_ACCESS_KEY_ID,
     S3_CONNECT_TIMEOUT,
     S3_ENDPOINT,
@@ -1348,7 +1347,6 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: Optional[bool] = True,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = _parse_location(task.file.file_path)
@@ -1376,21 +1374,13 @@ def _task_to_record_batches(
 
         file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
 
-        fragment_schema = physical_schema
-        if use_large_types is not None:
-            fragment_schema = (
-                _pyarrow_schema_ensure_large_types(physical_schema)
-                if use_large_types
-                else (_pyarrow_schema_ensure_small_types(physical_schema))
-            )
-
         fragment_scanner = ds.Scanner.from_fragment(
             fragment=fragment,
             # With PyArrow 16.0.0 there is an issue with casting record-batches:
             # https://github.com/apache/arrow/issues/41884
             # https://github.com/apache/arrow/issues/43183
             # Would be good to remove this later on
-            schema=fragment_schema,
+            schema=physical_schema,
             # This will push down the query to Arrow.
             # But in case there are positional deletes, we have to apply them first
             filter=pyarrow_filter if not positional_deletes else None,
@@ -1425,7 +1415,6 @@ def _task_to_record_batches(
                 file_project_schema,
                 current_batch,
                 downcast_ns_timestamp_to_us=True,
-                use_large_types=use_large_types,
             )
 
             # Inject projected column values if available
@@ -1539,12 +1528,8 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
         deletes_per_file = _read_all_delete_files(self._io, tasks)
         executor = ExecutorFactory.get_or_create()
 
-        use_large_types = None
-        if PYARROW_USE_LARGE_TYPES_ON_READ in self._io.properties:
-            use_large_types = property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
-
         def _table_from_scan_task(task: FileScanTask) -> pa.Table:
-            batches = list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file, use_large_types))
+            batches = list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file))
             if len(batches) > 0:
                 return pa.Table.from_batches(batches)
             else:
@@ -1606,13 +1591,12 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
         deletes_per_file = _read_all_delete_files(self._io, tasks)
         # Always use large types, since we cannot infer it in a streaming fashion,
         # without fetching all the schemas first, which defeats the purpose of streaming
-        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file, use_large_types=True)
+        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)
 
     def _record_batches_from_scan_tasks_and_deletes(
         self,
         tasks: Iterable[FileScanTask],
         deletes_per_file: Dict[str, List[ChunkedArray]],
-        use_large_types: Optional[bool] = True,
     ) -> Iterator[pa.RecordBatch]:
         total_row_count = 0
         for task in tasks:
@@ -1627,7 +1611,6 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                use_large_types,
                 self._table_metadata.spec(),
             )
             for batch in batches:
@@ -1646,13 +1629,12 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
-    use_large_types: Optional[bool] = True,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1669,12 +1651,10 @@ def __init__(
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: Optional[bool] = True,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
-        self._use_large_types = use_large_types
 
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)
@@ -1684,8 +1664,6 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if self._use_large_types is False:
-                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)
             elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 if field.field_type == TimestampType():
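
For context, and not part of this commit's code: the removed _pyarrow_schema_ensure_large_types / _pyarrow_schema_ensure_small_types helpers rewrote an Arrow schema so that variable-width columns use the 64-bit ("large") or 32-bit ("small") offset variants. A rough illustrative sketch of that idea in plain PyArrow (ensure_large here is a hypothetical stand-in, not the pyiceberg implementation):

import pyarrow as pa

def ensure_large(dtype: pa.DataType) -> pa.DataType:
    # Hypothetical helper for illustration: upcast variable-width types to their large variants.
    if pa.types.is_string(dtype):
        return pa.large_string()
    if pa.types.is_binary(dtype):
        return pa.large_binary()
    if pa.types.is_list(dtype):
        return pa.large_list(ensure_large(dtype.value_type))
    return dtype

schema = pa.schema([pa.field("name", pa.string()), pa.field("tags", pa.list_(pa.string()))])
print(pa.schema([f.with_type(ensure_large(f.type)) for f in schema]))
# name: large_string
# tags: large_list<item: large_string>

After this change the scanner reads each fragment with its physical schema as-is, so this per-file up- or down-casting is no longer needed.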

pyiceberg/table/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1750,7 +1750,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         return pa.RecordBatchReader.from_batches(
             target_schema,
             batches,
-        )
+        ).cast(target_schema)
 
     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
         """Read a Pandas DataFrame eagerly from this Iceberg table.

tests/integration/test_reads.py

Lines changed: 1 addition & 1 deletion
@@ -881,7 +881,7 @@ def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
     expected_schema = pa.schema(
         [
             pa.field("string", pa.string()),
-            pa.field("string-to-binary", pa.binary()),
+            pa.field("string-to-binary", pa.large_binary()),
             pa.field("binary", pa.binary()),
             pa.field("list", pa.list_(pa.string())),
         ]
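
The updated expectation pins down the user-visible effect: the "string-to-binary" column, which is promoted from string to binary on read, now comes back as pa.large_binary() because the projection visitor no longer downcasts promoted types when small types are requested. A hedged usage sketch for checking the resulting Arrow types (catalog and table names are placeholders, not from this commit):

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")               # placeholder catalog name
tbl = catalog.load_table("default.some_table")  # placeholder table identifier

# Inspect the Arrow schema a scan produces after this change; promoted columns
# use the default (large) Arrow variants such as large_binary.
print(tbl.scan().to_arrow().schema)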
