     HDFS_KERB_TICKET,
     HDFS_PORT,
     HDFS_USER,
-    PYARROW_USE_LARGE_TYPES_ON_READ,
     S3_ACCESS_KEY_ID,
     S3_CONNECT_TIMEOUT,
     S3_ENDPOINT,
@@ -1348,7 +1347,6 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: Optional[bool] = True,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = _parse_location(task.file.file_path)
@@ -1376,21 +1374,13 @@ def _task_to_record_batches(
 
     file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
 
-    fragment_schema = physical_schema
-    if use_large_types is not None:
-        fragment_schema = (
-            _pyarrow_schema_ensure_large_types(physical_schema)
-            if use_large_types
-            else (_pyarrow_schema_ensure_small_types(physical_schema))
-        )
-
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
         # With PyArrow 16.0.0 there is an issue with casting record-batches:
         # https://github.com/apache/arrow/issues/41884
         # https://github.com/apache/arrow/issues/43183
         # Would be good to remove this later on
-        schema=fragment_schema,
+        schema=physical_schema,
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
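Editorial note: a minimal sketch (not part of the diff) of what the hunk above means in plain PyArrow. The fragment is now scanned against its own physical schema instead of a schema rewritten to all-large or all-small types; a consumer that still wants large types can cast afterwards. The file path and column name below are placeholders.

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

pq.write_table(pa.table({"s": ["a", "b"]}), "/tmp/example.parquet")  # placeholder file

dataset = ds.dataset("/tmp/example.parquet", format="parquet")
fragment = next(iter(dataset.get_fragments()))

# Scan with the fragment's physical schema, as the new code does.
scanner = ds.Scanner.from_fragment(fragment=fragment, schema=fragment.physical_schema)
table = scanner.to_table()

# Cast explicitly if large string/binary types are still required downstream.
table_large = table.cast(pa.schema([pa.field("s", pa.large_string())]))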
@@ -1425,7 +1415,6 @@ def _task_to_record_batches(
             file_project_schema,
             current_batch,
             downcast_ns_timestamp_to_us=True,
-            use_large_types=use_large_types,
         )
 
         # Inject projected column values if available
@@ -1539,12 +1528,8 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
         deletes_per_file = _read_all_delete_files(self._io, tasks)
         executor = ExecutorFactory.get_or_create()
 
-        use_large_types = None
-        if PYARROW_USE_LARGE_TYPES_ON_READ in self._io.properties:
-            use_large_types = property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
-
         def _table_from_scan_task(task: FileScanTask) -> pa.Table:
-            batches = list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file, use_large_types))
+            batches = list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file))
             if len(batches) > 0:
                 return pa.Table.from_batches(batches)
             else:
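Editorial note: a hedged sketch (not part of the diff) of the user-facing effect of the hunk above. The PYARROW_USE_LARGE_TYPES_ON_READ FileIO property is no longer consulted in to_table, so the Arrow types of a scan follow the files' physical schemas; a reader that still needs small string types can cast the result itself. The catalog name and table identifier below are placeholders.

import pyarrow as pa
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")           # placeholder catalog name
tbl = catalog.load_table("examples.taxis")  # placeholder table identifier
arrow_table = tbl.scan().to_arrow()

# Downcast top-level large_string columns if a consumer requires pa.string();
# nested types would need a deeper rewrite and are not handled here.
small_fields = [
    pa.field(f.name, pa.string(), f.nullable) if pa.types.is_large_string(f.type) else f
    for f in arrow_table.schema
]
arrow_table = arrow_table.cast(pa.schema(small_fields))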
@@ -1606,13 +1591,12 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.Record
         deletes_per_file = _read_all_delete_files(self._io, tasks)
         # Always use large types, since we cannot infer it in a streaming fashion,
         # without fetching all the schemas first, which defeats the purpose of streaming
-        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file, use_large_types=True)
+        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)
 
     def _record_batches_from_scan_tasks_and_deletes(
         self,
         tasks: Iterable[FileScanTask],
         deletes_per_file: Dict[str, List[ChunkedArray]],
-        use_large_types: Optional[bool] = True,
     ) -> Iterator[pa.RecordBatch]:
         total_row_count = 0
         for task in tasks:
@@ -1627,7 +1611,6 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                use_large_types,
                 self._table_metadata.spec(),
             )
             for batch in batches:
@@ -1646,13 +1629,12 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
-    use_large_types: Optional[bool] = True,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1669,12 +1651,10 @@ def __init__(
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: Optional[bool] = True,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
-        self._use_large_types = use_large_types
 
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)
@@ -1684,8 +1664,6 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if self._use_large_types is False:
-                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)
             elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 if field.field_type == TimestampType():
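Editorial note: a minimal sketch (not part of the diff) of the cast that _cast_if_needed now performs when a file field is promoted to the table's type: a plain Arrow cast of the promoted schema, with no extra large-to-small adjustment afterwards.

import pyarrow as pa

# A column written as int32 is promoted to the table's long (int64) type by a
# straightforward Arrow cast; the removed branch would additionally have rewritten
# large types down to small ones when the dropped property asked for it.
file_values = pa.array([1, 2, 3], type=pa.int32())
promoted = file_values.cast(pa.int64())
assert promoted.type == pa.int64()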