Skip to content

Commit 73dc023

Browse files
committed
Merge branch 'main' of github.com:apache/iceberg-python into fd-integrate-residuals
2 parents ab5d277 + 7a6a7c8 commit 73dc023

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1955
-486
lines changed

.github/ISSUE_TEMPLATE/iceberg_bug_report.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ body:
2828
description: What Apache Iceberg version are you using?
2929
multiple: false
3030
options:
31-
- "0.8.1 (latest release)"
31+
- "0.9.0 (latest release)"
32+
- "0.8.1"
3233
- "0.8.0"
3334
- "0.7.1"
3435
- "0.7.0"

.github/pull_request_template.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<!--
2+
Thanks for opening a pull request!
3+
-->
4+
5+
<!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
6+
<!-- Closes #${GITHUB_ISSUE_ID} -->
7+
8+
# Rationale for this change
9+
10+
# Are these changes tested?
11+
12+
# Are there any user-facing changes?
13+
14+
<!-- In the case of user-facing changes, please add the changelog label. -->

.github/workflows/pypi-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ jobs:
6262
if: startsWith(matrix.os, 'ubuntu')
6363

6464
- name: Build wheels
65-
uses: pypa/cibuildwheel@v2.22.0
65+
uses: pypa/cibuildwheel@v2.23.2
6666
with:
6767
output-dir: wheelhouse
6868
config-file: "pyproject.toml"

.github/workflows/svn-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ jobs:
5757
if: startsWith(matrix.os, 'ubuntu')
5858

5959
- name: Build wheels
60-
uses: pypa/cibuildwheel@v2.22.0
60+
uses: pypa/cibuildwheel@v2.23.2
6161
with:
6262
output-dir: wheelhouse
6363
config-file: "pyproject.toml"

dev/.rat-excludes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
.github/*
12
.rat-excludes
23
build
34
.git

dev/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ WORKDIR ${SPARK_HOME}
4040
ENV SPARK_VERSION=3.5.4
4141
ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
4242
ENV ICEBERG_VERSION=1.8.0
43-
ENV PYICEBERG_VERSION=0.8.1
43+
ENV PYICEBERG_VERSION=0.9.0
4444

4545
RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
4646
&& tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \

dev/provision.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@
328328
CREATE TABLE {catalog_name}.default.test_table_empty_list_and_map (
329329
col_list array<int>,
330330
col_map map<int, int>,
331+
col_struct struct<test:int>,
331332
col_list_with_struct array<struct<test:int>>
332333
)
333334
USING iceberg
@@ -340,8 +341,8 @@
340341
spark.sql(
341342
f"""
342343
INSERT INTO {catalog_name}.default.test_table_empty_list_and_map
343-
VALUES (null, null, null),
344-
(array(), map(), array(struct(1)))
344+
VALUES (null, null, null, null),
345+
(array(), map(), struct(1), array(struct(1)))
345346
"""
346347
)
347348

mkdocs/docs/how-to-release.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,8 @@ Then, select the previous release version as the **Previous tag** to use the dif
379379

380380
**Set as the latest release** and **Publish**.
381381

382+
Make sure to check the `changelog` label on GitHub to see if anything needs to be highlighted.
383+
382384
### Release the docs
383385

384386
Run the [`Release Docs` Github Action](https://github.com/apache/iceberg-python/actions/workflows/python-release-docs.yml).

poetry.lock

Lines changed: 615 additions & 312 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
__version__ = "0.9.0"
18+
__version__ = "0.10.0"

pyiceberg/avro/encoder.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17+
from typing import Any
1718
from uuid import UUID
1819

1920
from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
@@ -74,3 +75,6 @@ def write_uuid(self, uuid: UUID) -> None:
7475
if len(uuid.bytes) != 16:
7576
raise ValueError(f"Expected UUID to have 16 bytes, got: len({uuid.bytes!r})")
7677
return self.write(uuid.bytes)
78+
79+
def write_unknown(self, _: Any) -> None:
80+
"""Nulls are written as 0 bytes in avro, so we do nothing."""

pyiceberg/avro/reader.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,14 @@ class TimestampReader(IntegerReader):
175175
"""
176176

177177

178+
class TimestampNanoReader(IntegerReader):
179+
"""Reads a nanosecond granularity timestamp from the stream.
180+
181+
Long is decoded as python integer which represents
182+
the number of nanoseconds from the unix epoch, 1 January 1970.
183+
"""
184+
185+
178186
class TimestamptzReader(IntegerReader):
179187
"""Reads a microsecond granularity timestamptz from the stream.
180188
@@ -185,6 +193,16 @@ class TimestamptzReader(IntegerReader):
185193
"""
186194

187195

196+
class TimestamptzNanoReader(IntegerReader):
197+
"""Reads a microsecond granularity timestamptz from the stream.
198+
199+
Long is decoded as python integer which represents
200+
the number of nanoseconds from the unix epoch, 1 January 1970.
201+
202+
Adjusted to UTC.
203+
"""
204+
205+
188206
class StringReader(Reader):
189207
def read(self, decoder: BinaryDecoder) -> str:
190208
return decoder.read_utf8()
@@ -201,6 +219,14 @@ def skip(self, decoder: BinaryDecoder) -> None:
201219
decoder.skip(16)
202220

203221

222+
class UnknownReader(Reader):
223+
def read(self, decoder: BinaryDecoder) -> None:
224+
return None
225+
226+
def skip(self, decoder: BinaryDecoder) -> None:
227+
pass
228+
229+
204230
@dataclass(frozen=True)
205231
class FixedReader(Reader):
206232
_len: int = dataclassfield()

pyiceberg/avro/resolver.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,11 @@
4444
StringReader,
4545
StructReader,
4646
TimeReader,
47+
TimestampNanoReader,
4748
TimestampReader,
49+
TimestamptzNanoReader,
4850
TimestamptzReader,
51+
UnknownReader,
4952
UUIDReader,
5053
)
5154
from pyiceberg.avro.writer import (
@@ -63,9 +66,12 @@
6366
OptionWriter,
6467
StringWriter,
6568
StructWriter,
69+
TimestampNanoWriter,
70+
TimestamptzNanoWriter,
6671
TimestamptzWriter,
6772
TimestampWriter,
6873
TimeWriter,
74+
UnknownWriter,
6975
UUIDWriter,
7076
Writer,
7177
)
@@ -97,9 +103,12 @@
97103
PrimitiveType,
98104
StringType,
99105
StructType,
106+
TimestampNanoType,
100107
TimestampType,
108+
TimestamptzNanoType,
101109
TimestamptzType,
102110
TimeType,
111+
UnknownType,
103112
UUIDType,
104113
)
105114

@@ -181,9 +190,15 @@ def visit_time(self, time_type: TimeType) -> Writer:
181190
def visit_timestamp(self, timestamp_type: TimestampType) -> Writer:
182191
return TimestampWriter()
183192

193+
def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType) -> Writer:
194+
return TimestampNanoWriter()
195+
184196
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Writer:
185197
return TimestamptzWriter()
186198

199+
def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType) -> Writer:
200+
return TimestamptzNanoWriter()
201+
187202
def visit_string(self, string_type: StringType) -> Writer:
188203
return StringWriter()
189204

@@ -193,6 +208,9 @@ def visit_uuid(self, uuid_type: UUIDType) -> Writer:
193208
def visit_binary(self, binary_type: BinaryType) -> Writer:
194209
return BinaryWriter()
195210

211+
def visit_unknown(self, unknown_type: UnknownType) -> Writer:
212+
return UnknownWriter()
213+
196214

197215
CONSTRUCT_WRITER_VISITOR = ConstructWriter()
198216

@@ -326,9 +344,15 @@ def visit_time(self, time_type: TimeType, partner: Optional[IcebergType]) -> Wri
326344
def visit_timestamp(self, timestamp_type: TimestampType, partner: Optional[IcebergType]) -> Writer:
327345
return TimestampWriter()
328346

347+
def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType, partner: Optional[IcebergType]) -> Writer:
348+
return TimestampNanoWriter()
349+
329350
def visit_timestamptz(self, timestamptz_type: TimestamptzType, partner: Optional[IcebergType]) -> Writer:
330351
return TimestamptzWriter()
331352

353+
def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType, partner: Optional[IcebergType]) -> Writer:
354+
return TimestamptzNanoWriter()
355+
332356
def visit_string(self, string_type: StringType, partner: Optional[IcebergType]) -> Writer:
333357
return StringWriter()
334358

@@ -341,6 +365,9 @@ def visit_fixed(self, fixed_type: FixedType, partner: Optional[IcebergType]) ->
341365
def visit_binary(self, binary_type: BinaryType, partner: Optional[IcebergType]) -> Writer:
342366
return BinaryWriter()
343367

368+
def visit_unknown(self, unknown_type: UnknownType, partner: Optional[IcebergType]) -> Writer:
369+
return UnknownWriter()
370+
344371

345372
class ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]):
346373
__slots__ = ("read_types", "read_enums", "context")
@@ -456,9 +483,15 @@ def visit_time(self, time_type: TimeType, partner: Optional[IcebergType]) -> Rea
456483
def visit_timestamp(self, timestamp_type: TimestampType, partner: Optional[IcebergType]) -> Reader:
457484
return TimestampReader()
458485

486+
def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType, partner: Optional[IcebergType]) -> Reader:
487+
return TimestampNanoReader()
488+
459489
def visit_timestamptz(self, timestamptz_type: TimestamptzType, partner: Optional[IcebergType]) -> Reader:
460490
return TimestamptzReader()
461491

492+
def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType, partner: Optional[IcebergType]) -> Reader:
493+
return TimestamptzNanoReader()
494+
462495
def visit_string(self, string_type: StringType, partner: Optional[IcebergType]) -> Reader:
463496
return StringReader()
464497

@@ -471,6 +504,9 @@ def visit_fixed(self, fixed_type: FixedType, partner: Optional[IcebergType]) ->
471504
def visit_binary(self, binary_type: BinaryType, partner: Optional[IcebergType]) -> Reader:
472505
return BinaryReader()
473506

507+
def visit_unknown(self, unknown_type: UnknownType, partner: Optional[IcebergType]) -> Reader:
508+
return UnknownReader()
509+
474510

475511
class SchemaPartnerAccessor(PartnerAccessor[IcebergType]):
476512
def schema_partner(self, partner: Optional[IcebergType]) -> Optional[IcebergType]:

pyiceberg/avro/writer.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,24 @@ def write(self, encoder: BinaryEncoder, val: int) -> None:
9595
encoder.write_int(val)
9696

9797

98+
@dataclass(frozen=True)
99+
class TimestampNanoWriter(Writer):
100+
def write(self, encoder: BinaryEncoder, val: int) -> None:
101+
encoder.write_int(val)
102+
103+
98104
@dataclass(frozen=True)
99105
class TimestamptzWriter(Writer):
100106
def write(self, encoder: BinaryEncoder, val: int) -> None:
101107
encoder.write_int(val)
102108

103109

110+
@dataclass(frozen=True)
111+
class TimestamptzNanoWriter(Writer):
112+
def write(self, encoder: BinaryEncoder, val: int) -> None:
113+
encoder.write_int(val)
114+
115+
104116
@dataclass(frozen=True)
105117
class StringWriter(Writer):
106118
def write(self, encoder: BinaryEncoder, val: Any) -> None:
@@ -113,6 +125,12 @@ def write(self, encoder: BinaryEncoder, val: UUID) -> None:
113125
encoder.write(val.bytes)
114126

115127

128+
@dataclass(frozen=True)
129+
class UnknownWriter(Writer):
130+
def write(self, encoder: BinaryEncoder, val: Any) -> None:
131+
encoder.write_unknown(val)
132+
133+
116134
@dataclass(frozen=True)
117135
class FixedWriter(Writer):
118136
_len: int = dataclassfield()

pyiceberg/catalog/hive.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@
110110
TimestampType,
111111
TimestamptzType,
112112
TimeType,
113+
UnknownType,
113114
UUIDType,
114115
)
115116
from pyiceberg.utils.properties import property_as_bool, property_as_float
@@ -236,6 +237,7 @@ def _annotate_namespace(database: HiveDatabase, properties: Properties) -> HiveD
236237
UUIDType: "string",
237238
BinaryType: "binary",
238239
FixedType: "binary",
240+
UnknownType: "void",
239241
}
240242

241243

pyiceberg/catalog/rest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def _retry_hook(retry_state: RetryCallState) -> None:
148148

149149

150150
_RETRY_ARGS = {
151-
"retry": retry_if_exception_type(AuthorizationExpiredError),
151+
"retry": retry_if_exception_type((AuthorizationExpiredError, UnauthorizedError)),
152152
"stop": stop_after_attempt(2),
153153
"before_sleep": _retry_hook,
154154
"reraise": True,

pyiceberg/catalog/sql.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -619,15 +619,28 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi
619619
table_stmt = select(IcebergTables.table_namespace).where(IcebergTables.catalog_name == self.name)
620620
namespace_stmt = select(IcebergNamespaceProperties.namespace).where(IcebergNamespaceProperties.catalog_name == self.name)
621621
if namespace:
622-
namespace_str = Catalog.namespace_to_string(namespace, NoSuchNamespaceError)
623-
table_stmt = table_stmt.where(IcebergTables.table_namespace.like(namespace_str))
624-
namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(namespace_str))
622+
namespace_like = Catalog.namespace_to_string(namespace, NoSuchNamespaceError) + "%"
623+
table_stmt = table_stmt.where(IcebergTables.table_namespace.like(namespace_like))
624+
namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(namespace_like))
625625
stmt = union(
626626
table_stmt,
627627
namespace_stmt,
628628
)
629629
with Session(self.engine) as session:
630-
return [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()]
630+
namespace_tuple = Catalog.identifier_to_tuple(namespace)
631+
sub_namespaces_level_length = len(namespace_tuple) + 1
632+
633+
namespaces = list(
634+
{ # only get distinct namespaces
635+
ns[:sub_namespaces_level_length] # truncate to the required level
636+
for ns in {Catalog.identifier_to_tuple(ns) for ns in session.execute(stmt).scalars()}
637+
if len(ns) >= sub_namespaces_level_length # only get sub namespaces/children
638+
and ns[: sub_namespaces_level_length - 1] == namespace_tuple
639+
# exclude fuzzy matches when `namespace` contains `%` or `_`
640+
}
641+
)
642+
643+
return namespaces
631644

632645
def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
633646
"""Get properties for a namespace.

0 commit comments

Comments
 (0)