feat: support RANGE in queries Part 2: Arrow (#1868)
* feat: support range in queries as dict

* fix sys tests

* lint

* add arrow support

* fix python 3.7 test error

* print dependencies in sys test

* add unit test and docs

* fix unit test

* add func docs

* add sys test for tabledata.list in arrow

* add sys test for tabledata.list as iterator

* lint

* fix docs error

* fix docstring

* fix docstring

* fix docstring

* docs

* docs

* docs

* move dtypes mapping code

* address comment

* address comment

* fix pytest error

* Revert "move dtypes mapping code"

This reverts commit c46c65c.

* remove commented out assertions

* typo and formats

* add None-check for range_element_type and add unit tests

* change test skip condition

* fix test error

* change test skip condition

* change test skip condition

* change decorator order

* use a different way to construct test data

* fix error message and add warning number check

* add warning number check and comments
Linchin committed Apr 18, 2024
1 parent bd0814c commit 5251b5d
Showing 15 changed files with 516 additions and 25 deletions.
16 changes: 7 additions & 9 deletions google/cloud/bigquery/_helpers.py
@@ -66,6 +66,8 @@
_UNIVERSE_DOMAIN_ENV = "GOOGLE_CLOUD_UNIVERSE_DOMAIN"
"""Environment variable for setting universe domain."""

_SUPPORTED_RANGE_ELEMENTS = {"TIMESTAMP", "DATETIME", "DATE"}


def _get_client_universe(
client_options: Optional[Union[client_options_lib.ClientOptions, dict]]
@@ -310,17 +312,13 @@ def _json_from_json(value, field):
 
 
 def _range_element_from_json(value, field):
-    """Coerce 'value' to a range element value, if set or not nullable."""
+    """Coerce 'value' to a range element value."""
     if value == "UNBOUNDED":
         return None
-    elif field.element_type == "DATE":
-        return _date_from_json(value, None)
-    elif field.element_type == "DATETIME":
-        return _datetime_from_json(value, None)
-    elif field.element_type == "TIMESTAMP":
-        return _timestamp_from_json(value, None)
+    if field.element_type in _SUPPORTED_RANGE_ELEMENTS:
+        return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type)
     else:
-        raise ValueError(f"Unsupported range field type: {value}")
+        raise ValueError(f"Unsupported range element type: {field.element_type}")
 
 
 def _range_from_json(value, field):
@@ -344,7 +342,7 @@ def _range_from_json(value, field):
             end = _range_element_from_json(end, field.range_element_type)
             return {"start": start, "end": end}
         else:
-            raise ValueError(f"Unknown range format: {value}")
+            raise ValueError(f"Unknown format for range value: {value}")
     else:
         return None
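For readers skimming the diff, the behaviour of the two helpers above can be sketched as standalone code. This is an illustration only: the "[start, end)" literal format and the "UNBOUNDED" sentinel are inferred from the branches and error messages above, and parse_range_literal / parse_date_element are hypothetical names, not library functions.

# Standalone sketch of how a BigQuery RANGE<DATE> literal could be parsed into
# the {"start": ..., "end": ...} dict shape used above. Assumes the REST API
# returns range values as "[<start>, <end>)" with "UNBOUNDED" for open ends.
import datetime
from typing import Optional


def parse_date_element(value: str) -> Optional[datetime.date]:
    # Mirrors _range_element_from_json: UNBOUNDED maps to None.
    if value == "UNBOUNDED":
        return None
    return datetime.date.fromisoformat(value)


def parse_range_literal(value: str) -> dict:
    # Mirrors the dict shape produced by _range_from_json.
    if not (value.startswith("[") and value.endswith(")")):
        raise ValueError(f"Unknown format for range value: {value}")
    start, end = value[1:-1].split(", ")
    return {"start": parse_date_element(start), "end": parse_date_element(end)}


print(parse_range_literal("[2020-01-01, 2020-12-31)"))
# {'start': datetime.date(2020, 1, 1), 'end': datetime.date(2020, 12, 31)}
print(parse_range_literal("[UNBOUNDED, 2020-12-31)"))
# {'start': None, 'end': datetime.date(2020, 12, 31)}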
33 changes: 33 additions & 0 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -142,6 +142,17 @@ def bq_to_arrow_struct_data_type(field):
return pyarrow.struct(arrow_fields)


def bq_to_arrow_range_data_type(field):
    if field is None:
        raise ValueError(
            "Range element type cannot be None, must be one of "
            "DATE, DATETIME, or TIMESTAMP"
        )
    element_type = field.element_type.upper()
    arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)()
    return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])


def bq_to_arrow_data_type(field):
"""Return the Arrow data type, corresponding to a given BigQuery column.
@@ -160,6 +171,9 @@ def bq_to_arrow_data_type(field):
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    if field_type_upper == "RANGE":
        return bq_to_arrow_range_data_type(field.range_element_type)

    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
    if data_type_constructor is None:
        return None
@@ -220,6 +234,9 @@ def default_types_mapper(
    datetime_dtype: Union[Any, None] = None,
    time_dtype: Union[Any, None] = None,
    timestamp_dtype: Union[Any, None] = None,
    range_date_dtype: Union[Any, None] = None,
    range_datetime_dtype: Union[Any, None] = None,
    range_timestamp_dtype: Union[Any, None] = None,
):
    """Create a mapping from pyarrow types to pandas types.
@@ -274,6 +291,22 @@ def types_mapper(arrow_data_type):
        elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
            return time_dtype

        elif pyarrow.types.is_struct(arrow_data_type):
            if range_datetime_dtype is not None and arrow_data_type.equals(
                range_datetime_dtype.pyarrow_dtype
            ):
                return range_datetime_dtype

            elif range_date_dtype is not None and arrow_data_type.equals(
                range_date_dtype.pyarrow_dtype
            ):
                return range_date_dtype

            elif range_timestamp_dtype is not None and arrow_data_type.equals(
                range_timestamp_dtype.pyarrow_dtype
            ):
                return range_timestamp_dtype

    return types_mapper


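To make the Arrow mapping concrete: for a RANGE column, the new bq_to_arrow_range_data_type builds a two-field struct keyed by "start" and "end". The sketch below reproduces that shape with plain pyarrow; the DATE/DATETIME/TIMESTAMP scalar pairings shown are assumptions taken from the to_dataframe docstring examples later in this commit, not from this file, and BQ_TO_ARROW_SCALAR / range_struct_type are illustrative names.

# Sketch: the Arrow type produced for a RANGE column is a struct whose
# "start"/"end" children use the element type's scalar Arrow type.
import pyarrow

# Assumed BQ-to-Arrow scalar pairings for the supported range element types.
BQ_TO_ARROW_SCALAR = {
    "DATE": pyarrow.date32,
    "DATETIME": lambda: pyarrow.timestamp("us"),
    "TIMESTAMP": lambda: pyarrow.timestamp("us", tz="UTC"),
}


def range_struct_type(element_type: str) -> pyarrow.DataType:
    arrow_element_type = BQ_TO_ARROW_SCALAR[element_type]()
    return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])


print(range_struct_type("DATE"))       # struct<start: date32[day], end: date32[day]>
print(range_struct_type("TIMESTAMP"))  # struct<start: timestamp[us, tz=UTC], end: timestamp[us, tz=UTC]>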
14 changes: 8 additions & 6 deletions google/cloud/bigquery/dbapi/_helpers.py
@@ -277,12 +277,14 @@ def complex_query_parameter(
        param = query.ArrayQueryParameter(
            name,
            sub_type,
-            value
-            if isinstance(sub_type, query.ScalarQueryParameterType)
-            else [
-                complex_query_parameter(None, v, sub_type._complex__src, base)
-                for v in value
-            ],
+            (
+                value
+                if isinstance(sub_type, query.ScalarQueryParameterType)
+                else [
+                    complex_query_parameter(None, v, sub_type._complex__src, base)
+                    for v in value
+                ]
+            ),
        )
    elif type_type == STRUCT:
        if not isinstance(value, collections_abc.Mapping):
9 changes: 9 additions & 0 deletions google/cloud/bigquery/enums.py
@@ -99,6 +99,15 @@ class DefaultPandasDTypes(enum.Enum):
TIME_DTYPE = object()
"""Specifies default time dtype"""

RANGE_DATE_DTYPE = object()
"""Specifies default range date dtype"""

RANGE_DATETIME_DTYPE = object()
"""Specifies default range datetime dtype"""

RANGE_TIMESTAMP_DTYPE = object()
"""Specifies default range timestamp dtype"""


class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
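The three new members reuse the sentinel pattern already present in DefaultPandasDTypes: a unique object() marks "argument not supplied", so an explicit None can still select the plain object dtype. A minimal sketch of that pattern follows (names are illustrative, not the library's):

# Sketch of the sentinel-default pattern used by DefaultPandasDTypes.
import enum


class _Defaults(enum.Enum):
    RANGE_DATE_DTYPE = object()  # unique marker meaning "argument not supplied"


def to_dataframe_sketch(range_date_dtype=_Defaults.RANGE_DATE_DTYPE):
    if range_date_dtype is _Defaults.RANGE_DATE_DTYPE:
        return "use the library's default range dtype"
    if range_date_dtype is None:
        return "fall back to plain object dtype"
    return f"use caller-supplied dtype: {range_date_dtype!r}"


print(to_dataframe_sketch())              # default behaviour
print(to_dataframe_sketch(None))          # explicit None -> object dtype
print(to_dataframe_sketch("ArrowDtype"))  # caller-provided dtype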
67 changes: 67 additions & 0 deletions google/cloud/bigquery/job/query.py
@@ -1784,6 +1784,13 @@ def to_dataframe(
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
timestamp_dtype: Union[Any, None] = None,
range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE,
range_datetime_dtype: Union[
Any, None
] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE,
range_timestamp_dtype: Union[
Any, None
] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE,
) -> "pandas.DataFrame":
"""Return a pandas DataFrame from a QueryJob
@@ -1919,6 +1926,63 @@ def to_dataframe(
                .. versionadded:: 3.10.0

            range_date_dtype (Optional[pandas.Series.dtype, None]):
                If set, indicate a pandas ExtensionDtype, such as:

                .. code-block:: python

                    pandas.ArrowDtype(pyarrow.struct(
                        [("start", pyarrow.date32()), ("end", pyarrow.date32())]
                    ))

                to convert BigQuery RANGE<DATE> type, instead of relying on
                the default ``object``. If you explicitly set the value to
                ``None``, the data type will be ``object``. BigQuery Range type
                can be found at:
                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type

                .. versionadded:: 3.21.0

            range_datetime_dtype (Optional[pandas.Series.dtype, None]):
                If set, indicate a pandas ExtensionDtype, such as:

                .. code-block:: python

                    pandas.ArrowDtype(pyarrow.struct(
                        [
                            ("start", pyarrow.timestamp("us")),
                            ("end", pyarrow.timestamp("us")),
                        ]
                    ))

                to convert BigQuery RANGE<DATETIME> type, instead of relying on
                the default ``object``. If you explicitly set the value to
                ``None``, the data type will be ``object``. BigQuery Range type
                can be found at:
                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type

                .. versionadded:: 3.21.0

            range_timestamp_dtype (Optional[pandas.Series.dtype, None]):
                If set, indicate a pandas ExtensionDtype, such as:

                .. code-block:: python

                    pandas.ArrowDtype(pyarrow.struct(
                        [
                            ("start", pyarrow.timestamp("us", tz="UTC")),
                            ("end", pyarrow.timestamp("us", tz="UTC")),
                        ]
                    ))

                to convert BigQuery RANGE<TIMESTAMP> type, instead of relying
                on the default ``object``. If you explicitly set the value to
                ``None``, the data type will be ``object``. BigQuery Range type
                can be found at:
                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type

                .. versionadded:: 3.21.0
Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data
@@ -1949,6 +2013,9 @@ def to_dataframe(
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
range_date_dtype=range_date_dtype,
range_datetime_dtype=range_datetime_dtype,
range_timestamp_dtype=range_timestamp_dtype,
)

# If changing the signature of this method, make sure to apply the same
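A possible end-to-end call with the new keyword arguments is sketched below. The query text and dtype value mirror the docstring examples above; the client setup assumes default credentials, a recent pandas with ArrowDtype, and the BigQuery pandas extras, none of which are part of this diff.

# Hedged usage sketch: request an Arrow-backed dtype for a RANGE<DATE> column
# when converting query results to a DataFrame.
import pandas
import pyarrow
from google.cloud import bigquery

client = bigquery.Client()  # assumes default project/credentials

query_job = client.query(
    "SELECT RANGE(DATE '2020-01-01', DATE '2020-12-31') AS date_range"
)

df = query_job.to_dataframe(
    range_date_dtype=pandas.ArrowDtype(
        pyarrow.struct([("start", pyarrow.date32()), ("end", pyarrow.date32())])
    ),
)

print(df.dtypes)
print(df["date_range"].iloc[0])  # e.g. {'start': datetime.date(2020, 1, 1), ...}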
11 changes: 5 additions & 6 deletions google/cloud/bigquery/query.py
@@ -24,14 +24,13 @@
 from google.cloud.bigquery._helpers import _rows_from_json
 from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON
 from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM
+from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS
 
 
 _SCALAR_VALUE_TYPE = Optional[
     Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]
 ]
 
-_RANGE_ELEMENT_TYPE_STR = {"TIMESTAMP", "DATETIME", "DATE"}
-
 
 class ConnectionProperty:
     """A connection-level property to customize query behavior.
@@ -388,14 +387,14 @@ def _parse_range_element_type(self, type_):
             google.cloud.bigquery.query.ScalarQueryParameterType: Instance
         """
         if isinstance(type_, str):
-            if type_ not in _RANGE_ELEMENT_TYPE_STR:
+            if type_ not in _SUPPORTED_RANGE_ELEMENTS:
                 raise ValueError(
                     "If given as a string, range element type must be one of "
                     "'TIMESTAMP', 'DATE', or 'DATETIME'."
                 )
             return ScalarQueryParameterType(type_)
         elif isinstance(type_, ScalarQueryParameterType):
-            if type_._type not in _RANGE_ELEMENT_TYPE_STR:
+            if type_._type not in _SUPPORTED_RANGE_ELEMENTS:
                 raise ValueError(
                     "If given as a ScalarQueryParameter object, range element "
                     "type must be one of 'TIMESTAMP', 'DATE', or 'DATETIME' "
@@ -960,14 +959,14 @@ class RangeQueryParameter(_AbstractQueryParameter):
     @classmethod
     def _parse_range_element_type(self, range_element_type):
         if isinstance(range_element_type, str):
-            if range_element_type not in _RANGE_ELEMENT_TYPE_STR:
+            if range_element_type not in _SUPPORTED_RANGE_ELEMENTS:
                 raise ValueError(
                     "If given as a string, range_element_type must be one of "
                     f"'TIMESTAMP', 'DATE', or 'DATETIME'. Got {range_element_type}."
                 )
             return RangeQueryParameterType(range_element_type)
         elif isinstance(range_element_type, RangeQueryParameterType):
-            if range_element_type.type_._type not in _RANGE_ELEMENT_TYPE_STR:
+            if range_element_type.type_._type not in _SUPPORTED_RANGE_ELEMENTS:
                 raise ValueError(
                     "If given as a RangeQueryParameterType object, "
                     "range_element_type must be one of 'TIMESTAMP', 'DATE', "
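On the parameter side, the validation above accepts 'DATE', 'DATETIME', or 'TIMESTAMP' as the range element type. A usage sketch follows; the exact RangeQueryParameter constructor arguments (range_element_type, start, end, name) come from Part 1 of this feature and are assumed here rather than shown in this diff.

# Hedged sketch: passing a RANGE query parameter whose element type goes
# through the validation touched in this commit.
import datetime
from google.cloud import bigquery

range_param = bigquery.RangeQueryParameter(  # constructor shape assumed, see note above
    range_element_type="DATE",
    start=datetime.date(2020, 1, 1),
    end=datetime.date(2020, 12, 31),
    name="window",
)

job_config = bigquery.QueryJobConfig(query_parameters=[range_param])
client = bigquery.Client()  # assumes default credentials/project
rows = client.query("SELECT @window AS date_window", job_config=job_config).result()
for row in rows:
    print(row["date_window"])  # expected dict shape: {"start": ..., "end": ...}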
