From 5bb1808e27bdda37232d7eb53bcdad69428df65a Mon Sep 17 00:00:00 2001 From: "chenbangduo.cbd" Date: Tue, 5 Dec 2023 20:01:22 +0800 Subject: [PATCH 1/2] Revert "[IO] Add tensor shape meta-data support for ParquetDataset. (#849)" This reverts commit 92b2271b78798f92dd63284426f73364c526806a. --- .../python/data/experimental/ops/dataframe.py | 26 +++++++++---------- .../experimental/ops/parquet_dataset_ops.py | 13 ++++------ 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/dataframe.py b/tensorflow/python/data/experimental/ops/dataframe.py index f3dc249653a..003f75259f1 100644 --- a/tensorflow/python/data/experimental/ops/dataframe.py +++ b/tensorflow/python/data/experimental/ops/dataframe.py @@ -59,17 +59,14 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d.value is None: + raise ValueError( + f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError( f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -134,16 +131,17 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.vector(batch_size) if i == 0 - else tensor_shape.vector(None)) + return self.map(lambda _: tensor_shape.vector(None)) return self.map( - lambda i: tensor_shape.vector(batch_size).concatenate(self._shape) if i == 0 + lambda i: tensor_shape.vector(None).concatenate(self._shape) if i == 0 else tensor_shape.vector(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.vector(batch_size) + @property + def output_specs(self): + shape = tensor_shape.vector(None) if self._shape is not None: shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py index 719940d1beb..86e2b9b4ec7 100644 --- a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py +++ b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py @@ -38,25 +38,23 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def __init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. """ if field.incomplete: raise ValueError( f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -80,7 +78,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -112,10 +110,9 @@ def __init__( self._fields = fields self._output_specs = { f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) + DataFrameValueSpec(f) if f.ragged_rank > 0 - else tensor_spec.TensorSpec( - shape=[batch_size if drop_remainder else None], dtype=f.dtype)) + else tensor_spec.TensorSpec(shape=[None], dtype=f.dtype)) for f in self._fields} self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) From 00e497cf0ac0ccc402aeb81da8cc7ad31923877a Mon Sep 17 00:00:00 2001 From: "chenbangduo.cbd" Date: Wed, 6 Dec 2023 10:11:36 +0800 Subject: [PATCH 2/2] [IO] Fix tensor shape meta-data bug for DataFrame Value. Signed-off-by: chenbangduo.cbd --- .../experimental/ops/parquet_dataset_ops.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py index 86e2b9b4ec7..5bb790c331d 100644 --- a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py +++ b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py @@ -22,6 +22,7 @@ from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec from tensorflow.python.util import nest @@ -108,12 +109,18 @@ def __init__( self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f) - if f.ragged_rank > 0 - else tensor_spec.TensorSpec(shape=[None], dtype=f.dtype)) - for f in self._fields} + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tensor_shape.vector(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten(