diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 44f6076d7d..207c6d3882 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: - id: codespell args: - '-w' - - '--skip="*.txt,pylintrc,.*,src/maxtext/assets/*"' + - '--skip="*.txt,pylintrc,.*,src/maxtext/assets/*,src/maxtext/input_pipeline/protos/*"' - '-L ND,nd,sems,TE,ROUGE,rouge,astroid,ags,dout' - '.' additional_dependencies: @@ -30,6 +30,7 @@ repos: args: - '--disable=R0401,R0917,W0201,W0613' - "--ignore-patterns='.pytype,.*pyi$'" + - '--ignore-paths=src/maxtext/input_pipeline/protos' - 'benchmarks' - 'src' - 'tests' @@ -47,6 +48,7 @@ repos: rev: 24.10.1 hooks: - id: pyink + exclude: src/maxtext/input_pipeline/protos/ args: - '--pyink-indentation=2' - '--line-length=122' diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py index 6db25ec0f6..e6fb9d9222 100644 --- a/src/maxtext/input_pipeline/input_pipeline_utils.py +++ b/src/maxtext/input_pipeline/input_pipeline_utils.py @@ -21,17 +21,17 @@ if TYPE_CHECKING: import datasets + import tensorflow as tf import grain.python as grain import numpy as np -import tensorflow as tf +from maxtext.input_pipeline.protos import example_pb2 from maxtext.input_pipeline import tokenizer from maxtext.multimodal import processor as mm_processor from maxtext.multimodal import utils as mm_utils from maxtext.utils import max_logging -Features = dict[str, tf.Tensor] -AUTOTUNE = tf.data.experimental.AUTOTUNE +Features = dict[str, Any] INPUT_TOKENS_KEY = "input_ids" ########## Functions used by TFDS pipeline @@ -58,6 +58,8 @@ def shift_data_by_truncation(x): def add_segmentation_and_position(x, data_columns, padding_token=0): + import tensorflow as tf # pylint: disable=import-outside-toplevel + for data_column in data_columns: x[f"{data_column}_segmentation"] = tf.cast(x[data_column] != padding_token, tf.int32) x[f"{data_column}_position"] = tf.broadcast_to( @@ -68,6 
+70,7 @@ def add_segmentation_and_position(x, data_columns, padding_token=0): def TokenizeOp(tokenizer_model, features: Features, data_keys: Iterable[str] = ("inputs", "targets")) -> Features: """Op for tokenization""" + import tensorflow as tf # pylint: disable=import-outside-toplevel def _process_string(string_tensor): # Extract string value and decode it if necessary @@ -421,20 +424,23 @@ class ParseFeatures(grain.MapTransform): def __init__(self, data_columns, tokenize): self.data_columns = data_columns - if tokenize: - self.dtype = tf.string - else: - self.dtype = tf.int64 + self.tokenize = tokenize def map(self, element): - def _parse(example): - parsed = tf.io.parse_example( - example, - {col: tf.io.FixedLenSequenceFeature([], dtype=self.dtype, allow_missing=True) for col in self.data_columns}, - ) - return parsed - - return _parse(element) + """Parse a serialized tf.train.Example proto and extract features.""" + example = example_pb2.Example() + example.ParseFromString(element) + features = example.features.feature + + parsed = {} + for col in self.data_columns: + if col in features: + f = features[col] + if self.tokenize: + parsed[col] = np.array(f.bytes_list.value, dtype=object) + else: + parsed[col] = np.array(f.int64_list.value, dtype=np.int32) + return parsed @dataclasses.dataclass @@ -447,9 +453,9 @@ def __init__(self, column_names, tokenize): def map(self, element): if self.tokenize: - return {col: element[col].numpy()[0].decode() for col in self.column_names} + return {col: element[col][0].decode() for col in self.column_names} else: - return {col: element[col].numpy() for col in self.column_names} + return {col: element[col] for col in self.column_names} @dataclasses.dataclass diff --git a/src/maxtext/input_pipeline/protos/__init__.py b/src/maxtext/input_pipeline/protos/__init__.py new file mode 100644 index 0000000000..f3582c0090 --- /dev/null +++ b/src/maxtext/input_pipeline/protos/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023–2026 Google LLC +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/maxtext/input_pipeline/protos/example.proto b/src/maxtext/input_pipeline/protos/example.proto new file mode 100644 index 0000000000..c4a59a6603 --- /dev/null +++ b/src/maxtext/input_pipeline/protos/example.proto @@ -0,0 +1,302 @@ +// Copy of https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/example.proto +// Protocol messages for describing input data Examples for machine learning +// model training or inference. +syntax = "proto3"; + +package maxtext.input_pipeline.protos; + +import "maxtext/input_pipeline/protos/feature.proto"; + +option cc_enable_arenas = true; +option java_outer_classname = "ExampleProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.example"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto"; + +// An Example is a mostly-normalized data format for storing data for +// training and inference. It contains a key-value store (features); where +// each key (string) maps to a Feature message (which is oneof packed BytesList, +// FloatList, or Int64List). This flexible and compact format allows the +// storage of large amounts of typed data, but requires that the data shape +// and use be determined by the configuration files and parsers that are used to +// read and write this format. That is, the Example is mostly *not* a +// self-describing format. 
In TensorFlow, Examples are read in row-major +// format, so any configuration that describes data with rank-2 or above +// should keep this in mind. If you flatten a matrix into a FloatList it should +// be stored as [ row 0 ... row 1 ... row M-1 ] +// +// An Example for a movie recommendation application: +// features { +// feature { +// key: "age" +// value { float_list { +// value: 29.0 +// }} +// } +// feature { +// key: "movie" +// value { bytes_list { +// value: "The Shawshank Redemption" +// value: "Fight Club" +// }} +// } +// feature { +// key: "movie_ratings" +// value { float_list { +// value: 9.0 +// value: 9.7 +// }} +// } +// feature { +// key: "suggestion" +// value { bytes_list { +// value: "Inception" +// }} +// } +// # Note that this feature exists to be used as a label in training. +// # E.g., if training a logistic regression model to predict purchase +// # probability in our learning tool we would set the label feature to +// # "suggestion_purchased". +// feature { +// key: "suggestion_purchased" +// value { float_list { +// value: 1.0 +// }} +// } +// # Similar to "suggestion_purchased" above this feature exists to be used +// # as a label in training. +// # E.g., if training a linear regression model to predict purchase +// # price in our learning tool we would set the label feature to +// # "purchase_price". +// feature { +// key: "purchase_price" +// value { float_list { +// value: 9.99 +// }} +// } +// } +// +// A conformant Example data set obeys the following conventions: +// - If a Feature K exists in one example with data type T, it must be of +// type T in all other examples when present. It may be omitted. +// - The number of instances of Feature K list data may vary across examples, +// depending on the requirements of the model. +// - If a Feature K doesn't exist in an example, a K-specific default will be +// used, if configured. 
+// - If a Feature K exists in an example but contains no items, the intent +// is considered to be an empty tensor and no default will be used. + +message Example { + Features features = 1; +} + +// A SequenceExample is an Example representing one or more sequences, and +// some context. The context contains features which apply to the entire +// example. The feature_lists contain a key, value map where each key is +// associated with a repeated set of Features (a FeatureList). +// A FeatureList thus represents the values of a feature identified by its key +// over time / frames. +// +// Below is a SequenceExample for a movie recommendation application recording a +// sequence of ratings by a user. The time-independent features ("locale", +// "age", "favorites") describing the user are part of the context. The sequence +// of movies the user rated are part of the feature_lists. For each movie in the +// sequence we have information on its name and actors and the user's rating. +// This information is recorded in three separate feature_list(s). +// In the example below there are only two movies. All three feature_list(s), +// namely "movie_ratings", "movie_names", and "actors" have a feature value for +// both movies. Note, that "actors" is itself a bytes_list with multiple +// strings per movie. 
+// +// context: { +// feature: { +// key : "locale" +// value: { +// bytes_list: { +// value: [ "pt_BR" ] +// } +// } +// } +// feature: { +// key : "age" +// value: { +// float_list: { +// value: [ 19.0 ] +// } +// } +// } +// feature: { +// key : "favorites" +// value: { +// bytes_list: { +// value: [ "Majesty Rose", "Savannah Outen", "One Direction" ] +// } +// } +// } +// } +// feature_lists: { +// feature_list: { +// key : "movie_ratings" +// value: { +// feature: { +// float_list: { +// value: [ 4.5 ] +// } +// } +// feature: { +// float_list: { +// value: [ 5.0 ] +// } +// } +// } +// } +// feature_list: { +// key : "movie_names" +// value: { +// feature: { +// bytes_list: { +// value: [ "The Shawshank Redemption" ] +// } +// } +// feature: { +// bytes_list: { +// value: [ "Fight Club" ] +// } +// } +// } +// } +// feature_list: { +// key : "actors" +// value: { +// feature: { +// bytes_list: { +// value: [ "Tim Robbins", "Morgan Freeman" ] +// } +// } +// feature: { +// bytes_list: { +// value: [ "Brad Pitt", "Edward Norton", "Helena Bonham Carter" ] +// } +// } +// } +// } +// } +// +// A conformant SequenceExample data set obeys the following conventions: +// +// Context: +// - All conformant context features K must obey the same conventions as +// a conformant Example's features (see above). +// Feature lists: +// - A FeatureList L may be missing in an example; it is up to the +// parser configuration to determine if this is allowed or considered +// an empty list (zero length). +// - If a FeatureList L exists, it may be empty (zero length). +// - If a FeatureList L is non-empty, all features within the FeatureList +// must have the same data type T. Even across SequenceExamples, the type T +// of the FeatureList identified by the same key must be the same. An entry +// without any values may serve as an empty feature. 
+// - If a FeatureList L is non-empty, it is up to the parser configuration +// to determine if all features within the FeatureList must +// have the same size. The same holds for this FeatureList across multiple +// examples. +// - For sequence modeling, e.g.: +// http://colah.github.io/posts/2015-08-Understanding-LSTMs/ +// https://github.com/tensorflow/nmt +// the feature lists represent a sequence of frames. +// In this scenario, all FeatureLists in a SequenceExample have the same +// number of Feature messages, so that the ith element in each FeatureList +// is part of the ith frame (or time step). +// Examples of conformant and non-conformant examples' FeatureLists: +// +// Conformant FeatureLists: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// +// Non-conformant FeatureLists (mismatched types): +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { int64_list: { value: [ 5 ] } } } +// } } +// +// Conditionally conformant FeatureLists, the parser configuration determines +// if the feature sizes must match: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0, 6.0 ] } } } +// } } +// +// Conformant pair of SequenceExample +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// and: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } +// feature: { float_list: { value: [ 2.0 ] } } } +// } } +// +// Conformant pair of SequenceExample +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { 
feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// and: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { } +// } } +// +// Conditionally conformant pair of SequenceExample, the parser configuration +// determines if the second feature_lists is consistent (zero-length) or +// invalid (missing "movie_ratings"): +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// and: +// feature_lists: { } +// +// Non-conformant pair of SequenceExample (mismatched types) +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// and: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { int64_list: { value: [ 4 ] } } +// feature: { int64_list: { value: [ 5 ] } } +// feature: { int64_list: { value: [ 2 ] } } } +// } } +// +// Conditionally conformant pair of SequenceExample; the parser configuration +// determines if the feature sizes must match: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.5 ] } } +// feature: { float_list: { value: [ 5.0 ] } } } +// } } +// and: +// feature_lists: { feature_list: { +// key: "movie_ratings" +// value: { feature: { float_list: { value: [ 4.0 ] } } +// feature: { float_list: { value: [ 5.0, 3.0 ] } } +// } } + +message SequenceExample { + Features context = 1; + FeatureLists feature_lists = 2; +} diff --git a/src/maxtext/input_pipeline/protos/example_pb2.py b/src/maxtext/input_pipeline/protos/example_pb2.py new file mode 100644 index 0000000000..fb1922a75b --- /dev/null +++ b/src/maxtext/input_pipeline/protos/example_pb2.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: maxtext/input_pipeline/protos/example.proto +# Protobuf Python Version: 5.29.5 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 29, + 5, + '', + 'maxtext/input_pipeline/protos/example.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from maxtext.input_pipeline.protos import feature_pb2 as maxtext_dot_input__pipeline_dot_protos_dot_feature__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n+maxtext/input_pipeline/protos/example.proto\x12\x1dmaxtext.input_pipeline.protos\x1a+maxtext/input_pipeline/protos/feature.proto\"D\n\x07\x45xample\x12\x39\n\x08\x66\x65\x61tures\x18\x01 \x01(\x0b\x32\'.maxtext.input_pipeline.protos.Features\"\x8f\x01\n\x0fSequenceExample\x12\x38\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\'.maxtext.input_pipeline.protos.Features\x12\x42\n\rfeature_lists\x18\x02 \x01(\x0b\x32+.maxtext.input_pipeline.protos.FeatureListsB\x81\x01\n\x16org.tensorflow.exampleB\rExampleProtosP\x01ZSgithub.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto\xf8\x01\x01\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'maxtext.input_pipeline.protos.example_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'\n\026org.tensorflow.exampleB\rExampleProtosP\001ZSgithub.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto\370\001\001' + 
_globals['_EXAMPLE']._serialized_start=123 + _globals['_EXAMPLE']._serialized_end=191 + _globals['_SEQUENCEEXAMPLE']._serialized_start=194 + _globals['_SEQUENCEEXAMPLE']._serialized_end=337 +# @@protoc_insertion_point(module_scope) diff --git a/src/maxtext/input_pipeline/protos/feature.proto b/src/maxtext/input_pipeline/protos/feature.proto new file mode 100644 index 0000000000..6960a5bbd6 --- /dev/null +++ b/src/maxtext/input_pipeline/protos/feature.proto @@ -0,0 +1,111 @@ +// Copy of https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/feature.proto +// Protocol messages for describing features for machine learning model +// training or inference. +// +// There are three base Feature types: +// - bytes +// - float +// - int64 +// +// A Feature contains Lists which may hold zero or more values. These +// lists are the base values BytesList, FloatList, Int64List. +// +// Features are organized into categories by name. The Features message +// contains the mapping from name to Feature. 
+// +// Example Features for a movie recommendation application: +// feature { +// key: "age" +// value { float_list { +// value: 29.0 +// }} +// } +// feature { +// key: "movie" +// value { bytes_list { +// value: "The Shawshank Redemption" +// value: "Fight Club" +// }} +// } +// feature { +// key: "movie_ratings" +// value { float_list { +// value: 9.0 +// value: 9.7 +// }} +// } +// feature { +// key: "suggestion" +// value { bytes_list { +// value: "Inception" +// }} +// } +// feature { +// key: "suggestion_purchased" +// value { int64_list { +// value: 1 +// }} +// } +// feature { +// key: "purchase_price" +// value { float_list { +// value: 9.99 +// }} +// } +// + +syntax = "proto3"; + +package maxtext.input_pipeline.protos; + +option cc_enable_arenas = true; +option java_outer_classname = "FeatureProtos"; +option java_multiple_files = true; +option java_package = "org.tensorflow.example"; +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto"; + +// LINT.IfChange +// Containers to hold repeated fundamental values. +message BytesList { + repeated bytes value = 1; +} +message FloatList { + repeated float value = 1 [packed = true]; +} +message Int64List { + repeated int64 value = 1 [packed = true, jstype = JS_STRING]; +} + +// Containers for non-sequential data. +message Feature { + // Each feature can be exactly one kind. + oneof kind { + BytesList bytes_list = 1; + FloatList float_list = 2; + Int64List int64_list = 3; + } +} + +message Features { + // Map from feature name to feature. + map feature = 1; +} + +// Containers for sequential data. +// +// A FeatureList contains lists of Features. These may hold zero or more +// Feature values. +// +// FeatureLists are organized into categories by name. The FeatureLists message +// contains the mapping from name to FeatureList. +// +message FeatureList { + repeated Feature feature = 1; +} + +message FeatureLists { + // Map from feature name to feature list. 
+ map feature_list = 1; +} +// LINT.ThenChange( +// https://www.tensorflow.org/code/tensorflow/python/training/training.py) diff --git a/src/maxtext/input_pipeline/protos/feature_pb2.py b/src/maxtext/input_pipeline/protos/feature_pb2.py new file mode 100644 index 0000000000..8d2ecae46e --- /dev/null +++ b/src/maxtext/input_pipeline/protos/feature_pb2.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: maxtext/input_pipeline/protos/feature.proto +# Protobuf Python Version: 5.29.5 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 29, + 5, + '', + 'maxtext/input_pipeline/protos/feature.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n+maxtext/input_pipeline/protos/feature.proto\x12\x1dmaxtext.input_pipeline.protos\"\x1a\n\tBytesList\x12\r\n\x05value\x18\x01 \x03(\x0c\"\x1e\n\tFloatList\x12\x11\n\x05value\x18\x01 \x03(\x02\x42\x02\x10\x01\" \n\tInt64List\x12\x13\n\x05value\x18\x01 \x03(\x03\x42\x04\x10\x01\x30\x01\"\xd1\x01\n\x07\x46\x65\x61ture\x12>\n\nbytes_list\x18\x01 \x01(\x0b\x32(.maxtext.input_pipeline.protos.BytesListH\x00\x12>\n\nfloat_list\x18\x02 \x01(\x0b\x32(.maxtext.input_pipeline.protos.FloatListH\x00\x12>\n\nint64_list\x18\x03 \x01(\x0b\x32(.maxtext.input_pipeline.protos.Int64ListH\x00\x42\x06\n\x04kind\"\xa9\x01\n\x08\x46\x65\x61tures\x12\x45\n\x07\x66\x65\x61ture\x18\x01 
\x03(\x0b\x32\x34.maxtext.input_pipeline.protos.Features.FeatureEntry\x1aV\n\x0c\x46\x65\x61tureEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x35\n\x05value\x18\x02 \x01(\x0b\x32&.maxtext.input_pipeline.protos.Feature:\x02\x38\x01\"F\n\x0b\x46\x65\x61tureList\x12\x37\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32&.maxtext.input_pipeline.protos.Feature\"\xc2\x01\n\x0c\x46\x65\x61tureLists\x12R\n\x0c\x66\x65\x61ture_list\x18\x01 \x03(\x0b\x32<.maxtext.input_pipeline.protos.FeatureLists.FeatureListEntry\x1a^\n\x10\x46\x65\x61tureListEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x39\n\x05value\x18\x02 \x01(\x0b\x32*.maxtext.input_pipeline.protos.FeatureList:\x02\x38\x01\x42\x81\x01\n\x16org.tensorflow.exampleB\rFeatureProtosP\x01ZSgithub.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto\xf8\x01\x01\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'maxtext.input_pipeline.protos.feature_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'\n\026org.tensorflow.exampleB\rFeatureProtosP\001ZSgithub.com/tensorflow/tensorflow/tensorflow/go/core/example/example_protos_go_proto\370\001\001' + _globals['_FLOATLIST'].fields_by_name['value']._loaded_options = None + _globals['_FLOATLIST'].fields_by_name['value']._serialized_options = b'\020\001' + _globals['_INT64LIST'].fields_by_name['value']._loaded_options = None + _globals['_INT64LIST'].fields_by_name['value']._serialized_options = b'\020\0010\001' + _globals['_FEATURES_FEATUREENTRY']._loaded_options = None + _globals['_FEATURES_FEATUREENTRY']._serialized_options = b'8\001' + _globals['_FEATURELISTS_FEATURELISTENTRY']._loaded_options = None + _globals['_FEATURELISTS_FEATURELISTENTRY']._serialized_options = b'8\001' + _globals['_BYTESLIST']._serialized_start=78 + _globals['_BYTESLIST']._serialized_end=104 + 
_globals['_FLOATLIST']._serialized_start=106 + _globals['_FLOATLIST']._serialized_end=136 + _globals['_INT64LIST']._serialized_start=138 + _globals['_INT64LIST']._serialized_end=170 + _globals['_FEATURE']._serialized_start=173 + _globals['_FEATURE']._serialized_end=382 + _globals['_FEATURES']._serialized_start=385 + _globals['_FEATURES']._serialized_end=554 + _globals['_FEATURES_FEATUREENTRY']._serialized_start=468 + _globals['_FEATURES_FEATUREENTRY']._serialized_end=554 + _globals['_FEATURELIST']._serialized_start=556 + _globals['_FEATURELIST']._serialized_end=626 + _globals['_FEATURELISTS']._serialized_start=629 + _globals['_FEATURELISTS']._serialized_end=823 + _globals['_FEATURELISTS_FEATURELISTENTRY']._serialized_start=729 + _globals['_FEATURELISTS_FEATURELISTENTRY']._serialized_end=823 +# @@protoc_insertion_point(module_scope) diff --git a/src/maxtext/input_pipeline/protos/generate_proto.sh b/src/maxtext/input_pipeline/protos/generate_proto.sh new file mode 100644 index 0000000000..a21ad72191 --- /dev/null +++ b/src/maxtext/input_pipeline/protos/generate_proto.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Copyright 2023–2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script compiles feature.proto and example.proto into the checked-in feature_pb2.py and example_pb2.py modules. +# Install protoc with a version matching your protobuf version.
+# For example, to install protoc 29.5 that matches protobuf 5.29.5: +# PROTOC_VERSION=29.5 +# PROTOC_ARCH=$(uname -m) +# if [ "$PROTOC_ARCH" = "x86_64" ]; then ARCH="x86_64"; elif [ "$PROTOC_ARCH" = "aarch64" ]; then ARCH="aarch_64"; fi +# URL="https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-${ARCH}.zip" +# curl -L $URL -o protoc.zip && unzip protoc.zip -d protoc_temp +# cp protoc_temp/bin/protoc ~/.local/bin/protoc && cp -r protoc_temp/include/* ~/.local/include/ +# rm -rf protoc.zip protoc_temp + +# Run from the maxtext repository root; change into src/ so the proto import paths resolve. +cd ./src + +# Compile the .proto definitions into *_pb2.py Python modules. +protoc --proto_path=. --python_out=. \ +maxtext/input_pipeline/protos/feature.proto \ +maxtext/input_pipeline/protos/example.proto