# **Schema Generation**

Here we will generate a schema and try to place some reasonable constraints on the data. 

We will create the schema based on all of the available data. For now, the schema will only have information about the tags. 

In [1]:
import os
import pprint
import tempfile
import urllib
import numpy as np

import pandas as pd

import absl
import tensorflow as tf
import tensorflow_data_validation as tfdv
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import ExampleValidator
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.extensions.google_cloud_big_query.example_gen.component import (
    BigQueryExampleGen,
)

from tfx.components import ImporterNode
from tfx.types.standard_artifacts import Schema

from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.proto import example_gen_pb2

from tensorflow_metadata.proto.v0 import schema_pb2 
from tensorflow_metadata.proto.v0 import statistics_pb2
from tensorflow_metadata.proto.v0 import anomalies_pb2

from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input



In [2]:
# Name of the interactive pipeline and a fresh temporary directory to use as its root.
_pipeline_name = 'interactive_pipeline'
_pipeline_root = tempfile.mkdtemp(prefix='pipeline')

# Context object used by the rest of the notebook to run and display components.
context = InteractiveContext(
    pipeline_root=_pipeline_root,
    pipeline_name=_pipeline_name,
)



We query only 100 rows, which is enough to infer the types of the columns we will use during training. We generate the vocabulary for the tags separately (outside of TFX): generating it within TFX would require writing TFRecords for every single example, which is unnecessary. Instead, we first generate a schema from a small amount of data (mostly to get the types of all features), then separately generate the vocabulary and set the domain of the tags manually. 

See below for how this is accomplished. 

In [8]:
# SQL handed to BigQueryExampleGen in the next cell. It selects the long synopsis
# (aliased to `synopsis`) and the `tags` column, capped at 100 rows.
# The line starting with `#` inside the string is a SQL comment holding an
# alternative (currently unused) source table.
query = """
SELECT 
    program_longsynopsis as synopsis,
    tags
FROM 
    #`metadata_sky.merlin_movie_series_data_small`
    `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data`
LIMIT 100
"""

In [10]:
%%time
output = example_gen_pb2.Output(
             split_config=example_gen_pb2.SplitConfig(splits=[
                 example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=10)
             ],
    ))
example_gen = BigQueryExampleGen(query=query, output_config=output)
context.run(example_gen, beam_pipeline_args=['--project', 'res-nbcupea-dev-ds-sandbox-001', '--temp_location', 'gs://metadata-bucket-sky-test/tmp'])



CPU times: user 1.18 s, sys: 190 ms, total: 1.37 s
Wall time: 17.6 s


0,1
.execution_id,4
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } BigQueryExampleGen at 0x7fd9a5e90490.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0.exec_properties['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } 
}['custom_config']None['_beam_pipeline_args'][0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpipwo7m1_/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"
.exec_properties,"['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None['_beam_pipeline_args'][0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpipwo7m1_/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4
.span,0
.split_names,"[""train""]"
.version,0

0,1
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data`\nLIMIT 100\n""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }"
['custom_config'],
['_beam_pipeline_args'],[0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpipwo7m1_/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0

0,1
[0],--project
[1],res-nbcupea-dev-ds-sandbox-001
[2],--temp_location
[3],gs://metadata-bucket-sky-test/tmp
[4],--extra_package=/tmp/tmpipwo7m1_/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz
[5],--labels
[6],tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor
[7],--labels
[8],tfx_py_version=3-7
[9],--labels

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4
.span,0
.split_names,"[""train""]"
.version,0


**Inspect the data**

In [11]:
def make_dataset(examples, split):
    """
    Create a dataset from examples artifact

    Parameters
    ----------
    examples
        Object whose ``get()`` returns a sequence of artifacts; the ``uri`` of
        the first artifact is used as the root directory.
    split
        Name of the sub-directory under that root holding the split's files
        (e.g. ``'train'``).

    Returns
    -------
    A ``tf.data.TFRecordDataset`` reading every file in the split directory
    with GZIP compression, in sorted file-name order.
    """
    split_dir = os.path.join(examples.get()[0].uri, split)

    # os.listdir() returns entries in arbitrary order. Sort them so the record
    # order (and therefore what `.take(1)` shows) is reproducible across runs.
    tfrecord_filenames = [
        os.path.join(split_dir, name) for name in sorted(os.listdir(split_dir))
    ]

    return tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

In [12]:
train_dataset = make_dataset(example_gen.outputs['examples'], 'train')

# Count the records by folding a running total over the dataset.
record_count = train_dataset.reduce(0, lambda total, _: total + 1)
print('TRAIN DATASET SIZE: {}'.format(record_count))

print('{:=^80}'.format(' Example '))

# Take only the first record, parse it as a tf.train.Example and pretty-print it.
for tfrecord in train_dataset.take(1):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example.FromString(serialized_example)
    pp.pprint(example)

TRAIN DATASET SIZE: 100
features {
  feature {
    key: "synopsis"
    value {
      bytes_list {
        value: "Steamed lobster; grilled lobster and asparagus; lobster pizza. Seafood lasagna; bay scallops; soba noodles; ginger-root salsa. Poached salmon; catfish enchiladas; spring trout; pesce piquante. Tortilla soup; red-pepper mousse; artichoke salad; squash salad. Vegetarian shepherd\'s pie; ginger/squash soup; eggplant quinelles and pasta. Caper/sweet potato muffins; jam; biscuits; onion snacking bread. Xochile soup; pinoles rice; sweet-potato pancakes; braised chickpeas. Herbs of fennel Provencal; ratatouille; eggplant with crab meat; bread stuffing. Fajitas with salsa; grilled halibut; lime-marinated emu. Herbal rice cakes, roasted eggplant and tomatoes, vegetable medley; the connection between diet and hypertension. Chicken with onions, tomatoes and vinegar; grilled rabbit; roasted butternut squash. Dark chocolate/mint velvet; chocolate strawberries; petits pots; chocolate/but

**The following query allows us to get the entire tag vocab**. This takes about 1 second to run, as opposed to several minutes to generate examples for every row of data and then calculate the statistics for that data. 

In [13]:
%%bigquery tag_vocab_df

-- Flatten the repeated tags column to one row per tag, then keep each distinct value once.
-- NOTE(review): assumes the BigQuery cell magic is already loaded
-- (e.g. via load_ext google.cloud.bigquery); no such call is visible here - confirm.
SELECT DISTINCT
  labels
FROM
  `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data`
CROSS JOIN
  UNNEST(tags) AS labels

Unfortunately, this also causes us to calculate a domain for things like synopsis and content_id, which we don't want. 

In [14]:
# Ask TFDV for 1000 rank-histogram buckets when computing the statistics.
stats_options = tfdv.StatsOptions(
    num_rank_histogram_buckets=1000,
)

# Compute statistics over the examples produced by BigQueryExampleGen.
statistics_gen = StatisticsGen(
    stats_options=stats_options,
    examples=example_gen.outputs['examples'],
)
context.run(statistics_gen)

0,1
.execution_id,5
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7fd9a5d40fd0.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fd9a5eb1690.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  
objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""].exec_properties['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fd9a5eb1690.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fd9a5eb1690.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"
.exec_properties,"['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}['exclude_splits'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fd9a5eb1690.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5
.span,0
.split_names,"[""train""]"

0,1
['stats_options_json'],"{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}"
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd9a5e907d0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4) at 0x7fd9a5e90790.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinewe1rfxi_/BigQueryExampleGen/examples/4
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fd9a5eb1690.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5) at 0x7fd9a5e312d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinewe1rfxi_/StatisticsGen/statistics/5
.span,0
.split_names,"[""train""]"


Let's look at the statistics:

In [15]:
# Render the 'statistics' output of statistics_gen inline in the notebook.
context.show(statistics_gen.outputs['statistics'])

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Some useful takeaways from these statistics:
 + Data is never missing (constraint)
 + Can potentially do some thresholds on labels + tags in order to detect drift 
     + Hard to say what is reasonable here, but we can just start with some value and see what happens
 + Can set the label and tag vocabulary if we'd like ("domain") 
     + Can also set constraints on the domain --> e.g. labels should always come from the observed domain. 
 + Need to increase the number of rank_histogram_buckets. 

In [16]:
# URI of the first (index 0) 'statistics' artifact exposed by statistics_gen.
stats_path = statistics_gen.outputs['statistics'].get()[0].uri
# The statistics file is read from the fixed 'train/stats_tfrecord' sub-path of that artifact.
train_statistics_path = os.path.join(stats_path, 'train/stats_tfrecord')

# Load the statistics into memory; they are the input to schema inference below.
train_statistics = tfdv.load_statistics(train_statistics_path)

In [17]:
# Infer the schema from the train statistics. max_string_domain_size is 50 here,
# and the schema displayed below shows no inferred domain for 'tags'; the full
# tag domain is set manually further down rather than inferred.
schema = tfdv.infer_schema(train_statistics, infer_feature_shape=True, max_string_domain_size=50)

In [18]:
# Show the inferred schema (type, presence, valency and domain per feature).
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',BYTES,required,"[1,inf)",-


**Below is where we set the domain manually**

In [19]:
# Keep a handle on the 'tags' feature so constraints can be attached to it afterwards.
tags = tfdv.get_feature(schema, 'tags')
# Set the 'tags' domain from tag_vocab_df; np.hstack joins the rows of
# tag_vocab_df.values into a single sequence of domain values.
# NOTE(review): tag_vocab_df is defined outside this section; presumably it is the
# separately generated tag vocabulary — confirm.
tfdv.set_domain(schema, 'tags', schema_pb2.StringDomain(value=np.hstack(tag_vocab_df.values)))

In [20]:
# Per the TensorFlow Metadata schema definition, min_domain_mass is the minimum
# fraction of values that must come from the feature's domain (here 95%).
tags.distribution_constraints.min_domain_mass = 0.95
# Drift check on 'tags': threshold on the L-infinity distance used by the drift
# comparator. NOTE(review): 0.05 is a starting value, not a tuned one — revisit
# once real drift results are available.
tags.drift_comparator.infinity_norm.threshold = 0.05

In [21]:
# Persist the schema in text form at ../src/schema/schema.pbtxt (path is relative
# to the notebook's working directory).
tfdv.write_schema_text(schema, '../src/schema/schema.pbtxt')

---

In [22]:
# Read the schema back from disk so the written file itself can be inspected.
loaded_schema = tfdv.load_schema_text('../src/schema/schema.pbtxt')

In [23]:
def get_domain_size(schema_path, feature):
    """Return how many values the domain of ``feature`` contains.

    The schema is read from ``schema_path`` with ``tfdv.load_schema_text`` and
    the feature's domain is looked up with ``tfdv.get_domain``.

    Args:
        schema_path: Path of the text-format schema file to load.
        feature: Feature whose domain is looked up in the loaded schema.

    Returns:
        ``len(domain.value)`` for the feature's domain.

    Note:
        Assumes the returned domain exposes a ``value`` field (as a string
        domain does); other domain kinds are not handled here.
    """
    loaded = tfdv.load_schema_text(schema_path)
    return len(tfdv.get_domain(loaded, feature).value)

We can confirm that the loaded schema contains the entire tag domain, as well as both features and their types!

In [24]:
# Show the schema that was read back from disk, including the 'tags' domain values.
tfdv.display_schema(loaded_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',STRING,required,"[1,inf)",'tags_domain'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'tags_domain',"'Basketball', 'Horror', 'Newscast', 'Consumer', 'Hockey', 'Standup', 'Olympics', 'Crafts', 'Water Sports', 'Cartoon', 'Advice', 'Badminton', 'Table Tennis', 'Ski Jumping', 'Men\'s Gundersen Normal Hill / 10km', 'Men\'s Single', 'Men\'s 4 x 7.5km Relay', 'Behind the Scenes', 'Fantasy', 'Sports', 'Pets', 'Fashion', 'Motorcycle', 'Valentine\'s Day', 'Rugby union', 'Latin Music', 'Multi-sport event', 'Women\'s 10km Pursuit', 'Mixed Team', 'Women\'s Snowboard Cross', 'Men\'s Parallel Giant Slalom', 'Coming of Age', 'Men\'s 15km Mass Start', 'Women\'s Ski Cross', 'Food', 'preschoolers (ages 2-4)', 'Skateboarding', 'Rio 2016', 'Cycling', 'Bobsledding', 'Women\'s Curling', 'Men\'s Snowboard Cross', 'Water Skiing', 'Canada Day', 'Rowing', 'Men\'s Super G', 'Retrospective', 'Men\'s Downhill', 'Women\'s 7.5km Sprint', 'Serial', 'Hip-Hop & Rap Music', 'Computers', 'Dance', 'Mixed Martial Arts', 'Paranormal', 'Netflix Original', 'Rodeo', 'Pop Music', 'Tennis', 'Rock Music', 'Heavy Metal Music', 'Sequel', 'Ringuette', 'Ska Music', 'Modern Pentathlon', 'Komödie', 'Dating', 'Short Subject', 'Men\'s 4 x 10km Relay', 'Men\'s Individual', 'Women\'s Giant Slalom', 'Investigative', 'Newsmagazine', 'Self improvement', 'Animated', 'Anthology', 'Gay and Lesbian', 'Motor Sports', 'Christmas', 'World Music', 'Track & Field', 'Aerobics', 'Rhythmic Gymnastics', 'Short track speed skating', 'Bull riding', 'Women\'s Halfpipe', 'Dog sled', 'Women\'s 1000m', 'Polo', 'Magazine', 'Women\'s Aerials', 'Nordic Combined', 'Women\'s 30km Classic Style', 'Mixed Ice Dance', 'Highlights', 'Politics & Government', 'Special', 'Thanksgiving', 'Volleyball', 'Equestrian', 'Curling', 'Docudrama', 'Men\'s Curling', 'Australian Football', 'Women\'s 500m', 'Men\'s Aerials', 'Men\'s Individual - Large Hill', 'Women\'s Big Air', 'Women\'s Slopestyle', 'Arts & Literature', 'Women\'s Sprint Classic Style', 'Men\'s Sprint Classic Style', 'Women\'s 15km Individual', 'Hydroplane racing', 'Outdoors', 
'History', 'Sports talk', 'Hunting', 'Horse Racing', 'Wall Street', 'Pro Wrestling', 'Documentary drama', 'Swimming', 'Handball', 'Bollywood', 'eSports', 'Ballet', 'Women\'s Team Pursuit 6 Laps', 'Women\'s Ice Hockey', 'Marathon', 'Mixed Curling', 'Men\'s Team Large Hill / 4 x 5km', 'Blackjack', 'Mixed 2 x 6 + 2 x 7.5km Relay', 'Men\'s Gundersen Large Hill / 10km', 'Drift racing', 'Women\'s 1500m', 'Trains', 'Romance', 'Variety', 'Collectibles', 'Country Music', 'Bowling', 'Biathlon', 'Card Games', 'Taekwondo', 'Running', 'Hanukkah', 'Sumo Wrestling', 'Men\'s 1000m', 'Men\'s Team Sprint Freestyle', 'July 4th', 'Family Entertainment', 'Racquetball', 'Cinco de Mayo', 'Freestyle Skiing', 'Men\'s Big Air', 'Reggae Music', 'Public Affairs', 'SportingEvent', 'Baseball', 'Comedy drama', 'American history', 'Musical', 'Boxing', 'Triathlon', 'Drag Racing', 'Motorcycle Racing', 'Rugby', 'Fencing', 'Cross-Country Skiing', 'Bicycle racing', 'Arm Wrestling', 'Men\'s 5000m Relay', 'Blues Music', 'Pro', 'Men\'s Halfpipe', 'Floorball', 'Men\'s 10000m', 'Teens', 'R&B Music', 'News', 'Performing arts', 'Parenting', 'Miniseries', 'Shooting', 'Horse', 'PyeongChang 2018', 'Parade', 'Wrestling', 'Summer Olympics', 'Alpine Skiing', 'Field Hockey', 'Women\'s Single', 'Synchronized Swimming', 'St. 
Patrick\'s Day', 'Women\'s Downhill', 'Women\'s Super G', 'Intl soccer', 'Drama', 'Game Show', 'Interview', 'Medical', 'Holiday', 'Romantic comedy', 'Gymnastics', 'Halloween', 'Mountain Biking', 'Surfing', 'Women\'s 10km Freestyle', 'Quebec Production', 'Women\'s Individual - Normal Hill', 'Art de Vivre', 'Men\'s 5000m', 'Women\'s 4 x 6km Relay', 'Women\'s Slalom', 'Bowls', 'Beach soccer', 'Legal', 'Anime', 'Sports non-event', 'Mystery', 'Western', 'tweens (ages 10-12)', 'Weddings', 'Independent', 'Men\'s Giant Slalom', 'Easter', 'Bluegrass Music', 'Soul Music', 'Women\'s Team Sprint Freestyle', 'Mixed Team - Relay', 'Men\'s 20km Individual', 'Women\'s Mass Start 16 Laps', 'Travel', 'Sitcom', 'Home & Garden', 'Home improvement', 'Shopping', 'Nature', 'Skeleton', 'Martial arts', 'Animated Comedy', 'Rugby league', 'Classic Movies', 'Bodybuilding', 'Ancient history', 'Women\'s 2-Man Competition', 'Men\'s 1500m', 'Men\'s 500m', 'Pool', 'Women\'s 5000m', 'Lifestyle', 'Children\'s/Family Entertainment', 'Thriller', 'Football', 'Auto Racing', 'Weather', 'Skiing', 'Speedskating', 'New Year\'s', 'Musical comedy', 'Gaelic Football', 'Kayaking', 'Bullfighting', 'Based on the Video Game', 'Water Polo', 'Folk Music', 'Men\'s Mass Start 16 Laps', 'Music', 'Educational', 'Golf', 'Lacrosse', 'Filmreihe', 'Foreign', 'Romance comedy', 'Suspense', 'Dog Show', 'Footvolley', 'Karaoke', 'World history', 'Men\'s 2-Man Competition', 'Soft Rock', 'Women\'s 2 x 7.5km Skiathlon', 'Men\'s 50km Classic Style', 'Senior Citizen', 'Women\'s 12.5km Mass Start', 'teens (ages 13-14)', 'Soccer', 'Documentary', 'Action Sports', 'Boat', 'Snowboarding', 'History drama', 'Yacht Racing', '2018 World Cup', 'Opera', 'Roller derby', 'Men\'s Team Pursuit 8 Laps', 'Men\'s Team - Large Hill', 'Adaptation', 'Dog Racing', 'Men\'s 15km Freestyle', 'Women\'s Moguls', 'Women\'s 3000m Relay', 'Reality Competition', 'Local', 'not for kids', 'Comedy', 'Business & Finance', 'Crime', 'Biography', 'older teens (ages 
15+)', 'Historical drama', 'Health', 'Dark comedy', 'Playoffs', 'Judo', 'Figure Skating', 'Women\'s Alpine Combined', 'Funk Music', 'Crime drama', 'Soap Opera', 'Auto', 'Debate', 'Gospel Music', 'Art', 'Military & War', 'Aviation', 'Labor Day', 'Poker', 'Men\'s Slopestyle', 'Luge', 'Canoeing', '20 de noviembre', 'Darts', 'Mixed Pairs', 'Men\'s Alpine Combined', 'Yom Kippur', '4K Test Tag', 'Science fiction', 'kids (ages 5-9)', 'How-To', 'Courtroom', 'Alternative Music', 'Cheerleading', 'Billiards', 'Preschool', 'Weightlifting', 'Snowmobiling', 'Men\'s 12.5km Pursuit', 'Men\'s Ski Cross', 'Men\'s Moguls', 'Men\'s Slalom', 'Men\'s 2 x 15km Skiathlon', 'Women\'s 3000m', 'Entertainment', 'Science & Technology', 'Classic Sport Event', 'Real Estate', 'Boating', 'Cricket', 'Winter Olympics', 'Concert', 'Men\'s Ice Hockey', 'Beach Volleyball', 'Hurling', 'Easy Listening Music', 'Men\'s Individual - Normal Hill', 'Trampoline Gymnastics', 'Open 4-Man Competition', 'Men\'s Singles', 'Men\'s 10km Sprint', 'Open Double', 'Goth Music', 'Erotic', 'Action & Adventure', 'Talk', 'Animals', 'Gaming', 'Theater', 'Softball', 'Snooker', 'Women\'s Singles', 'Women\'s Individual', 'Women\'s Parallel Giant Slalom', 'Classic Sports', 'Spin-Off', 'Reality', 'Religion', 'Fishing', 'Awards', 'Agriculture', 'Auction', 'Bicycle', 'Fitness', 'Environment', 'Event', 'Techno Music', 'Classical Music', 'Sailing', 'Fundraiser/Telethon', 'Diving', 'Archery', 'Jazz Music', 'Amazon Original', 'Ice Skating', 'Women\'s 4 x 5km Relay', 'Boxing Day'"


In [25]:
# Count the values in the 'tags' domain of the schema file written above.
get_domain_size('../src/schema/schema.pbtxt', 'tags')

409