# **Schema Generation**

Here we will generate a schema and try to place some reasonable constraints on the data. 

We will create the schema based on all of the available data. For now, the schema will only have information about the tags. 

In [3]:
import os
import pprint
import tempfile
import urllib
import numpy as np

import pandas as pd

import absl
import tensorflow as tf
import tensorflow_data_validation as tfdv
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import ExampleValidator
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.extensions.google_cloud_big_query.example_gen.component import (
    BigQueryExampleGen,
)

from tfx.components import ImporterNode
from tfx.types.standard_artifacts import Schema

from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.proto import example_gen_pb2

from tensorflow_metadata.proto.v0 import schema_pb2 
from tensorflow_metadata.proto.v0 import statistics_pb2
from tensorflow_metadata.proto.v0 import anomalies_pb2

from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input



In [4]:
# Scratch workspace for this interactive session: component artifacts are
# written under a freshly created temporary directory.
_pipeline_name = 'interactive_pipeline'
_pipeline_root = tempfile.mkdtemp(prefix='pipeline')

context = InteractiveContext(
    pipeline_root=_pipeline_root,
    pipeline_name=_pipeline_name,
)



We are only querying 100 rows in order to get the types for the different columns we will use during training. We will generate the vocabulary for the tags separately (outside of tfx), because generating it within tfx would require generating tfrecords for every single example. That is unnecessary: we can first generate a schema from a small amount of data (mostly to get the types for all features), then generate the vocab separately and set the domain of the tags manually. 

See below for how this is accomplished. 

In [5]:
# BigQuery SQL handed to the example generator below. It selects the long
# synopsis (aliased to `synopsis`) and the `tags` column, capped at 100 rows.
# NOTE(review): the `#`-prefixed table name inside the string looks like a
# commented-out alternative source table -- confirm it is still wanted.
query = """
SELECT 
    program_longsynopsis as synopsis,
    tags
FROM 
    #`metadata_sky.merlin_movie_series_data_small`
    `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data_with_lang_and_type`
LIMIT 100
"""

In [6]:
%%time
output = example_gen_pb2.Output(
             split_config=example_gen_pb2.SplitConfig(splits=[
                 example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=10)
             ],
    ))
example_gen = BigQueryExampleGen(query=query, output_config=output)
context.run(example_gen, beam_pipeline_args=['--project', 'res-nbcupea-dev-ds-sandbox-001', '--temp_location', 'gs://metadata-bucket-sky-test/tmp'])



  temp_location = pcoll.pipeline.options.view_as(


CPU times: user 1.3 s, sys: 218 ms, total: 1.52 s
Wall time: 19.7 s


0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } BigQueryExampleGen at 0x7f5e049cc4d0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.exec_properties['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data_with_lang_and_type`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } 
}['custom_config']None['_beam_pipeline_args'][0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpjlw1nuoy/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.exec_properties,"['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data_with_lang_and_type`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None['_beam_pipeline_args'][0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpjlw1nuoy/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n program_longsynopsis as synopsis,\n tags\nFROM \n #`metadata_sky.merlin_movie_series_data_small`\n `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data_with_lang_and_type`\nLIMIT 100\n""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }"
['custom_config'],
['_beam_pipeline_args'],[0]--project[1]res-nbcupea-dev-ds-sandbox-001[2]--temp_location[3]gs://metadata-bucket-sky-test/tmp[4]--extra_package=/tmp/tmpjlw1nuoy/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz[5]--labels[6]tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor[7]--labels[8]tfx_py_version=3-7[9]--labels[10]tfx_runner=interactivecontext[11]--labels[12]tfx_version=0-23-0

0,1
[0],--project
[1],res-nbcupea-dev-ds-sandbox-001
[2],--temp_location
[3],gs://metadata-bucket-sky-test/tmp
[4],--extra_package=/tmp/tmpjlw1nuoy/build/tfx/dist/tfx_ephemeral-0.23.0.tar.gz
[5],--labels
[6],tfx_executor=extensions-google_cloud_big_query-example_gen-executor-executor
[7],--labels
[8],tfx_py_version=3-7
[9],--labels

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0


**Inspect the data**

In [7]:
def make_dataset(examples, split):
    """
    Create a TFRecord dataset from one split of an examples artifact.

    Args:
        examples: channel-like object; the ``uri`` of the first artifact
            returned by ``examples.get()`` is used as the base directory.
        split: name of the split sub-directory to read (e.g. ``'train'``).

    Returns:
        A ``tf.data.TFRecordDataset`` over every file found in the split
        directory, read with GZIP compression.
    """
    split_dir = os.path.join(examples.get()[0].uri, split)

    # os.listdir() yields entries in arbitrary order; sort them so the record
    # order (and therefore the "first" example shown below) is reproducible.
    tfrecord_filenames = [
        os.path.join(split_dir, name) for name in sorted(os.listdir(split_dir))
    ]

    return tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

In [8]:
# Load the 'train' split written by the example generator.
train_dataset = make_dataset(example_gen.outputs['examples'], 'train')

# Count records by folding over the dataset, adding one per record.
print(f"TRAIN DATASET SIZE: {train_dataset.reduce(0, lambda count, _: count + 1)}")

print(f"{' Example ':=^80}")

# Decode and pretty-print only the first serialized record.
for tfrecord in train_dataset.take(1):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example.FromString(serialized_example)
    pp.pprint(example)

TRAIN DATASET SIZE: 100
features {
  feature {
    key: "synopsis"
    value {
      bytes_list {
        value: "Mr. Krabs convinces SpongeBob and Patrick to join him on a treasure hunt; SpongeBob gets on the wrong bus. Plancton decide contratar a alguien que puede crear el lema m\303\241s tonto para el \"Balde de Carnada\" y elige a Patricio; es aniversario de boda de Karen y Plancton y ella quiere darle la f\303\263rmula del \"Cangrejo Cascarudo\" como regalo de bodas. Bob Esponja tiene un ensayo que escribir para la escuela de navegaci\303\263n, pero se distrae f\303\241cilmente y no puede mantener su mente concentrada en su trabajo, y eventualmente est\303\241 imaginando que sus pantalones hablan con \303\251l. SpongeBob helps out the Krusty Krab\'s night shift crew, but the customers crave something creepier than usual. Bob Esponja y Patricio protestan contra la construcci\303\263n de una carretera que destruir\303\255a campos de medusas. Al tratar de pedir un emparedado especial

**The following query allows us to get the entire tag vocab**. This takes about 1 second to run, as opposed to the several minutes it would take to generate examples for every row of data and then calculate the statistics for that data. 

In [10]:
%%bigquery tag_vocab_df

-- Flatten the repeated `tags` column and keep each distinct value once.
SELECT DISTINCT
  labels
FROM
  `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.merlin_data_with_lang_and_type`,
  UNNEST(tags) AS labels

Unfortunately, this also causes us to calculate a domain for things like synopsis and content_id, which we don't want. 

In [11]:
# Compute statistics over the generated examples, using 1000 rank-histogram
# buckets instead of the library default.
stats_options = tfdv.StatsOptions(num_rank_histogram_buckets=1000)
statistics_gen = StatisticsGen(
    stats_options=stats_options,
    examples=example_gen.outputs['examples'],
)
context.run(statistics_gen)

0,1
.execution_id,2
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7f5e285b2710.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f5e28534c10.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  
objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""].exec_properties['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f5e28534c10.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f5e28534c10.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"
.exec_properties,"['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}['exclude_splits'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f5e28534c10.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinew4og66an/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"

0,1
['stats_options_json'],"{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_count"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null}"
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f5e049cc810.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinew4og66an/BigQueryExampleGen/examples/1) at 0x7f5e049cc7d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinew4og66an/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f5e28534c10.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinew4og66an/StatisticsGen/statistics/2) at 0x7f5e28551bd0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinew4og66an/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinew4og66an/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"


Let's look at the statistics:

In [12]:
# Render the 'statistics' output of the statistics_gen component inline in the notebook.
context.show(statistics_gen.outputs['statistics'])

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Some useful takeaways from these statistics: 
 + The data is never missing, so we can enforce presence as a constraint.
 + We can potentially set thresholds on the labels and tags in order to detect drift.
     + It is hard to say what is reasonable here, but we can start with some value and see what happens.
 + We can set the label and tag vocabulary (the "domain") if we'd like.
     + We can also set constraints on the domain, e.g. labels should always come from the observed domain.
 + We need to increase the number of `rank_histogram_buckets`.

In [13]:
# Locate the statistics artifact produced by statistics_gen and load the
# statistics of the 'train' split so they can be passed to TFDV directly.
stats_path = statistics_gen.outputs['statistics'].get()[0].uri
train_statistics_path = os.path.join(stats_path, 'train/stats_tfrecord')

train_statistics = tfdv.load_statistics(train_statistics_path)

In [14]:
# Infer a schema (types, presence, shapes) from the training statistics.
# NOTE: max_string_domain_size is 50 here, not 5000 as a previous version of
# this comment claimed. With this setting the displayed schema shows no
# inferred domain for 'tags' (presumably because the tag vocabulary has more
# than 50 distinct values); the full tag domain is set manually further down.
schema = tfdv.infer_schema(train_statistics, infer_feature_shape=True, max_string_domain_size=50)

In [15]:
# Show the inferred schema; at this point 'tags' has no domain attached yet.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',BYTES,required,"[1,inf)",-


**Below is where we set the domain manually**

In [16]:
# Keep a handle on the 'tags' feature; the next cell uses it to set constraints.
tags = tfdv.get_feature(schema, 'tags')
# Attach the separately generated tag vocabulary as the string domain of 'tags'.
# np.hstack concatenates the rows of tag_vocab_df.values into one flat array
# (assumes tag_vocab_df, defined outside this section, holds only tag strings — TODO confirm).
tfdv.set_domain(schema, 'tags', schema_pb2.StringDomain(value=np.hstack(tag_vocab_df.values)))

In [17]:
# Starting values for the validation thresholds on 'tags'; they are initial
# guesses to be tuned once we see how they behave.
# NOTE(review): `tags` was obtained from `schema` in the previous cell, so these
# assignments are expected to modify `schema` in place — confirm in the written file.
# min_domain_mass: minimum fraction of tag values that must come from the domain.
tags.distribution_constraints.min_domain_mass = 0.95
# drift_comparator: L-infinity distance threshold used for drift detection on 'tags'.
tags.drift_comparator.infinity_norm.threshold = 0.05

In [18]:
# Persist the schema (with the manual tag domain and constraints) as a text-format file.
tfdv.write_schema_text(schema, '../src/schema/schema_with_lang_and_type.pbtxt')

---

In [19]:
# Read the schema back from disk to check that what was written round-trips.
loaded_schema = tfdv.load_schema_text('../src/schema/schema_with_lang_and_type.pbtxt')

In [20]:
def get_domain_size(schema_path, feature):
    """Return how many values the domain of `feature` holds in a saved schema.

    Args:
        schema_path: Path to a text-format schema file, as read by
            `tfdv.load_schema_text`.
        feature: Name (or path) of the feature, as accepted by
            `tfdv.get_domain`.

    Returns:
        The number of entries in the `value` field of the feature's domain.
    """
    # Despite coming from a text file, this is the parsed schema object.
    parsed_schema = tfdv.load_schema_text(schema_path)
    return len(tfdv.get_domain(parsed_schema, feature).value)

This confirms that the saved schema contains the entire tag domain, as well as both features and their types!

In [21]:
# Display the reloaded schema; 'tags' should now reference its domain and list its values.
tfdv.display_schema(loaded_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',STRING,required,"[1,inf)",'tags_domain'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'tags_domain',"'Bobsledding', 'Dog sled', 'Funk Music', 'Men\'s Individual - Large Hill', 'Women\'s Singles', 'Men\'s Snowboard Cross', 'Episode', 'Horse', 'Hunting', 'Pro Wrestling', 'Men\'s Team Sprint Freestyle', 'Ski Jumping', 'Nordic combined', 'Auto Racing', 'Game Show', 'Reggae Music', 'Gaming', 'Bullfighting', 'Movie', 'Curling', 'Women\'s 500m', 'Mixed Ice Dance', 'Arts & Literature', 'Ancient history', 'Soul Music', 'Animated Comedy', 'Open Double', 'Local', 'Fantasy', 'Docudrama', 'Anthology', 'Opera', 'Judo', 'Short track speed skating', 'Women\'s 3000m Relay', 'Women\'s Parallel Giant Slalom', 'Entertainment', 'Hip-Hop & Rap Music', 'Western', 'Filmreihe', 'eSports', 'Lacrosse', 'Field Hockey', 'Yom Kippur', 'Komödie', 'Men\'s 12.5km Pursuit', 'Men\'s 4 x 7.5km Relay', 'Variety', 'Basketball', 'Biathlon', 'Men\'s Individual', 'Men\'s Alpine Combined', 'Sports', 'Motorcycle', 'Water Polo', 'Women\'s Slopestyle', 'Men\'s 10000m', 'Men\'s Super G', 'Agriculture', 'Musical comedy', 'Boat', 'Folk Music', 'Rio 2016', 'Cross-Country Skiing', 'Retrospective', 'Crime drama', 'Educational', 'Teens', 'Horror', 'Collectibles', 'Billiards', 'Dog Show', 'Karaoke', 'Table Tennis', 'Men\'s 4 x 10km Relay', 'Men\'s Moguls', 'Horse Racing', 'Shooting', 'Men\'s 2-Man Competition', 'Drift racing', 'Newsmagazine', 'Documentary', 'older teens (ages 15+)', 'Auction', 'Triathlon', 'Based on the Video Game', 'Amazon Original', 'Australian Football', 'Women\'s 10km Freestyle', 'Men\'s Mass Start 16 Laps', 'Crime', 'Animated', 'Public Affairs', 'teens (ages 13-14)', 'Motorcycle Racing', 'Diving', 'History drama', 'Beach soccer', 'Women\'s Moguls', 'Nordic Combined', 'Business & Finance', 'Thriller', 'Fitness', 'World Music', 'Blues Music', 'Dog Racing', 'Highlights', 'Science & Technology', 'Debate', 'Ringuette', 'Labor Day', 'Hockey', 'Home & Garden', 'Standup', 'Performing arts', 'Easter', 'Ballet', 'Classic Movies', 'Women\'s 5000m', 'Water Skiing', 'Ice Skating', 'Women\'s 
Aerials', 'Men\'s Curling', 'Bowls', 'Women\'s Ski Cross', 'Music', 'Romance', 'Family Entertainment', 'Men\'s Single', 'Blackjack', 'Women\'s Downhill', 'not for kids', 'Fishing', '2018 World Cup', 'Valentine\'s Day', 'Halloween', 'Bull riding', 'Canada Day', 'Women\'s Snowboard Cross', 'spa', 'Legal', 'R&B Music', 'Consumer', 'Men\'s Gundersen Large Hill / 10km', 'Mixed Team - Relay', 'Women\'s Mass Start 16 Laps', 'Medical', 'Action & Adventure', 'Environment', 'American history', 'Skateboarding', 'Roller derby', 'Men\'s Downhill', 'eng', 'History', 'Pop Music', 'Soap Opera', 'Crafts', 'Boating', 'Snowmobiling', 'Women\'s Ice Hockey', 'Classic Sport Event', 'Darts', 'Men\'s Team Large Hill / 4 x 5km', 'Motor Sports', 'Techno Music', 'Track & Field', 'Playoffs', 'Rowing', 'Mixed Team', 'Women\'s Sprint Classic Style', 'Classic Sports', 'Men\'s 15km Mass Start', 'Self improvement', 'Fashion', 'Romantic comedy', 'Newscast', 'Paranormal', 'Weddings', 'Card Games', 'Independent', 'Women\'s 2 x 7.5km Skiathlon', 'Men\'s Slalom', '4K Test Tag', 'Comedy', 'Comedy drama', 'Volleyball', 'Cycling', 'Sailing', 'Rhythmic Gymnastics', 'Women\'s 15km Individual', 'Teleroman', 'Soccer', 'Historical drama', 'Dark comedy', 'Christmas', 'Awards', 'Art de Vivre', 'Men\'s 5000m', 'Women\'s 10km Pursuit', 'Profile', 'Men\'s 5000m Relay', 'Gaelic Football', 'Women\'s Individual - Normal Hill', 'Miniseries', 'Science fiction', 'Martial arts', 'Preschool', 'Wrestling', 'Modern Pentathlon', 'Mixed Pairs', 'Freestyle Skiing', 'Women\'s Super G', 'Interview', 'Yacht Racing', 'Aerobics', 'Latin Music', 'Snooker', 'Bicycle racing', 'Multi-sport event', 'Boxing Day', 'Women\'s 7.5km Sprint', 'Floorball', 'Sports non-event', 'Sitcom', 'Country Music', 'Real Estate', 'Shopping', 'Anime', 'Bodybuilding', 'Romance comedy', 'Hurling', 'Canoeing', 'Cinco de Mayo', 'Intl soccer', 'Trampoline Gymnastics', 'New Year\'s', 'Magazine', 'Men\'s Team - Large Hill', 'Spin-Off', 'Travel', 'Golf', 'Handball', 
'Men\'s Slopestyle', '20 de noviembre', 'Women\'s Giant Slalom', 'Short Subject', 'Behind the Scenes', 'kids (ages 5-9)', 'Heavy Metal Music', 'Dance', 'Poker', 'Cheerleading', 'Racquetball', 'Men\'s Singles', 'Men\'s 1500m', 'Current Affairs', 'Boxing', 'SportingEvent', 'Theater', 'Women\'s 30km Classic Style', 'Men\'s Sprint Classic Style', 'Politics & Government', 'Surfing', 'Rugby union', 'Quebec Production', 'Serial', 'Women\'s Halfpipe', 'Reality Competition', 'Indoor soccer', 'Mixed Martial Arts', 'Aviation', 'Weightlifting', 'PyeongChang 2018', 'Pool', 'Beach Volleyball', 'Badminton', 'Easy Listening Music', 'Rugby league', 'Men\'s 10km Sprint', 'Women\'s 12.5km Mass Start', 'Women\'s 1500m', 'Food', 'Talk', 'Fundraiser/Telethon', 'Arm Wrestling', 'Thanksgiving', 'Jazz Music', 'Men\'s Team Pursuit 8 Laps', 'Dokusoap', 'Auto', 'Art', 'Home improvement', 'Bluegrass Music', 'Wall Street', 'Netflix Original', 'Taekwondo', 'Goth Music', 'Mixed 2 x 6 + 2 x 7.5km Relay', 'Mystery', 'Swimming', 'Foreign', 'World history', 'Squash', 'Footvolley', 'St. Patrick\'s Day', 'Women\'s Single', 'Women\'s Team Pursuit 6 Laps', 'Soft Rock', 'Women\'s 3000m', 'Military & War', 'Summer Olympics', 'Courtroom', 'Parade', 'Alternative Music', 'Luge', 'Men\'s Gundersen Normal Hill / 10km', 'Lifestyle', 'Cooking', 'Computers', 'Rock Music', 'Sumo Wrestling', 'Synchronized Swimming', 'Sequel', 'Men\'s 2 x 15km Skiathlon', 'Mex. 
Independence', 'Investigative', 'Men\'s 20km Individual', 'Health', 'Rugby', 'Women\'s Big Air', 'Biography', 'Softball', 'Adaptation', 'Women\'s Slalom', 'Religion', 'Animals', 'Archery', 'Musical', 'Classical Music', 'Men\'s Halfpipe', 'Men\'s 15km Freestyle', 'Women\'s 4 x 5km Relay', 'Women\'s Individual', 'Coming of Age', 'Men\'s Parallel Giant Slalom', 'Parenting', 'Olympics', 'Winter Olympics', 'Marathon', 'Advice', 'Bollywood', 'Dating', 'Men\'s 1000m', 'Women\'s Team Sprint Freestyle', 'Men\'s Giant Slalom', 'Men\'s 500m', 'News', 'Outdoors', 'tweens (ages 10-12)', 'Cricket', 'Gospel Music', 'Mixed Curling', 'Cartoon', 'Men\'s Big Air', 'Erotic', 'Special', 'Action Sports', 'Speedskating', 'Figure Skating', 'Holiday', 'Drag Racing', 'Police', 'Women\'s Alpine Combined', 'Reality', 'preschoolers (ages 2-4)', 'Pets', 'Football', 'Baseball', 'Polo', 'Snowboarding', 'Bicycle', 'Kayaking', 'Men\'s Aerials', 'Music Video', 'Women\'s 4 x 6km Relay', 'Drama', 'Weather', 'Gay and Lesbian', 'Skiing', 'Fencing', 'Women\'s Curling', 'Pro', 'Women\'s 1000m', 'Hydroplane racing', 'Nature', 'Tennis', 'Alpine Skiing', 'Men\'s Ice Hockey', 'Hanukkah', 'Men\'s Individual - Normal Hill', 'Senior Citizen', 'Water Sports', 'Concert', 'Running', 'Women\'s 2-Man Competition', 'Men\'s 50km Classic Style', 'Trains', 'Open 4-Man Competition', 'Men\'s Ski Cross', 'Skeleton', 'Children\'s/Family Entertainment', 'Equestrian', 'Event', 'Mountain Biking', 'Gymnastics', 'Pelota vasca', 'Sports talk', 'How-To', 'Suspense', 'Rodeo', 'Documentary drama', 'Bowling', 'July 4th', 'Ska Music'"


In [23]:
# Count the values stored in the 'tags' domain of the schema file written above.
get_domain_size('../src/schema/schema_with_lang_and_type.pbtxt', 'tags')

425