# **Schema Generation**

Here we will generate a schema and try to place some reasonable constraints on the data. 

We will create the schema based on all of the available data. For now, the schema will only have information about the tags. 

In [1]:
import os
import pprint
import tempfile
import urllib
import numpy as np

import pandas as pd

import absl
import tensorflow as tf
import tensorflow_data_validation as tfdv
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import ExampleValidator
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.extensions.google_cloud_big_query.example_gen.component import (
    BigQueryExampleGen,
)

from tfx.components import ImporterNode
from tfx.types.standard_artifacts import Schema

from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.proto import example_gen_pb2

from tensorflow_metadata.proto.v0 import schema_pb2 
from tensorflow_metadata.proto.v0 import statistics_pb2
from tensorflow_metadata.proto.v0 import anomalies_pb2

from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input



In [2]:
# Interactive TFX context: artifacts are written under a freshly created
# temporary directory whose name starts with 'pipeline'.
_pipeline_name = 'interactive_pipeline'
_pipeline_root = tempfile.mkdtemp(prefix='pipeline')

context = InteractiveContext(
    pipeline_name=_pipeline_name,
    pipeline_root=_pipeline_root,
)



We only query 100 rows, which is enough to get the types of the columns we will use during training. We will generate the vocabulary for the tags separately (outside of TFX), because generating it within TFX would require producing TFRecords for every single example. This is unnecessary: we can first generate a schema from a small amount of data (mostly to get the types for all features), then generate the vocabulary separately and set the domain of the tags manually. 

See below for how this is accomplished. 

In [3]:
# Sample query handed to BigQueryExampleGen: selects only the `synopsis` and
# `tags` columns and caps the result at 100 rows.
query = """
SELECT 
    synopsis,
    tags
FROM 
    `metadata_sky.merlin_movie_series_data_small`
LIMIT 100
"""

In [4]:
%%time
output = example_gen_pb2.Output(
             split_config=example_gen_pb2.SplitConfig(splits=[
                 example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=10)
             ],
    ))
example_gen = BigQueryExampleGen(query=query, output_config=output)
context.run(example_gen, beam_pipeline_args=['--project', 'ml-sandbox-101', '--temp_location', 'gs://metadata-bucket-sky/tmp'])



  temp_location = pcoll.pipeline.options.view_as(


CPU times: user 846 ms, sys: 168 ms, total: 1.01 s
Wall time: 18.3 s


0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } BigQueryExampleGen at 0x7f3c2892a650.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.exec_properties['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.exec_properties,"['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }"
['custom_config'],

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0


**Inspect the data**

In [5]:
def make_dataset(examples, split):
    """
    Create a dataset from examples artifact

    Args:
        examples: channel whose first artifact (``examples.get()[0]``) exposes
            a ``uri`` directory containing one sub-directory per split.
        split: name of the split sub-directory to read, e.g. ``'train'``.

    Returns:
        A ``tf.data.TFRecordDataset`` over every file found in the split
        directory, opened with GZIP compression.
    """
    uri = os.path.join(examples.get()[0].uri, split)

    # os.listdir returns entries in arbitrary order; sort the paths so the
    # record order (and therefore the first example shown) is the same on
    # every run.
    tfrecord_filenames = sorted(os.path.join(uri, name) for name in os.listdir(uri))
    dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

    return dataset

In [6]:
# Build the dataset for the 'train' split from the ExampleGen output channel.
train_dataset = make_dataset(example_gen.outputs['examples'], 'train')

# Count the records by folding over the dataset, adding 1 per element.
print('TRAIN DATASET SIZE: {}'.format(train_dataset.reduce(0, lambda x, _: x + 1)))

# Banner line: ' Example ' centred in 80 characters, padded with '='.
print('{:=^80}'.format(' Example '))

# Iterate over the first record and decode it.
for tfrecord in train_dataset.take(1):
    # Raw serialized bytes of the record.
    serialized_example = tfrecord.numpy()
    # Parse the bytes into a tf.train.Example proto and pretty-print it.
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    pp.pprint(example)

TRAIN DATASET SIZE: 100
features {
  feature {
    key: "synopsis"
    value {
      bytes_list {
        value: "Antonio rolls into Venice to savor the Railto market\'s spices; he prepares seafood salad, Doges\' ice cream and peaches in wine. Features stuffed onions, plus sweet ravioli with orange-blossom honey; Antonio attends the bizarre wedding of two trees, which is really an excuse for a pasta feast, and cooks for an isolated lighthouse keeper. Antonio shares his recipes for beef topside simmered in wine, fritto misto and cabbage and cheese soup. In Ostuni, Antonio visits a communal wood-burning bread oven. Antonio"
      }
    }
  }
  feature {
    key: "tags"
    value {
      bytes_list {
        value: "Food"
        value: "Crafts"
      }
    }
  }
}



**The following query allows us to get the entire tag vocab**. This takes about 1 second to run, as opposed to the several minutes it would take to generate examples for every row of data and then calculate the statistics for that data. 

In [7]:
%%bigquery tag_vocab_df

-- Flatten the repeated `tags` column (one row per tag, aliased `labels`) and
-- keep each distinct value once; the result is saved to `tag_vocab_df`.
SELECT 
  DISTINCT(labels)
FROM `ml-sandbox-101.metadata_sky.merlin_movie_series_data_small`, UNNEST(tags) as labels

Unfortunately, this also causes us to calculate a domain for things like synopsis and content_id, which we don't want. 

In [8]:
# Request 1000 rank-histogram buckets when computing statistics.
stats_options = tfdv.StatsOptions(num_rank_histogram_buckets=1000)
# Compute statistics over the examples produced by ExampleGen.
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'],
    stats_options=stats_options)
context.run(statistics_gen)



0,1
.execution_id,2
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7f3cc47e8910.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f3c4b6a82d0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  
objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""].exec_properties['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f3c4b6a82d0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f3c4b6a82d0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"
.exec_properties,"['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}['exclude_splits'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f3c4b6a82d0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"

0,1
['stats_options_json'],"{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}"
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f3c2892a950.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1) at 0x7f3c288be950.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinevnx1zgbs/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f3c4b6a82d0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2) at 0x7f3c4b75a8d0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinevnx1zgbs/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"


Let's look at the statistics:

In [9]:
# Render the 'statistics' output channel of statistics_gen inline in the notebook.
context.show(statistics_gen.outputs['statistics'])

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Some useful observations from the statistics: 
 + Data is never missing (we can encode this as a constraint)
 + We can potentially set thresholds on the labels + tags in order to detect drift 
     + It is hard to say what a reasonable threshold is, but we can start with some value and see what happens
 + We can set the label and tag vocabulary if we'd like (the "domain") 
     + We can also set constraints on the domain --> e.g. labels should always come from the observed domain. 
 + We need to increase the number of rank_histogram_buckets. 

In [10]:
# URI of the first (index 0) artifact on the 'statistics' output channel.
stats_path = statistics_gen.outputs['statistics'].get()[0].uri
# Location of the 'train' split statistics file underneath the artifact URI.
train_statistics_path = os.path.join(stats_path, 'train/stats_tfrecord')

# Load the stored statistics; the schema is inferred from them in the next cell.
train_statistics = tfdv.load_statistics(train_statistics_path)

In [11]:
# Infer an initial schema (feature names, types, shapes) from the training statistics.
# NOTE: max_string_domain_size is 50 here, not 5000, so this step does NOT capture
# the full tag domain: the schema displayed below shows no domain for 'tags'
# (presumably because it has more than 50 distinct values -- confirm against the
# TFDV docs for max_string_domain_size). The full tag domain is set manually
# further down with tfdv.set_domain.
schema = tfdv.infer_schema(train_statistics, infer_feature_shape=True, max_string_domain_size=50)

In [12]:
# Show the inferred schema as a table: type, presence, valency and domain per feature.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',BYTES,required,"[1,inf)",-


**Below is where we set the domain manually**

In [13]:
# Keep a handle on the 'tags' feature of the schema; the constraints in the next
# cell are set through this object.
tags = tfdv.get_feature(schema, 'tags')
# Set the 'tags' domain from the tag vocabulary. np.hstack concatenates the rows of
# tag_vocab_df.values into one flat sequence of values.
# NOTE(review): tag_vocab_df is defined outside this section -- presumably it holds
# the separately generated tag vocabulary; confirm.
tfdv.set_domain(schema, 'tags', schema_pb2.StringDomain(value=np.hstack(tag_vocab_df.values)))

1
4


In [14]:
# Uses the `tags` feature handle obtained in the previous cell.
# Minimum fraction of tag values expected to come from the domain (0.95 = 95%);
# see the schema proto docs for the exact semantics of min_domain_mass.
tags.distribution_constraints.min_domain_mass = 0.95
# Drift comparator threshold on the infinity norm. 0.05 is a starting value only:
# the notes above say it is hard to know what a reasonable threshold is.
tags.drift_comparator.infinity_norm.threshold = 0.05

In [15]:
# Write the schema, with the manually set 'tags' domain and constraints, to a text file.
tfdv.write_schema_text(schema, '../src/schema/schema.pbtxt')

---

In [16]:
# Read the schema back from the file written above so its contents can be checked.
loaded_schema = tfdv.load_schema_text('../src/schema/schema.pbtxt')

In [17]:
def get_domain_size(schema_path, feature):
    """Count the values in a feature's domain, as stored in a schema text file.

    Args:
        schema_path: Path to a schema file readable by ``tfdv.load_schema_text``.
        feature: Name of the feature whose domain is looked up.

    Returns:
        The length of the ``value`` field of the feature's domain.
    """
    parsed_schema = tfdv.load_schema_text(schema_path)
    return len(tfdv.get_domain(parsed_schema, feature).value)

We can confirm that the saved schema contains the entire tag domain, as well as both features and their types!

In [18]:
# Show the schema that was read back from disk, including the values of the 'tags' domain.
tfdv.display_schema(loaded_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',STRING,required,"[1,inf)",'tags_domain'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'tags_domain',"'Documentary', 'News', 'Local', 'Outdoors', 'Science fiction', 'Action & Adventure', 'older teens (ages 15+)', 'Drama', 'Newscast', 'Public Affairs', 'Talk', 'Anthology', 'Football', 'Sports', 'SportingEvent', 'Classic Sport Event', 'Comedy', 'teens (ages 13-14)', 'Sitcom', 'Sports non-event', 'Fishing', 'How-To', 'Romance', 'Interview', 'Entertainment', 'Crime drama', 'Comedy drama', 'Sports talk', 'Health', 'Travel', 'Music', 'Computers', 'Reality', 'tweens (ages 10-12)', 'Fantasy', 'Animated', 'Children\'s/Family Entertainment', 'Politics & Government', 'Golf', 'Art', 'Food', 'Crafts', 'Newsmagazine', 'Basketball', 'Educational', 'Horror', 'Hockey', 'Crime', 'Independent', 'Gaelic Football', 'Hurling', 'Home & Garden', 'Home improvement', 'Thriller', 'Variety', 'Science & Technology', 'Teens', 'Soap Opera', 'Poker', 'Card Games', 'Motor Sports', 'Anime', 'History', 'Fashion', 'Hunting', 'Pop Music', 'Biography', 'Medical', 'Business & Finance', 'Musical', 'Martial arts', 'kids (ages 5-9)', 'Religion', 'Mystery', 'Diving', 'Nature', 'Legal', 'Animals', 'Action Sports', 'Volleyball', 'Water Sports', 'Pets', 'Agriculture', 'Western', 'Environment', 'Auto Racing', 'Hip-Hop & Rap Music', 'Reggae Music', 'R&B Music', 'Fitness', 'Auto', 'Baseball', 'Dance', 'Pro Wrestling', 'preschoolers (ages 2-4)', 'Badminton', 'Beach Volleyball', 'Summer Olympics', 'Olympics', 'Game Show', 'Mixed Martial Arts', 'Historical drama', 'Motorcycle Racing', 'Special', 'Weddings', 'Concert', 'Holiday', 'Christmas', 'Consumer', 'Miniseries', 'not for kids', 'Self improvement', 'Paranormal', 'Dark comedy', 'Tennis', 'Alternative Music', 'Event', 'Performing arts', 'Foreign', 'Horse Racing', 'Real Estate', 'Debate', 'Soccer', 'Documentary drama', 'Boxing', 'Military & War', 'Snowboarding', 'Netflix Original', 'Motorcycle', 'Halloween', 'Parenting', 'Shopping', 'Rugby league', 'Docudrama', 'Gaming', 'Aerobics', 'Weather', 'Romantic comedy', 'Ice Skating', 'Cricket', 'Suspense', 
'American history', 'Gay and Lesbian', 'Water Polo', 'Auction', 'Collectibles', 'Musical comedy', 'Country Music', 'Horse', 'Handball', 'Rodeo', 'Boating', 'Winter Olympics', 'Standup', 'Theater', 'Playoffs', 'Surfing', 'PyeongChang 2018', 'Courtroom', 'Equestrian', 'Cycling', 'Snowmobiling', 'Gymnastics', 'Curling', 'Billiards', 'History drama', 'Weightlifting', 'Swimming', 'Australian Football', 'Softball', 'Drag Racing', 'Based on the Video Game', 'Aviation', 'Latin Music', 'Awards', 'Rio 2016', 'Thanksgiving', 'Gospel Music', 'Wrestling', 'Boat', 'Bollywood', 'Romance comedy', 'Bowling', 'Rock Music', 'Short track speed skating', 'Track & Field', 'Fundraiser/Telethon', 'Preschool', 'Footvolley', 'Skiing', 'Advice', 'Shooting', 'St. Patrick\'s Day', 'Opera', 'Lacrosse', 'Canoeing', 'Techno Music', 'Filmreihe', 'Skateboarding', 'Archery', 'Bull riding', 'Heavy Metal Music', 'Figure Skating', 'Darts', 'Rowing', 'Rhythmic Gymnastics', 'Wall Street', 'Cheerleading', 'Amazon Original', 'Rugby', 'eSports', 'Multi-sport event', 'New Year\'s', 'Ancient history', 'World history', 'Rugby union', 'Valentine\'s Day', 'Adaptation', 'Family Entertainment', 'Sequel', 'Roller derby', 'Cross-Country Skiing', 'Parade', 'Skeleton', 'Bicycle', 'Kayaking', 'Sailing', 'Yacht Racing', 'Running', '2018 World Cup', 'Ski Jumping', 'World Music', 'Arm Wrestling', 'Classical Music', 'Bullfighting', 'Pro', 'Sumo Wrestling', 'Easter', 'Bluegrass Music', 'Quebec Production', 'Fencing', 'Mountain Biking', 'Jazz Music', 'Blues Music', 'Field Hockey', 'Easy Listening Music', 'Synchronized Swimming', 'Folk Music', 'Soul Music', 'Dog Racing', 'Alpine Skiing', 'Water Skiing', 'Dog Show', 'Judo', 'Dog sled', 'Speedskating', 'Bobsledding', '20 de noviembre', 'Table Tennis', 'Ballet', 'Polo', 'Biathlon', 'Pool', 'Blackjack', 'Bodybuilding', 'Luge', 'Men\'s Curling', 'Men\'s Team Large Hill / 4 x 5km', 'Women\'s Big Air', 'Triathlon', 'Racquetball', 'Men\'s Ice Hockey', 'Women\'s 2-Man Competition', 
'Funk Music', 'Men\'s 1000m', 'Women\'s 4 x 5km Relay', 'Women\'s 1500m', 'Women\'s 12.5km Mass Start', 'Hanukkah', 'Karaoke', 'Cartoon', 'July 4th', 'Modern Pentathlon', 'Women\'s Moguls', 'Women\'s Curling', 'Women\'s Ice Hockey', 'Mixed Team', 'Art de Vivre', 'Soft Rock', 'Trampoline Gymnastics', 'Highlights', 'Komödie', 'Drift racing', 'Mixed Pairs', 'Bicycle racing', 'Mixed Curling', 'Boxing Day', 'Men\'s Halfpipe', 'Classic Movies', 'Women\'s Super G', 'Snooker', 'Women\'s Halfpipe', 'Taekwondo', 'Men\'s Gundersen Normal Hill / 10km', 'Ringuette', 'Women\'s Aerials', 'Marathon', 'Men\'s 12.5km Pursuit', 'Women\'s 3000m Relay', 'Women\'s 500m', 'Men\'s 1500m', 'Women\'s 2 x 7.5km Skiathlon', 'Arts & Literature', 'Men\'s 15km Freestyle', 'Women\'s Giant Slalom', 'Men\'s Downhill', 'Mixed Ice Dance', 'Men\'s 2-Man Competition', 'Canada Day', 'Women\'s Downhill', 'Women\'s 4 x 6km Relay', 'Women\'s Ski Cross', 'Women\'s Slopestyle', 'Freestyle Skiing', 'Women\'s Singles', 'Women\'s 10km Freestyle', 'Women\'s 15km Individual', 'Men\'s 10km Sprint', 'Men\'s 5000m', 'Men\'s 2 x 15km Skiathlon', 'Men\'s Aerials', 'Men\'s 500m', 'Men\'s Team - Large Hill', 'Open 4-Man Competition', 'Women\'s 1000m', 'Women\'s Slalom', 'Men\'s Singles', 'Labor Day', 'Men\'s 4 x 7.5km Relay', 'Women\'s Single', 'Men\'s Individual - Large Hill', 'Women\'s Individual', 'Mixed 2 x 6 + 2 x 7.5km Relay', 'Men\'s Gundersen Large Hill / 10km', 'Beach soccer', 'Women\'s Sprint Classic Style', 'Men\'s Sprint Classic Style', 'Dating', 'Men\'s Team Pursuit 8 Laps', 'Men\'s Team Sprint Freestyle', 'Women\'s Team Pursuit 6 Laps', 'Women\'s Team Sprint Freestyle', 'Mixed Team - Relay', 'Men\'s 20km Individual', 'Men\'s Giant Slalom', 'Men\'s Slopestyle', 'Men\'s Single', 'Men\'s Alpine Combined', 'Men\'s Snowboard Cross', 'Goth Music', 'Men\'s 5000m Relay', 'Serial', 'Classic Sports', 'Men\'s 50km Classic Style', 'Men\'s Moguls', 'Cinco de Mayo', 'Women\'s 7.5km Sprint', 'Men\'s Individual - Normal 
Hill', 'Animated Comedy', 'Men\'s 4 x 10km Relay', 'Women\'s 30km Classic Style', 'Women\'s 3000m', 'Nordic Combined', 'Men\'s Individual', 'Women\'s Snowboard Cross', 'Men\'s Super G', 'Short Subject', 'Men\'s 15km Mass Start', 'Floorball', 'Men\'s Slalom', 'Women\'s Alpine Combined', 'Women\'s Mass Start 16 Laps', 'Men\'s Mass Start 16 Laps', 'Intl soccer', 'Open Double', 'Women\'s Parallel Giant Slalom', 'Men\'s Parallel Giant Slalom', 'Men\'s Big Air', 'Magazine', 'Women\'s 5000m', 'Hydroplane racing', 'Women\'s 10km Pursuit', 'Men\'s 10000m', 'Men\'s Ski Cross', 'Lifestyle', 'Senior Citizen', 'Coming of Age', 'Bowls', 'Yom Kippur', '4K Test Tag', 'Women\'s Individual - Normal Hill', 'Ska Music', 'Investigative', 'Reality Competition', 'Spin-Off', 'Behind the Scenes', 'Trains', 'Retrospective'"


In [19]:
# Number of values in the 'tags' domain of the schema file on disk.
get_domain_size('../src/schema/schema.pbtxt', 'tags')

408