# **Schema Generation**

Here we will generate a schema and try to place some reasonable constraints on the data. 

The tag vocabulary in the schema is built from all of the available data, while the feature types are inferred from a small sample (see below). For now, the schema will only have information about the tags. 

In [22]:
import os
import pprint
import tempfile
import urllib
import numpy as np

import pandas as pd

import absl
import tensorflow as tf
import tensorflow_data_validation as tfdv
# Stop records emitted on TensorFlow's logger from being passed on to ancestor
# (root) logger handlers.
tf.get_logger().propagate = False
# Shared pretty-printer used further down to display decoded examples.
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import ExampleValidator
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.extensions.google_cloud_big_query.example_gen.component import (
    BigQueryExampleGen,
)

from tfx.components import ImporterNode
from tfx.types.standard_artifacts import Schema

from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.proto import example_gen_pb2

from tensorflow_metadata.proto.v0 import schema_pb2 
from tensorflow_metadata.proto.v0 import statistics_pb2
from tensorflow_metadata.proto.v0 import anomalies_pb2

from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input

In [2]:
# Name passed to the interactive context for this notebook session.
_pipeline_name = 'interactive_pipeline'
# Fresh temporary directory (prefixed 'pipeline') used as the pipeline root.
_pipeline_root = tempfile.mkdtemp(prefix='pipeline')

# Interactive TFX context used by every `context.run(...)` / `context.show(...)` below.
context = InteractiveContext(
    pipeline_name=_pipeline_name,
    pipeline_root=_pipeline_root,
)



We are only querying 100 rows in order to get the types for the different columns we will use during training. We will generate the vocabulary for the tags separately (outside of TFX), because generating it within TFX would require producing TFRecords for every single example. This is unnecessary: we can generate a schema from a small amount of data first (mostly to get the types for all features) and then separately generate the vocab and set the domain of the tags manually. 

See below for how this is accomplished. 

In [3]:
# Small sample (LIMIT 100) of the two columns used below. The table name carries
# no project prefix here; the project is passed separately via the Beam
# `--project` argument when the example-gen component is run.
query = """
SELECT 
    synopsis,
    tags
FROM 
    `metadata_sky.merlin_movie_series_data_small`
LIMIT 100
"""

In [4]:
%%time
output = example_gen_pb2.Output(
             split_config=example_gen_pb2.SplitConfig(splits=[
                 example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=10)
             ],
    ))
example_gen = BigQueryExampleGen(query=query, output_config=output)
context.run(example_gen, beam_pipeline_args=['--project', 'ml-sandbox-101', '--temp_location', 'gs://metadata-bucket-sky/tmp'])



  temp_location = pcoll.pipeline.options.view_as(


CPU times: user 836 ms, sys: 146 ms, total: 982 ms
Wall time: 17.8 s


0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } BigQueryExampleGen at 0x7f41782a5250.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.exec_properties['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.exec_properties,"['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }['custom_config']None"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""\nSELECT \n synopsis,\n tags\nFROM \n `metadata_sky.merlin_movie_series_data_small`\nLIMIT 100\n""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 10,  ""name"": ""train""  }  ]  } }"
['custom_config'],

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0


**Inspect the data**

In [5]:
def make_dataset(examples, split):
    """
    Create a dataset of serialized records from an examples artifact.

    Args:
        examples: Object whose ``get()[0].uri`` is the root directory of the
            examples artifact (e.g. ``example_gen.outputs['examples']``).
        split: Name of the split sub-directory under that root, e.g. ``'train'``.

    Returns:
        A ``tf.data.TFRecordDataset`` over every file found in the split
        directory, read with GZIP compression. Files are read in sorted
        filename order.
    """
    uri = os.path.join(examples.get()[0].uri, split)

    # os.listdir returns entries in arbitrary order; sort them so the record
    # order (and therefore whatever `take(1)` shows) is reproducible across runs.
    tfrecord_filenames = [os.path.join(uri, name) for name in sorted(os.listdir(uri))]
    dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

    return dataset

In [6]:
train_dataset = make_dataset(example_gen.outputs['examples'], 'train')

# Count the records by folding over the whole dataset, adding one per element.
train_size = train_dataset.reduce(0, lambda count, _record: count + 1)
print('TRAIN DATASET SIZE: {}'.format(train_size))

print('{:=^80}'.format(' Example '))

# Decode only the first serialized record and pretty-print it.
for tfrecord in train_dataset.take(1):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    pp.pprint(example)

TRAIN DATASET SIZE: 100
features {
  feature {
    key: "synopsis"
    value {
      bytes_list {
        value: "Soccer players Pablo Mastroeni and Jimmy Conrad join Emeril in the kitchen. Lamb and white bean casserole; cowboy chicken casserole. Homemade bagels; kicked-up schmeers; New Orleans crawfish bagels; bagel chips. Dad\'s day off. Barbecue favorites. Artichokes stuffed with ricotta; salami rolls; fried turkey; chocolate-mousse layer cake. Bloody Maria; warm hearts of palm salad; broiled crab backs; fettuccini with shellfish. Salad; cold puree of white bean soup; fried green tomato and rock-shrimp salad; roasted"
      }
    }
  }
  feature {
    key: "tags"
    value {
      bytes_list {
        value: "Food"
        value: "How-To"
      }
    }
  }
}



**The following query allows us to get the entire tag vocab**. This takes about 1 second to run, as opposed to the several minutes it would take to generate examples for every row of data and then calculate the statistics for that data. 

In [7]:
%%bigquery tag_vocab_df

-- One row per distinct tag: UNNEST flattens each row's `tags` array into
-- individual `labels` values and DISTINCT removes the repeats.
-- The result is made available to later cells as `tag_vocab_df`.
SELECT 
  DISTINCT(labels)
FROM `ml-sandbox-101.metadata_sky.merlin_movie_series_data_small`, UNNEST(tags) as labels

Unfortunately, this also causes us to calculate a domain for things like synopsis and content_id, which we don't want. 

In [9]:
# Raise the number of rank-histogram buckets to 1000 for the statistics run.
stats_options = tfdv.StatsOptions(num_rank_histogram_buckets=1000)
# Compute statistics over the examples produced by `example_gen` above.
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'],
    stats_options=stats_options)
context.run(statistics_gen)



0,1
.execution_id,2
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7f417832c050.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f419b02add0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  
objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""].exec_properties['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f419b02add0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f419b02add0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"
.exec_properties,"['stats_options_json']{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}['exclude_splits'][]"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f419b02add0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinek433r6ld/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"

0,1
['stats_options_json'],"{""_generators"": null, ""_feature_whitelist"": null, ""_schema"": null, ""label_feature"": null, ""weight_feature"": null, ""_slice_functions"": null, ""_sample_rate"": null, ""num_top_values"": 20, ""frequency_threshold"": 1, ""weighted_frequency_threshold"": 1.0, ""num_rank_histogram_buckets"": 1000, ""_num_values_histogram_buckets"": 10, ""_num_histogram_buckets"": 10, ""_num_quantiles_histogram_buckets"": 10, ""epsilon"": 0.01, ""infer_type_from_schema"": false, ""_desired_batch_size"": null, ""enable_semantic_domain_stats"": false, ""_semantic_domain_stats_sample_rate"": null, ""_per_feature_weight_override"": null, ""_vocab_paths"": null, ""_add_default_generators"": true}"
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f41782a5550.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1) at 0x7f41d69b3a50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1.span0.split_names[""train""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/pipelinek433r6ld/BigQueryExampleGen/examples/1
.span,0
.split_names,"[""train""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7f419b02add0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/pipelinek433r6ld/StatisticsGen/statistics/2) at 0x7f419b1c4790.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/pipelinek433r6ld/StatisticsGen/statistics/2.span0.split_names[""train""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/pipelinek433r6ld/StatisticsGen/statistics/2
.span,0
.split_names,"[""train""]"


Let's look at the statistics:

In [10]:
# Render the 'statistics' output of statistics_gen in the interactive context.
context.show(statistics_gen.outputs['statistics'])

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Some useful observations from the statistics:
 + Data is never missing (constraint)
 + Can potentially do some thresholds on labels + tags in order to detect drift 
     + Hard to say what is reasonable here, but we can just start with some value and see what happens
 + Can set the label and tag vocabulary if we'd like ("domain") 
     + Can also set constraints on the domain --> e.g. labels should always come from the observed domain. 
 + Need to increase the number of rank_histogram_buckets. 

In [11]:
# URI of the first (index 0) artifact on statistics_gen's 'statistics' output.
stats_path = statistics_gen.outputs['statistics'].get()[0].uri
# Statistics file for the 'train' split, located under that URI.
train_statistics_path = os.path.join(stats_path, 'train/stats_tfrecord')

# Load the train-split statistics; the schema is inferred from these below.
train_statistics = tfdv.load_statistics(train_statistics_path)

In [16]:
# Infer feature types and shapes from the train statistics. With
# max_string_domain_size=50 no domain is inferred for 'tags' (its Domain shows
# as '-' in the schema displayed below) -- presumably because the sample has
# more than 50 distinct tags. The full tag domain is set manually further down.
schema = tfdv.infer_schema(train_statistics, infer_feature_shape=True, max_string_domain_size=50)

In [17]:
# Show the inferred schema: type, presence, valency and domain per feature.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',BYTES,required,"[1,inf)",-


**Below is where we set the domain manually**

In [24]:
# Keep a handle on the 'tags' feature; its constraints are assigned in a later cell.
tags = tfdv.get_feature(schema, 'tags')
# Attach a string domain to 'tags'. np.hstack flattens tag_vocab_df.values into
# a single 1-D sequence, which becomes the domain's list of values.
# NOTE(review): tag_vocab_df is not defined in the cells visible here --
# presumably it holds the separately generated tag vocabulary; confirm it is
# created before this cell when running on a fresh kernel.
tfdv.set_domain(schema, 'tags', schema_pb2.StringDomain(value=np.hstack(tag_vocab_df.values)))

1
4


In [29]:
# `tags` is the feature obtained from `schema` via tfdv.get_feature in an earlier cell.
# Both thresholds are starting values, not tuned ones.
# Require that at least 95% of the observed tag mass falls inside the tag domain.
tags.distribution_constraints.min_domain_mass = 0.95
# Drift threshold on the L-infinity distance between tag distributions.
tags.drift_comparator.infinity_norm.threshold = 0.05

In [30]:
# Persist the schema to a text-format file under ../src/schema.
tfdv.write_schema_text(schema, '../src/schema/schema.pbtxt')

---

In [31]:
# Read the schema back from disk to check what was actually written.
loaded_schema = tfdv.load_schema_text('../src/schema/schema.pbtxt')

In [32]:
def get_domain_size(schema_path, feature):
    """Return the number of values in a feature's domain.

    Args:
        schema_path: Path to a schema file readable by `tfdv.load_schema_text`.
        feature: Name of the feature whose domain is looked up with
            `tfdv.get_domain`.

    Returns:
        The length of the `value` field of the feature's domain.
    """
    feature_domain = tfdv.get_domain(tfdv.load_schema_text(schema_path), feature)
    return len(feature_domain.value)

We can confirm that the loaded schema contains the entire tag domain as well as both features and their types.

In [34]:
# Display the reloaded schema; 'tags' now references a domain ('tags_domain' in the output).
tfdv.display_schema(loaded_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'synopsis',BYTES,required,,-
'tags',STRING,required,"[1,inf)",'tags_domain'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'tags_domain',"'Comedy', 'kids (ages 5-9)', 'Animated', 'Children\'s/Family Entertainment', 'Action & Adventure', 'Outdoors', 'Documentary', 'Parenting', 'Mystery', 'Crime', 'Drama', 'teens (ages 13-14)', 'Teens', 'Special', 'History', 'Military & War', 'Sports non-event', 'Sports', 'Horse', 'Science & Technology', 'News', 'Golf', 'Crime drama', 'Local', 'Hunting', 'Nature', 'older teens (ages 15+)', 'Business & Finance', 'Thriller', 'preschoolers (ages 2-4)', 'Educational', 'Entertainment', 'Romantic comedy', 'Cricket', 'Legal', 'Fantasy', 'Newscast', 'Public Affairs', 'Talk', 'Reality', 'Food', 'How-To', 'Mixed Martial Arts', 'History drama', 'Hockey', 'tweens (ages 10-12)', 'Home & Garden', 'Western', 'Travel', 'Computers', 'Football', 'Animals', 'Politics & Government', 'American history', 'Soccer', 'Interview', 'Paranormal', 'Docudrama', 'Hip-Hop & Rap Music', 'Music', 'Romance comedy', 'not for kids', 'Sports talk', 'Newsmagazine', 'Anime', 'Erotic', 'Self improvement', 'Game Show', 'Basketball', 'Baseball', 'Christmas', 'Holiday', 'Sitcom', 'Comedy drama', 'Consumer', 'Medical', 'Health', 'Motor Sports', 'Motorcycle Racing', 'Horror', 'Poker', 'Card Games', 'Soap Opera', 'Auto Racing', 'Debate', 'Romance', 'Gay and Lesbian', 'SportingEvent', 'Classic Sport Event', 'Variety', 'Science fiction', 'Auto', 'Biography', 'Pets', 'Winter Olympics', 'Olympics', 'Anthology', 'Fashion', 'Lacrosse', 'Fishing', 'Musical', 'Volleyball', 'Pro Wrestling', 'Home improvement', 'Gospel Music', 'Religion', 'Alternative Music', 'Art', 'Miniseries', 'Shopping', 'PyeongChang 2018', 'Luge', 'Gaming', 'Awards', 'Weather', 'Historical drama', 'Auction', 'Collectibles', 'Horse Racing', 'R&B Music', 'Boat', 'Action Sports', 'Fitness', 'Documentary drama', 'Softball', 'Real Estate', 'Dance', 'Crafts', 'Independent', 'Snowmobiling', 'Wrestling', 'Boxing', 'Foreign', 'Cross-Country Skiing', 'Pop Music', 'Cycling', 'Motorcycle', 'Event', 'Skateboarding', 'Martial arts', 'Musical comedy', 
'Yacht Racing', 'Parade', 'Theater', 'Performing arts', 'Agriculture', 'Environment', 'Country Music', 'Weightlifting', 'Tennis', 'Shooting', 'Playoffs', 'Aviation', 'Summer Olympics', 'Archery', 'Equestrian', 'Rock Music', 'Swimming', 'Gymnastics', 'Rugby', 'Halloween', '2018 World Cup', 'Boating', 'Snowboarding', 'Cheerleading', 'Courtroom', 'Standup', 'Concert', 'Bowling', 'Curling', 'Reggae Music', 'Thanksgiving', 'Amazon Original', 'Preschool', 'Sumo Wrestling', 'Rodeo', 'Filmreihe', 'Skiing', 'Ballet', 'Advice', 'Weddings', 'Water Sports', 'Ice Skating', 'Speedskating', 'Netflix Original', 'Synchronized Swimming', 'Wall Street', 'Kayaking', 'Heavy Metal Music', 'Bicycle racing', 'Valentine\'s Day', 'eSports', 'Dark comedy', 'Dog Show', 'Fundraiser/Telethon', 'Triathlon', 'Bollywood', 'Latin Music', 'Rio 2016', 'Short track speed skating', 'Snooker', 'Track & Field', 'Diving', 'Suspense', 'Polo', 'Roller derby', 'Women\'s Curling', 'Men\'s Ice Hockey', 'Beach Volleyball', 'Figure Skating', 'Alpine Skiing', 'Modern Pentathlon', 'Taekwondo', 'July 4th', 'Water Skiing', 'Aerobics', 'Bicycle', 'Running', 'Bull riding', 'Women\'s Ice Hockey', 'Ancient history', 'World history', 'Based on the Video Game', 'Karaoke', 'Drag Racing', 'Easter', 'Classical Music', 'Arm Wrestling', 'Folk Music', 'Rugby union', 'Opera', 'Mountain Biking', 'Surfing', 'Water Polo', 'Sailing', 'Dog sled', 'Fencing', 'New Year\'s', 'Quebec Production', 'Billiards', 'Bluegrass Music', 'Table Tennis', 'Blues Music', 'Rowing', 'Techno Music', 'Biathlon', 'Rugby league', 'Multi-sport event', 'Art de Vivre', 'Rhythmic Gymnastics', 'Classic Movies', 'Field Hockey', 'Footvolley', 'Bullfighting', 'Easy Listening Music', 'St. 
Patrick\'s Day', 'Labor Day', 'Hanukkah', 'Skeleton', 'Mixed Pairs', 'Darts', 'Men\'s Moguls', 'Women\'s Moguls', 'Mixed Team', 'Bodybuilding', 'World Music', 'Jazz Music', 'Judo', 'Gaelic Football', 'Hurling', 'Bowls', 'Adaptation', 'Family Entertainment', 'Sequel', 'Badminton', 'Handball', 'Women\'s Parallel Giant Slalom', 'Men\'s Parallel Giant Slalom', 'Canoeing', '20 de noviembre', 'Bobsledding', 'Women\'s 2-Man Competition', 'Men\'s Individual', 'Women\'s Giant Slalom', 'Men\'s Snowboard Cross', 'Mixed Ice Dance', 'Mixed 2 x 6 + 2 x 7.5km Relay', 'Women\'s 3000m Relay', 'Men\'s Gundersen Large Hill / 10km', 'Magazine', 'Pro', 'Open 4-Man Competition', 'Women\'s Ski Cross', 'Soul Music', 'Australian Football', 'Boxing Day', 'Men\'s Individual - Large Hill', 'Men\'s 1000m', 'Women\'s Individual', 'Men\'s Singles', 'Women\'s 1500m', 'Hydroplane racing', 'Women\'s Halfpipe', 'Men\'s Aerials', 'Men\'s Giant Slalom', 'Women\'s 1000m', 'Men\'s Downhill', 'Trampoline Gymnastics', 'Marathon', 'Ski Jumping', 'Men\'s Team Pursuit 8 Laps', 'Men\'s 15km Mass Start', 'Women\'s 500m', 'Men\'s 4 x 10km Relay', 'Men\'s Slalom', 'Women\'s Alpine Combined', 'Beach soccer', 'Women\'s Aerials', 'Men\'s Curling', 'Women\'s Singles', 'Mixed Curling', 'Women\'s 10km Pursuit', 'Men\'s 12.5km Pursuit', 'Women\'s Single', 'Women\'s Individual - Normal Hill', 'Pool', 'Men\'s 500m', 'Men\'s 2-Man Competition', 'Women\'s Team Pursuit 6 Laps', 'Racquetball', 'Lifestyle', 'Reality Competition', 'Spin-Off', 'Men\'s Halfpipe', 'Classic Sports', 'Intl soccer', 'Men\'s 5000m Relay', 'Women\'s Downhill', 'Women\'s 30km Classic Style', 'Cartoon', 'Drift racing', 'Men\'s 10km Sprint', 'Men\'s 5000m', 'Men\'s 2 x 15km Skiathlon', 'Women\'s Team Sprint Freestyle', 'Men\'s Gundersen Normal Hill / 10km', 'Funk Music', 'Cinco de Mayo', 'Coming of Age', 'Men\'s Big Air', 'Animated Comedy', 'Retrospective', 'Men\'s 4 x 7.5km Relay', 'Women\'s Slopestyle', 'Men\'s Slopestyle', 'Ringuette', 'Men\'s 
Individual - Normal Hill', 'Women\'s Mass Start 16 Laps', 'Men\'s Mass Start 16 Laps', 'Men\'s Team Sprint Freestyle', 'Men\'s 1500m', 'Women\'s Sprint Classic Style', 'Men\'s Sprint Classic Style', 'Men\'s Single', 'Goth Music', 'Women\'s 10km Freestyle', 'Women\'s 15km Individual', 'Women\'s 4 x 6km Relay', 'Men\'s Team Large Hill / 4 x 5km', 'Women\'s Big Air', 'Men\'s Ski Cross', 'Dating', 'Blackjack', 'Komödie', 'Canada Day', 'Senior Citizen', 'Men\'s Alpine Combined', 'Serial', 'Men\'s 10000m', 'Mixed Team - Relay', 'Women\'s 7.5km Sprint', 'Women\'s 3000m', 'Freestyle Skiing', 'Open Double', 'Nordic Combined', 'Yom Kippur', 'Women\'s 2 x 7.5km Skiathlon', 'Men\'s 15km Freestyle', 'Dog Racing', 'Investigative', 'Men\'s 20km Individual', 'Short Subject', 'Soft Rock', 'Ska Music', 'Men\'s 50km Classic Style', '4K Test Tag', 'Women\'s 4 x 5km Relay', 'Women\'s 5000m', 'Men\'s Team - Large Hill', 'Women\'s 12.5km Mass Start', 'Trains', 'Arts & Literature', 'Women\'s Snowboard Cross', 'Women\'s Slalom', 'Men\'s Super G', 'Floorball', 'Women\'s Super G', 'Behind the Scenes', 'Highlights'"


In [33]:
# Count the values stored in the 'tags' domain of the saved schema file.
get_domain_size('../src/schema/schema.pbtxt', 'tags')

409