# StatisticsGen Test run

## Set up

TFX requires Apache Airflow and the Docker SDK for Python.

In [None]:
# Pin TFX to the release this notebook was written and executed against (0.13.0).
!pip install 'apache-airflow[gcp]' docker tfx==0.13.0

In this notebook, we use TFX version 0.13.0

In [1]:
import tfx
# Last expression of the cell: shows the installed TFX version string.
tfx.version.__version__

'0.13.0'

TFX requires TensorFlow >= 1.13.1

In [2]:
import tensorflow as tf
# Turn on eager execution, then show the installed TensorFlow version string.
tf.enable_eager_execution()
tf.__version__

'1.13.1'

TFX supports Python 3.5 as of version 0.13.0.

In [3]:
import sys
# Last expression of the cell: shows the version of the running Python interpreter.
sys.version

'3.5.2 (default, Nov 12 2018, 13:43:14) \n[GCC 5.4.0 20160609]'

## Download sample data

In [4]:
%%bash
# Abort the cell on the first failing command so that a failed download cannot
# go unnoticed and only surface later as a confusing pipeline error.
set -euo pipefail

# Start from a clean data directory so that this notebook can be run more than
# once: there must not be train/eval files under ~/taxi/data, since TFX 0.13.0
# can only handle a single input file. `rm -rf` is a no-op when the directory
# does not exist, so no existence check is needed.
rm -rf ~/taxi/data

# Download the taxi sample data (-nv keeps wget's output to one line per file).
mkdir -p ~/taxi/data/simple
mkdir -p ~/taxi/serving_model/taxi_simple
wget -nv https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv -O ~/taxi/data/simple/data.csv

# Download the taxi utility module.
wget -nv https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py -O ~/taxi/taxi_utils.py

--2019-06-13 06:14:40--  https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.108.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1922668 (1.8M) [text/plain]
Saving to: ‘/root/taxi/data/simple/data.csv’

     0K .......... .......... .......... .......... ..........  2% 3.81M 0s
    50K .......... .......... .......... .......... ..........  5% 8.93M 0s
   100K .......... .......... .......... .......... ..........  7% 23.1M 0s
   150K .......... .......... .......... .......... .......... 10% 6.94M 0s
   200K .......... .......... .......... .......... .......... 13% 3.32M 0s
   250K .......... .......... .......... .......... .......... 15% 17.2M 0s
   300K .......... .......... .......... .......... .......... 18% 14.7M 0s
   350K ........

## Import

In [5]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import logging
import os
from google.protobuf import json_format

from tfx.components.base.base_component import ComponentOutputs
from tfx.components.evaluator.component import Evaluator
from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen
from tfx.components.example_validator.component import ExampleValidator
from tfx.components.model_validator.component import ModelValidator
from tfx.components.pusher.component import Pusher
from tfx.components.schema_gen.component import SchemaGen
from tfx.components.statistics_gen.component import StatisticsGen
from tfx.components.trainer.component import Trainer
from tfx.components.transform.component import Transform
from tfx.orchestration import pipeline
from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner
from tfx.proto import evaluator_pb2
from tfx.proto import example_gen_pb2
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.utils.dsl_utils import csv_input
from tfx.utils.channel import Channel
from tfx.utils import types

  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '


## Configs

In [6]:
# Taxi example locations: the data is expected under ~/taxi/data and the taxi
# utility module directly under ~/taxi. Adjust these to suit your setup.
_taxi_root = os.path.join(os.environ['HOME'], 'taxi')
_data_root = os.path.join(_taxi_root, 'data/simple')
# User module that injects customized logic into the TFX components; both
# Transform and Trainer need user-defined functions to run successfully.
_taxi_module_file = os.path.join(_taxi_root, 'taxi_utils.py')
# Directory the model server can listen to; Pusher writes the trained model here.
_serving_model_dir = os.path.join(_taxi_root, 'serving_model/taxi_simple')

# TFX working directories. Everything is kept relative to $HOME in this
# example, but the files can live anywhere on the local filesystem.
_tfx_root = os.path.join(os.environ['HOME'], 'tfx')
_pipeline_root = os.path.join(_tfx_root, 'pipelines')
_metadata_db_root = os.path.join(_tfx_root, 'metadata')
_log_root = os.path.join(_tfx_root, 'logs')

# Settings handed to Airflow unchanged.
_airflow_config = dict(
    schedule_interval=None,
    start_date=datetime.datetime(year=2019, month=1, day=1),
)

# Logging overrides.
logger_overrides = dict(log_root=_log_root, log_level=logging.INFO)

## Create ExampleGen

In [7]:
"""Implements the chicago taxi pipeline with TFX."""
examples = csv_input(_data_root)

# Output artifacts: one ExamplesPath per split. Note that their URIs live under
# _data_root, i.e. inside the directory that is also used as the CSV input base.
train_examples = types.TfxType(type_name='ExamplesPath', split='train')
train_examples.uri = os.path.join(_data_root, 'train/')
eval_examples = types.TfxType(type_name='ExamplesPath', split='eval')
eval_examples.uri = os.path.join(_data_root, 'eval/')

output_dict = {
    'examples': Channel(
        type_name='ExamplesPath',
        static_artifact_collection=[train_examples, eval_examples],
    ),
}
outputs = ComponentOutputs(output_dict)

# Split configuration: 2 hash buckets for 'train' and 1 for 'eval'.
train_config = example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2)
eval_config = example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
output_config = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[train_config, eval_config]),
)

# Brings data into the pipeline or otherwise joins/converts training data.
example_gen = CsvExampleGen(
    input_base=examples,
    output_config=output_config,
    outputs=outputs,
)

## Create StatisticsGen

In [8]:
# Output artifacts: one ExampleStatisticsPath per split, each pointing at a
# stats/ sub-directory of that split's directory under _data_root.
train_statistics = types.TfxType(type_name='ExampleStatisticsPath', split='train')
eval_statistics = types.TfxType(type_name='ExampleStatisticsPath', split='eval')
train_statistics.uri = os.path.join(_data_root, 'train/stats/')
eval_statistics.uri = os.path.join(_data_root, 'eval/stats/')

output_dict = {
    'output': Channel(
        type_name='ExampleStatisticsPath',
        static_artifact_collection=[train_statistics, eval_statistics],
    ),
}
outputs = ComponentOutputs(output_dict)

# `name` is optional, but it has to be unique when the same pipeline contains
# more than one StatisticsGen.
statistics_gen = StatisticsGen(
    input_data=example_gen.outputs.examples,
    name='Statistics Generator',
    outputs=outputs,
)

## Execute

In [9]:
def execute_component(component):
    """Invoke a component's executor directly on the artifacts in its channels.

    Args:
        component: Object exposing `input_dict` (name -> channel),
            `outputs.get_all()` (name -> channel), `exec_properties` and an
            `executor` factory whose instances provide `Do(...)`.

    Returns:
        None. Whatever the executor's `Do` produces is written by the executor.
    """
    # Resolve every input channel to the artifacts it holds.
    resolved_inputs = {}
    for name, channel in component.input_dict.items():
        resolved_inputs[name] = channel.get()

    # Same for the output channels.
    resolved_outputs = {}
    for name, channel in component.outputs.get_all().items():
        resolved_outputs[name] = channel.get()

    properties = component.exec_properties
    runner = component.executor()
    runner.Do(resolved_inputs, resolved_outputs, properties)

# Run the components one after the other: ExampleGen first, because
# StatisticsGen takes ExampleGen's `examples` output as its input.
for component in (example_gen, statistics_gen):
    execute_component(component)

INFO:tensorflow:Starting Executor execution.
[2019-06-13 06:14:50,756] {base_executor.py:72} INFO - Starting Executor execution.
INFO:tensorflow:Inputs for Executor is: {"input-base": [{"artifact": {"uri": "/root/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"name": "ExternalPath", "properties": {"name": "STRING", "state": "STRING", "type_name": "STRING", "span": "INT", "split": "STRING"}}}]}
[2019-06-13 06:14:50,765] {base_executor.py:74} INFO - Inputs for Executor is: {"input-base": [{"artifact": {"uri": "/root/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"name": "ExternalPath", "properties": {"name": "STRING", "state": "STRING", "type_name": "STRING", "span": "INT", "split": "STRING"}}}]}
INFO:tensorflow:Outputs for Executor is: {"examples": [{"artifact": {"uri": "/root/taxi/data/simple/train/", "properties": {"split

[2019-06-13 06:14:59,398] {fn_api_runner.py:437} INFO - Running (ref_PCollection_PCollection_42/Read)+((ref_AppliedPTransform_OutputSpliteval/Write/WriteImpl/PreFinalize_76)+(ref_PCollection_PCollection_51/Write))
[2019-06-13 06:14:59,432] {fn_api_runner.py:437} INFO - Running (ref_PCollection_PCollection_42/Read)+(ref_AppliedPTransform_OutputSpliteval/Write/WriteImpl/FinalizeWrite_77)
[2019-06-13 06:14:59,443] {filebasedsink.py:290} INFO - Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
[2019-06-13 06:14:59,548] {filebasedsink.py:327} INFO - Renamed 1 shards in 0.10 seconds.
[2019-06-13 06:14:59,571] {fn_api_runner.py:437} INFO - Running ((ref_AppliedPTransform_OutputSplittrain/Write/WriteImpl/DoOnce/Read_40)+((ref_AppliedPTransform_OutputSplittrain/Write/WriteImpl/InitializeWrite_41)+(ref_PCollection_PCollection_25/Write)))+(ref_PCollection_PCollection_24/Write)
[2019-06-13 06:14:59,602] {fn_api_runner.py:437} INFO - Running ((ShuffleSplitt

[2019-06-13 06:15:11,777] {fn_api_runner.py:437} INFO - Running ((((((GenerateStatistics.train/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/TopKUniques_CountSlicedFeatureNameValueTuple/CombinePerKey(CountCombineFn)/Group/Read)+(GenerateStatistics.train/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/TopKUniques_CountSlicedFeatureNameValueTuple/CombinePerKey(CountCombineFn)/Merge))+(GenerateStatistics.train/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/TopKUniques_CountSlicedFeatureNameValueTuple/CombinePerKey(CountCombineFn)/ExtractOutputs))+(ref_AppliedPTransform_GenerateStatistics.train/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/TopKUniques_ModifyKeyToSlicedFeatureName_146))+(GenerateStatistics.train/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/TopK_GetTopK/CombinePerKey(TopCombineFn)/Precombine))+(((ref_AppliedPTransform_GenerateStatistics.tr

[2019-06-13 06:15:15,743] {fn_api_runner.py:437} INFO - Running ((GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/Uniques_CountPerFeatureName/CombinePerKey(CountCombineFn)/Group/Read)+(GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/Uniques_CountPerFeatureName/CombinePerKey(CountCombineFn)/Merge))+(((GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/Uniques_CountPerFeatureName/CombinePerKey(CountCombineFn)/ExtractOutputs)+(ref_AppliedPTransform_GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/Uniques_ConvertToSingleFeatureStats_63))+(GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/TopKUniquesStatsGenerator/FlattenTopKUniquesResults/Write/1))
[2019-06-13 06:15:15,762] {fn_api_runner.py:437} INFO - Running ((GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatis

[2019-06-13 06:15:16,878] {fn_api_runner.py:437} INFO - Running (((ref_AppliedPTransform_WriteStatsOutput.eval/Write/WriteImpl/DoOnce/Read_94)+(ref_AppliedPTransform_WriteStatsOutput.eval/Write/WriteImpl/InitializeWrite_95))+(ref_PCollection_PCollection_55/Write))+(ref_PCollection_PCollection_56/Write)
[2019-06-13 06:15:16,902] {fn_api_runner.py:437} INFO - Running ((GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/ToList/ToList/CombinePerKey/Group/Read)+(GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/ToList/ToList/CombinePerKey/Merge))+((GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/ToList/ToList/CombinePerKey/ExtractOutputs)+((ref_AppliedPTransform_GenerateStatistics.eval/RunStatsGenerators/GenerateSlicedStatisticsImpl/ToList/ToList/UnKey_85)+(ref_PCollection_PCollection_51/Write)))
[2019-06-13 06:15:16,926] {fn_api_runner.py:437} INFO - Running (ref_AppliedPTransform_GenerateStatistics.eval/RunStatsGenerators/

## Check Result

In [10]:
# List the configured data directory rather than a hard-coded /root/... path,
# so the check also works when $HOME is not /root.
!ls -Rlhs "{_data_root}"

/root/taxi/data/simple/:
total 1.9M
1.9M -rw-r--r-- 1 root root 1.9M Jun 13 06:14 data.csv
4.0K drwxr-xr-x 3 root root 4.0K Jun 13 06:15 eval
4.0K drwxr-xr-x 3 root root 4.0K Jun 13 06:15 train

/root/taxi/data/simple/eval:
total 208K
204K -rw-r--r-- 1 root root 201K Jun 13 06:14 data_tfrecord-00000-of-00001.gz
4.0K drwxr-xr-x 2 root root 4.0K Jun 13 06:15 stats

/root/taxi/data/simple/eval/stats:
total 20K
20K -rw-r--r-- 1 root root 17K Jun 13 06:15 stats_tfrecord

/root/taxi/data/simple/train:
total 412K
408K -rw-r--r-- 1 root root 405K Jun 13 06:15 data_tfrecord-00000-of-00001.gz
4.0K drwxr-xr-x 2 root root 4.0K Jun 13 06:15 stats

/root/taxi/data/simple/train/stats:
total 20K
20K -rw-r--r-- 1 root root 18K Jun 13 06:15 stats_tfrecord


In [20]:
def get_stats_directories(statistics_gen):
    """Return the statistics output directory for every input split.

    The directories are returned in the order in which the splits first appear
    among the component's input artifacts, so the result is deterministic.

    Args:
        statistics_gen: Component whose 'input_data' input channel and 'output'
            output channel hold artifacts carrying a `split` attribute.

    Returns:
        A list with one statistics URI per distinct input split.
    """
    input_artifacts = statistics_gen.input_dict['input_data'].get()
    output_artifacts = statistics_gen.outputs.get_all()['output'].get()

    # Collect the distinct splits in first-seen order. Building a dict keyed by
    # split and iterating over it would make the order of the result depend on
    # dict iteration order, which is not guaranteed before Python 3.7.
    splits = []
    for artifact in input_artifacts:
        if artifact.split not in splits:
            splits.append(artifact.split)

    return [types.get_split_uri(output_artifacts, split) for split in splits]

# Statistics output directories of the StatisticsGen component, one per split.
directories = get_stats_directories(statistics_gen=statistics_gen)

In [16]:
import tensorflow_data_validation as tfdv
from tfx.utils import io_utils

# Select the training statistics by split name rather than by list position,
# so this cell does not depend on the order in which the splits are listed.
train_stats_dir = types.get_split_uri(
    statistics_gen.outputs.get_all()['output'].get(), 'train')
train_stats = tfdv.load_statistics(io_utils.get_only_uri_in_dir(train_stats_dir))

In [17]:
# Hand the loaded statistics (`train_stats`) to TFDV's visualizer.
tfdv.visualize_statistics(train_stats)

In [19]:
# Infer a schema from the loaded statistics; the bare `schema` expression on
# the last line makes the notebook display the result.
schema = tfdv.infer_schema(train_stats)
schema

feature {
  name: "trip_start_timestamp"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "fare"
  type: FLOAT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_start_day"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "pickup_community_area"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "dropoff_longitude"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "pickup_census_tract"
  type: BYTES
  presence {
    min_count: 0
  }
}
feature {
  name: "trip_start_hour"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "pickup_lat