In [1]:
import os
from pathlib import Path
import shutil

import pandas as pd
import numpy as np
import tensorflow as tf
from tfx.components import CsvExampleGen, ImportExampleGen
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext  # 대화형 컴포넌트 관리자
from tfx.proto import example_gen_pb2  # example_gen 옵션 지정용

2023-01-13 12:53:26.092747: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 이번 노트북에서 공통적으로 사용할 경로
dir_path = Path().parent.absolute()
data_dir = os.path.join(dir_path, "data")
complaints_dir = os.path.join(data_dir, "complaints")

In [3]:
context = InteractiveContext()



# TFRecord
 장점
  - 플랫폼 독립적인 protobuf 기반
  - 대용량 데이터 고속 처리
  - TF 생태계에 통용되는 형식

In [2]:
record_path = "test.tfrecord"
with tf.io.TFRecordWriter(record_path) as writer:
    writer.write(b"First example")
    writer.write(b"Second example")


In [3]:
 for example in tf.data.TFRecordDataset(record_path):
    print(example)

2023-01-10 04:13:48.413612: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tf.Tensor(b'First example', shape=(), dtype=string)
tf.Tensor(b'Second example', shape=(), dtype=string)


# ExampleGen
데이터를 다음과 같은 곳에서 읽을 수 있다:
  - CSV(`CsvExampleGen`)
  - TFRecord(`ImportExampleGen`)
  - Apache Avro
  - Apache Parquet

## CSV 파일 읽어들이기

In [67]:
csv_dir = os.path.join(data_dir, "taxi")

In [10]:
# `input_base`에 데이터 파일만 넣어야 하며, 패턴 지정이 없는 한 하위 디렉토리는 읽지 않는다.
csv_example_gen = CsvExampleGen(input_base=csv_dir)
context.run(csv_example_gen)



0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7fd560763c70.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd56074d760.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']/home/hakjun/projects/pipeline/data/taxi['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd56074d760.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd56074d760.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/taxi['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd56074d760.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/taxi
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd56074d760.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1) at 0x7fd58a4a5b50.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


## TFRecord 읽어들이기(tfds에서 받은 mnist trainset 사용)

In [16]:
rec_example_gen = ImportExampleGen(
    input_base=os.path.join(complaints_dir, "records"))
context.run(rec_example_gen)
context.show(rec_example_gen)

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd55f47a340.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5) at 0x7fd5eb79dcd0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/tfrecords['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:19832042,xor_checksum:1673325422,sum_checksum:1673325422"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fd55f47a340.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5) at 0x7fd5eb79dcd0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5) at 0x7fd5eb79dcd0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5) at 0x7fd5eb79dcd0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T04_19_06.444458-1_blds93/ImportExampleGen/examples/5
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/tfrecords
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:19832042,xor_checksum:1673325422,sum_checksum:1673325422"


## 임의의 데이터 읽어 TFRecord로 변환하기

In [7]:
filepath = tf.keras.utils.get_file(
    "complaints.csv.zip",
    "http://files.consumerfinance.gov/ccdb/complaints.csv.zip"
)

NameError: name 'tf' is not defined

In [6]:
processed_dir = os.path.join(complaints_dir, "processed")
Path(processed_dir).mkdir(parents=True, exist_ok=False)

FileExistsError: [Errno 17] File exists: '/home/hakjun/projects/pipeline/data/complaints/processed'

In [7]:
# VM 인스턴스에서 너무 느려서 직접 풀고 주석 처리
# shutil.unpack_archive(filepath, data_dir)

In [4]:
df = pd.read_csv(os.path.join(complaints_dir, "raw", "complaints.csv"))

  df = pd.read_csv(os.path.join(complaints_dir, "raw", "complaints.csv"))


In [5]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [6]:
df.loc[df["Consumer disputed?"] == "", "Consumer disputed?"] = np.nan

In [7]:
# 다음 열 중 비어 있는 레코드는 제외
df = df.dropna(subset=["Consumer complaint narrative", "Consumer disputed?", "ZIP code"])

In [8]:
# "Consumer disputed?" 열은 카테고리임을 알 수 있다.
set(df["Consumer disputed?"].values)

{'No', 'Yes'}

In [9]:
# 이 열을 0과 1인 바이너리로 인코딩
df.loc[df["Consumer disputed?"] == "Yes", "Consumer disputed?"] = 1
df.loc[df["Consumer disputed?"] == "No", "Consumer disputed?"] = 0

In [10]:
# "ZIP code"열 전처리
df["ZIP code"] = df.astype({"ZIP code": "string"})["ZIP code"].str.replace(".0", "")

  df["ZIP code"] = df.astype({"ZIP code": "string"})["ZIP code"].str.replace(".0", "")


In [11]:
# 빈 문자열이거나 null을 "000000"로 변환
df.loc[df["ZIP code"] == "", "ZIP code"] = "000000"
df.loc[pd.isna(df["ZIP code"]), "ZIP code"] = "000000"

In [12]:
df = df[df["ZIP code"].str.len() == 5]

In [13]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
136386,2016-05-09,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,XXXX of XXXX complaints regarding same company...,Company has responded to the consumer and the ...,Specialized Loan Servicing Holdings LLC,CA,95472,,Consent provided,Web,2016-05-09,Closed with explanation,Yes,0,1915601
144085,2016-06-01,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,"My Mortgage Company, Ditech, failed to apply p...",Company believes complaint is the result of an...,Ditech Financial LLC,SC,29414,,Consent provided,Web,2016-06-01,Closed with explanation,Yes,1,1950363
148160,2017-04-18,Mortgage,FHA mortgage,"Loan modification,collection,foreclosure",,My husband and I are going through a financial...,,"PENNYMAC LOAN SERVICES, LLC.",AZ,85338,,Consent provided,Web,2017-04-18,Closed with explanation,Yes,0,2440537
148164,2017-04-17,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,In XX/XX/2016 I contacted ACS regarding incorr...,,ACS Education Services,CA,92223,,Consent provided,Web,2017-04-17,Closed with explanation,Yes,0,2438881
148166,2017-04-21,Student loan,Non-federal student loan,Dealing with my lender or servicer,Trouble with how payments are handled,This complaint is further evidence tha t Navie...,,"Navient Solutions, LLC.",TX,78746,,Consent provided,Web,2017-04-21,Closed with explanation,Yes,0,2446956


In [14]:
df["ZIP code"] = df["ZIP code"].str.replace("XX", "00")
df = df.reset_index(drop=True)
df["ZIP code"] = pd.to_numeric(df["ZIP code"], errors="coerce")
df = df[df["ZIP code"].notna()]
df["ZIP code"] = df["ZIP code"].astype("int")

In [15]:
df.to_csv(os.path.join(processed_dir, "processed-complaints.csv"), index=False)

NameError: name 'processed_dir' is not defined

In [16]:
def _bytes_feature(value: str):
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[value.encode()])
    )


def _float_feature(value: float):
    return tf.train.Feature(
        float_list=tf.train.FloatList(value=[value])
    )


def __int64_feature(value: int):
    return tf.train.Feature(
        int64_list=tf.train.Int64List(value=[value])
    )

In [21]:
df = pd.read_csv(os.path.join(processed_dir, "processed-complaints.csv"))

In [17]:
# 분할된 tfrecord들을 ExampleGen으로 읽는 경우도 생각해서 쪼개 놓겠다
train_num_samples = int(0.5 * len(df))
eval_num_samples = int(0.2 * len(df))
test_num_samples = len(df) - (train_num_samples + eval_num_samples)

trainset = df.iloc[:train_num_samples]
evalset = df.iloc[train_num_samples:train_num_samples + eval_num_samples]
testset = df.iloc[train_num_samples + eval_num_samples:]

In [18]:
record_path = os.path.join(complaints_dir, "records", "consumer-complaints.tfrecord")
split_dir = os.path.join(complaints_dir, 'splits')

In [19]:
# writer 컨텍스트 내에서 파일 생성이 아니라면 종료 전 `record_writer.close()`를 실행할 것.
def write_tfrecord(dataframe, filepath):
    with tf.io.TFRecordWriter(filepath) as record_writer:
        for idx, row in dataframe.iterrows():
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        "product": _bytes_feature(str(row["Product"])),
                        "sub_product": _bytes_feature(str(row["Sub-product"])),
                        "issue": _bytes_feature(str(row["Issue"])),
                        "sub_issue": _bytes_feature(str(row["Sub-issue"])),
                        "state": _bytes_feature(str(row["State"])),
                        "zip_code": __int64_feature(int(row["ZIP code"])),
                        "company": _bytes_feature(str(row["Company"])),
                        "company_response": _bytes_feature(str(row["Company public response"])),
                        "timely_response": _bytes_feature(str(row["Timely response?"])),
                        "consumer_disputed": _float_feature(float(row["Consumer disputed?"])),
                        "consumer_complaint_narrative": _bytes_feature(str(row["Consumer complaint narrative"]))
                    }
                )
            )
            # 데이터 직렬화
            record_writer.write(example.SerializeToString())

In [21]:
write_tfrecord(df, record_path)

write_tfrecord(trainset, os.path.join(split_dir, "train.tfrecord"))
write_tfrecord(evalset, os.path.join(split_dir, "eval.tfrecord"))
write_tfrecord(testset, os.path.join(split_dir, "test.tfrecord"))

In [36]:
# 이렇게 만든 tfrecord 파일을 이제 ExampleGen 컴포넌트로 읽어들일 수 있다.
rec_example_gen = ImportExampleGen(
    input_base=os.path.join(complaints_dir, "records"))
context.run(rec_example_gen)
context.show(rec_example_gen)

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f9ce08b5b50.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2) at 0x7f9cd9f826d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/complaints/records['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:100971035,xor_checksum:1673489812,sum_checksum:1673489812"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f9ce08b5b50.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2) at 0x7f9cd9f826d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2) at 0x7f9cd9f826d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2) at 0x7f9cd9f826d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-12T02_09_36.578374-9iejjbtz/ImportExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/complaints/records
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:100971035,xor_checksum:1673489812,sum_checksum:1673489812"


## 원격 데이터 수집

In [64]:
# 가상의 스토리지이므로 작동 안 함
remote_example_gen = CsvExampleGen(input_base="gs://example/")
context.run(remote_example_gen)
context.show(remote_example_gen)

RuntimeError: Split pattern gs://example_compliance_data/* does not match any files.

# 데이터 준비

## 데이터 분할

### 직접 분할 지정

In [71]:
# 분할 설정 지정: 지정 없으면 기본 설정은 train:test = 2:1이다.
output_config = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=6),
        example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=2)
    ])
)

csv_example_gen = CsvExampleGen(input_base=csv_dir, output_config=output_config)
context.run(csv_example_gen)

0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f905dc153a0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f90565c6ac0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']/home/hakjun/projects/pipeline/data/taxi['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f90565c6ac0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f90565c6ac0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/taxi['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f90565c6ac0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/taxi
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f90565c6ac0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1) at 0x7f90565c6b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


In [72]:
for artifact in csv_example_gen.outputs['examples'].get():
    print(artifact)  # "split_names" 특성에서 분할 결과 확인 가능.

Artifact(artifact: id: 1
type_id: 14
uri: "/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/CsvExampleGen/examples/1"
properties {
  key: "split_names"
  value {
    string_value: "[\"train\", \"eval\", \"test\"]"
  }
}
custom_properties {
  key: "file_format"
  value {
    string_value: "tfrecords_gzip"
  }
}
custom_properties {
  key: "input_fingerprint"
  value {
    string_value: "split:single_split,num_files:1,total_bytes:1922812,xor_checksum:1673313402,sum_checksum:1673313402"
  }
}
custom_properties {
  key: "payload_format"
  value {
    string_value: "FORMAT_TF_EXAMPLE"
  }
}
custom_properties {
  key: "span"
  value {
    int_value: 0
  }
}
custom_properties {
  key: "state"
  value {
    string_value: "published"
  }
}
custom_properties {
  key: "tfx_version"
  value {
    string_value: "1.12.0"
  }
}
state: LIVE
, artifact_type: id: 14
name: "Examples"
properties {
  key: "span"
  value: INT
}
properties {
  key: "split_names"
  value: STRING
}
properties {
  key: "

### 이미 분할된 파일을 가져오기

In [103]:
input_config = example_gen_pb2.Input(
    splits=[
        # '00000-00100' 이런 식으로 분할별 파일이 많아지면 "train/*" 이런 패턴이 더 낫음.
        example_gen_pb2.Input.Split(name="train", pattern="train.tfrecord"),
        example_gen_pb2.Input.Split(name="eval", pattern="eval.tfrecord"),
        example_gen_pb2.Input.Split(name="test", pattern="test.tfrecord")
    ]
)
example_gen = ImportExampleGen(input_base=split_dir, input_config=input_config)

In [104]:
context.run(example_gen)

0,1
.execution_id,8
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } ImportExampleGen at 0x7f905fb726a0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f905fb66d00.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']/home/hakjun/projects/pipeline/data/tfrecords/splits['input_config']{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train.tfrecord""  },  {  ""name"": ""eval"",  ""pattern"": ""eval.tfrecord""  },  {  ""name"": ""test"",  ""pattern"": ""test.tfrecord""  }  ] }['output_config']{}['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:train,num_files:1,total_bytes:24762731,xor_checksum:1673335398,sum_checksum:1673335398 split:eval,num_files:1,total_bytes:24762731,xor_checksum:1673335414,sum_checksum:1673335414 split:test,num_files:1,total_bytes:24762731,xor_checksum:1673335431,sum_checksum:1673335431"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f905fb66d00.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f905fb66d00.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/tfrecords/splits['input_config']{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train.tfrecord""  },  {  ""name"": ""eval"",  ""pattern"": ""eval.tfrecord""  },  {  ""name"": ""test"",  ""pattern"": ""test.tfrecord""  }  ] }['output_config']{}['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:train,num_files:1,total_bytes:24762731,xor_checksum:1673335398,sum_checksum:1673335398 split:eval,num_files:1,total_bytes:24762731,xor_checksum:1673335414,sum_checksum:1673335414 split:test,num_files:1,total_bytes:24762731,xor_checksum:1673335431,sum_checksum:1673335431"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f905fb66d00.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/tfrecords/splits
['input_config'],"{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train.tfrecord""  },  {  ""name"": ""eval"",  ""pattern"": ""eval.tfrecord""  },  {  ""name"": ""test"",  ""pattern"": ""test.tfrecord""  }  ] }"
['output_config'],{}
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:train,num_files:1,total_bytes:24762731,xor_checksum:1673335398,sum_checksum:1673335398 split:eval,num_files:1,total_bytes:24762731,xor_checksum:1673335414,sum_checksum:1673335414 split:test,num_files:1,total_bytes:24762731,xor_checksum:1673335431,sum_checksum:1673335431"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f905fb66d00.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8) at 0x7f90617beeb0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T06_37_47.936270-f9j737lv/ImportExampleGen/examples/8
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


## 스팬(span): 데이터의 스냅샷

In [5]:
span_dir = os.path.join(data_dir, 'span')

# SPAN 자리 표시자를 사용해서 항상 최신의 스팬을 불러오도록 설정한다.
# export-0 < export-1 < export-2 순으로 작은 숫자는 큰 숫자 데이터의 부분집합이다.
input_config = example_gen_pb2.Input(
    splits=[example_gen_pb2.Input.Split(pattern="export-{SPAN}/*")]
)

span_example_gen = CsvExampleGen(input_base=span_dir, input_config=input_config)
context.run(span_example_gen)
context.show(span_example_gen)  # 'input_config'에서 최신 스팬을 받은 것을 확인.



0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f6a6c207340.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1) at 0x7f6a84d767c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/span['input_config']{  ""splits"": [  {  ""pattern"": ""export-3/*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']3['version']None['input_fingerprint']split:,num_files:1,total_bytes:8402,xor_checksum:1673346417,sum_checksum:1673346417"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f6a6c207340.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1) at 0x7f6a84d767c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1) at 0x7f6a84d767c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1) at 0x7f6a84d767c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/span
['input_config'],"{  ""splits"": [  {  ""pattern"": ""export-3/*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],3
['version'],
['input_fingerprint'],"split:,num_files:1,total_bytes:8402,xor_checksum:1673346417,sum_checksum:1673346417"


## 데이터셋 버저닝(Dataset versioning)
  - [DVC](https://dvc.org/)
  - [Pachyderm](https://www.pachyderm.com/)

외부 도구 사용 시 완전한 파이프라인을 위해선 MLMD에 메타데이터를 저장할 수 있는지 여부를 확인해야 한다.

## 이미지 데이터 입력

In [23]:
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def generate_label(table: dict, filepath):
    """

    :param filepath: 파일명 라벨의 역할을 한다고 가정한다.
    :param table: str 라벨에 대응하는 int 값을 기술한 dict.
    :return: `filepath`를 파싱한 라벨.
    """
    filename = os.path.basename(filepath)
    label = os.path.splitext(filename)[0]

    return table[label]


# 주의: 통상 이미지는 압축된 상태로 이를 디코딩하면 급격한 메모리, 디스크 부족 상태가 될 수 있음.
def write_img_tfrecord(indir, outpath, label_gen_fn):
    """분류 작업을 위한 이미지 데이터셋 생성

    :param indir: 이미지만 담고 있는 디렉토리. 이미지들은 모두 `label`을 파일명으로 지닌다 가정.
    :param outpath: tfrecord을 생성할 경로.
    :param label_gen_fn: 정수형으로 인코딩된 라벨을 반환하는 함수.
    """
    filenames = os.scandir(indir)

    with tf.io.TFRecordWriter(outpath) as writer:
        for img_path in filenames:
            try:
                print(f"처리 중: {img_path.path}")
                raw_file = tf.io.read_file(img_path.path)
            except FileNotFoundError:
                print(f"{img_path.name}은 존재하지 않습니다.")
                continue

            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        'raw_image': _bytes_feature(raw_file.numpy()),
                        'label': _int64_feature(label_gen_fn(img_path))
                    }
                )
            )

            writer.write(example.SerializeToString())

In [24]:
from functools import partial

In [31]:
img_dir = os.path.join(data_dir, "images")
record_path = os.path.join(data_dir, "animal_records", "samples.tfrecord")
enc_tab = {
    'dog': 0,
    'cat': 1,
    'parrot': 2
}
generate_encoded_label = partial(generate_label, enc_tab)

In [32]:
write_img_tfrecord(img_dir, record_path, generate_encoded_label)

처리 중: /home/hakjun/projects/pipeline/data/images/dog.jpg
처리 중: /home/hakjun/projects/pipeline/data/images/parrot.jpg
처리 중: /home/hakjun/projects/pipeline/data/images/cat.jpg


In [37]:
# tfrecord 생성에 성공했는지 읽어서 확인.
rec_example_gen = ImportExampleGen(input_base=str(Path(record_path).parent))
context.run(rec_example_gen)
context.show(rec_example_gen)

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f6a51597730.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3) at 0x7f6a68ae27c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/animal_records['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:917346,xor_checksum:1673350567,sum_checksum:1673350567"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f6a51597730.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3) at 0x7f6a68ae27c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3) at 0x7f6a68ae27c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3) at 0x7f6a68ae27c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-10T10_12_19.691034-9ridu_hl/ImportExampleGen/examples/3
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/animal_records
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:917346,xor_checksum:1673350567,sum_checksum:1673350567"
