# 데이터 검증(Data validation)

1. 무엇을 하는 단계인가:
  - 이상치 검출
  - 스키마 변경 감지
  - 데이터 드리프트를 유발시키는 데이터셋 간(train-eval, span ver1-ver2 etc.) 통계 변경 감지

2. TFDV(TensorFlow Data Validation)
  - TFDV의 시각화는 [Facets](https://pair-code.github.io/facets/) 프로젝트에 기반
  - TFDV가 받아들일 수 있는 파일 입력 형식은 CSV와 TFRecord 두 가지(메모리 상 객체론 ExampleGen이 생성한 아티팩트를 파이프라인 다음 단계로서 받아들인다)

In [42]:
import os

import pandas as pd
import tensorflow_data_validation as tfdv
# 관심 슬라이스(SQL `GROUP BY` 같은 거)별로 데이터셋을 묶기 위한 유틸.
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0 import statistics_pb2 as stats_pb2
# 머신러닝 파이프라인 일부로서 데이터 검증을 위한 컴포넌트들
from tfx.components import (ImportExampleGen, StatisticsGen, SchemaGen,
                            ExampleValidator,)
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

In [2]:
# All dataset locations are resolved relative to the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
complaints_dir = os.path.join(data_dir, "complaints")
# TFRecord file located directly under the complaints directory.
raw_path = os.path.join(complaints_dir, "consumer-complaints.tfrecord")
# CSV file that is loaded into a DataFrame with pandas in a later cell.
processed_path = os.path.join(complaints_dir, "processed", "processed-complaints.csv")


In [3]:
df = pd.read_csv(processed_path)

In [4]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-05-09,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,XXXX of XXXX complaints regarding same company...,Company has responded to the consumer and the ...,Specialized Loan Servicing Holdings LLC,CA,95472,,Consent provided,Web,2016-05-09,Closed with explanation,Yes,0,1915601
1,2016-06-01,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,"My Mortgage Company, Ditech, failed to apply p...",Company believes complaint is the result of an...,Ditech Financial LLC,SC,29414,,Consent provided,Web,2016-06-01,Closed with explanation,Yes,1,1950363
2,2017-04-18,Mortgage,FHA mortgage,"Loan modification,collection,foreclosure",,My husband and I are going through a financial...,,"PENNYMAC LOAN SERVICES, LLC.",AZ,85338,,Consent provided,Web,2017-04-18,Closed with explanation,Yes,0,2440537
3,2017-04-17,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,In XX/XX/2016 I contacted ACS regarding incorr...,,ACS Education Services,CA,92223,,Consent provided,Web,2017-04-17,Closed with explanation,Yes,0,2438881
4,2017-04-21,Student loan,Non-federal student loan,Dealing with my lender or servicer,Trouble with how payments are handled,This complaint is further evidence tha t Navie...,,"Navient Solutions, LLC.",TX,78746,,Consent provided,Web,2017-04-21,Closed with explanation,Yes,0,2446956


In [3]:
# TODO(me): running `generate_statistics_from_csv` keeps restarting the Jupyter
# kernel on the server, so the CSV-based call below stays disabled.
# stats = tfdv.generate_statistics_from_csv(
#     data_location=processed_path,  # Unlike ExampleGen, this must point at a single file.
#     delimiter=','
# )

# The TFRecord feature names are the processed-complaints.csv column names
# lowercased, with spaces replaced by underscores and some words dropped, so the
# TFRecord is read here instead of the DataFrame.
full_records = os.path.join(complaints_dir, "records",
                            "consumer-complaints.tfrecord")
stats = tfdv.generate_statistics_from_tfrecord(data_location=full_records)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [4]:
stats  # 각 피처에 대한 통계를 볼 수 있다.

datasets {
  num_examples: 66657
  features {
    type: STRING
    string_stats {
      common_stats {
        num_non_missing: 66657
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6665.7
          }
          buckets {
            low

## 데이터셋 스키마(Schema)
의미: 데이터셋 표현을 정의한 구조.
구성: 피처, 각 피처의 데이터형과 데이터 범위를 정의해야 한다(스키마 객체를 생성해 살펴보면 알 수 있다).
용도: 데이터셋 검증 외에도 데이터 전처리 시에도 필요.

[schema protobuf 정의](https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto)

In [5]:
schema = tfdv.infer_schema(stats)

In [6]:
schema

feature {
  name: "company"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "company_response"
  type: BYTES
  domain: "company_response"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "consumer_disputed"
  type: FLOAT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "issue"
  type: BYTES
  domain: "issue"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "product"
  type: BYTES
  domain: "product"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "state"
  type: BYTES
  domain: "state"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "sub_issue"
  type

In [7]:
tfdv.display_schema(schema)  # Visualization that uses facets as its backend.
# Presence: whether the feature is required, i.e. whether NaN is allowed. This is
#       not a requirement set by the data user; it is what TFDV inferred.
# Valency: how many values the feature must have in a single training example.
#       A categorical feature that is "single" must hold exactly one category
#       (it cannot be cat and dog at the same time).
#       valency=3 means the feature of an example looks like [1, 2, 3].

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'company',BYTES,required,,-
'company_response',STRING,required,,'company_response'
'consumer_disputed',FLOAT,required,,-
'issue',STRING,required,,'issue'
'product',STRING,required,,'product'
'state',STRING,required,,'state'
'sub_issue',STRING,required,,'sub_issue'
'sub_product',STRING,required,,'sub_product'
'timely_response',STRING,required,,'timely_response'
'zip_code',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'company_response',"'Company believes complaint caused principally by actions of third party outside the control or direction of the company', 'Company believes complaint is the result of an isolated error', 'Company believes complaint relates to a discontinued policy or procedure', 'Company believes complaint represents an opportunity for improvement to better serve consumers', 'Company believes it acted appropriately as authorized by contract or law', 'Company believes the complaint is the result of a misunderstanding', 'Company can\'t verify or dispute the facts in the complaint', 'Company chooses not to provide a public response', 'Company disputes the facts presented in the complaint', 'Company has responded to the consumer and the CFPB and chooses not to provide a public response', 'nan'"
'issue',"'APR or interest rate', 'Account opening, closing, or management', 'Account terms and changes', 'Adding money', 'Advertising and marketing', 'Advertising, marketing or disclosures', 'Application processing delay', 'Application, originator, mortgage broker', 'Applied for loan/did not receive money', 'Arbitration', 'Balance transfer', 'Balance transfer fee', 'Bankruptcy', 'Billing disputes', 'Billing statement', 'Can\'t contact lender', 'Can\'t repay my loan', 'Can\'t stop charges to bank account', 'Cash advance', 'Cash advance fee', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Closing/Cancelling account', 'Communication tactics', 'Cont\'d attempts collect debt not owed', 'Convenience checks', 'Credit card protection / Debt protection', 'Credit decision / Underwriting', 'Credit determination', 'Credit line increase/decrease', 'Credit monitoring or identity protection', 'Credit reporting company\'s investigation', 'Customer service / Customer relations', 'Customer service/Customer relations', 'Dealing with my lender or servicer', 'Delinquent account', 'Deposits and withdrawals', 'Disclosure verification of debt', 'Disclosures', 'Excessive fees', 'False statements or representation', 'Fees', 'Forbearance / Workout plans', 'Fraud or scam', 'Getting a loan', 'Identity theft / Fraud / Embezzlement', 'Improper contact or sharing of info', 'Improper use of my credit report', 'Incorrect exchange rate', 'Incorrect information on credit report', 'Incorrect/missing disclosures or info', 'Late fee', 'Lender damaged or destroyed vehicle', 'Lender repossessed or sold the vehicle', 'Lender sold the property', 'Loan modification,collection,foreclosure', 'Loan servicing, payments, escrow account', 'Lost or stolen check', 'Lost or stolen money order', 'Making/receiving payments, sending money', 'Managing the line of credit', 'Managing the loan or lease', 'Managing, opening, or closing account', 'Money was not available when promised', 
'Other', 'Other fee', 'Other service issues', 'Other transaction issues', 'Overdraft, savings or rewards features', 'Overlimit fee', 'Payment to acct not credited', 'Payoff process', 'Privacy', 'Problems caused by my funds being low', 'Problems when you are unable to pay', 'Received a loan I didn\'t apply for', 'Rewards', 'Sale of account', 'Settlement process and costs', 'Shopping for a line of credit', 'Shopping for a loan or lease', 'Taking out the loan or lease', 'Taking/threatening an illegal action', 'Transaction issue', 'Unable to get credit report/credit score', 'Unauthorized transactions/trans. issues', 'Unexpected/Other fees', 'Unsolicited issuance of credit card', 'Using a debit or ATM card', 'Wrong amount charged or received'"
'product',"'Bank account or service', 'Consumer Loan', 'Credit card', 'Credit reporting', 'Debt collection', 'Money transfers', 'Mortgage', 'Other financial service', 'Payday loan', 'Prepaid card', 'Student loan', 'Virtual currency'"
'state',"'AA', 'AE', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MH', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY', 'nan'"
'sub_issue',"'Account status', 'Account terms', 'Account terms and changes', 'Attempted to collect wrong amount', 'Attempted to/Collected exempt funds', 'Billing dispute', 'Called after sent written cease of comm', 'Called outside of 8am-9pm', 'Can\'t decrease my monthly payments', 'Can\'t get flexible payment options', 'Can\'t qualify for a loan', 'Can\'t temporarily postpone payments', 'Contacted employer after asked not to', 'Contacted me after I asked not to', 'Contacted me instead of my attorney', 'Debt is not mine', 'Debt resulted from identity theft', 'Debt was discharged in bankruptcy', 'Debt was paid', 'Don\'t agree with fees charged', 'Frequent or repeated calls', 'Having problems with customer service', 'Impersonated an attorney or official', 'Inadequate help over the phone', 'Indicated committed crime not paying', 'Indicated shouldn\'t respond to lawsuit', 'Information is not mine', 'Investigation took too long', 'Keep getting calls about my loan', 'Need information about my balance/terms', 'No notice of investigation status/result', 'Not disclosed as an attempt to collect', 'Not given enough info to verify debt', 'Personal information', 'Problem cancelling or closing account', 'Problem getting my free annual report', 'Problem getting report or credit score', 'Problem with fraud alerts', 'Problem with statement of dispute', 'Public record', 'Qualify for a better loan than offered', 'Received bad information about my loan', 'Received marketing offer after opted out', 'Receiving unwanted marketing/advertising', 'Reinserted previously deleted info', 'Report improperly shared by CRC', 'Report shared with employer w/o consent', 'Right to dispute notice not received', 'Seized/Attempted to seize property', 'Sued w/o proper notification of suit', 'Sued where didn\'t live/sign for debt', 'Talked to a third party about my debt', 'Threatened arrest/jail if do not pay', 'Threatened to sue on too old debt', 'Threatened to take legal action', 'Trouble with how 
payments are handled', 'Used obscene/profane/abusive language', 'nan'"
'sub_product',"'(CD) Certificate of deposit', 'Auto', 'Cashing a check without an account', 'Check cashing', 'Checking account', 'Conventional adjustable mortgage (ARM)', 'Conventional fixed mortgage', 'Credit card', 'Credit repair', 'Debt settlement', 'Domestic (US) money transfer', 'Electronic Benefit Transfer / EBT card', 'FHA mortgage', 'Federal student loan', 'Federal student loan servicing', 'Foreign currency exchange', 'General purpose card', 'Gift or merchant card', 'Government benefit payment card', 'Home equity loan or line of credit', 'I do not know', 'ID prepaid card', 'Installment loan', 'International money transfer', 'Medical', 'Mobile wallet', 'Money order', 'Mortgage', 'Non-federal student loan', 'Other (i.e. phone, health club, etc.)', 'Other bank product/service', 'Other mortgage', 'Other special purpose card', 'Pawn loan', 'Payday loan', 'Payroll card', 'Personal line of credit', 'Refund anticipation check', 'Reverse mortgage', 'Savings account', 'Title loan', 'Transit card', 'Traveler’s/Cashier’s checks', 'VA mortgage', 'Vehicle lease', 'Vehicle loan', 'nan'"
'timely_response',"'No', 'Yes'"


### 스키마로 두 데이터셋 간 비교

In [11]:
# Locations of the train and eval TFRecord splits.
record_dir = os.path.join(data_dir, 'complaints', 'splits')
train_record_path = os.path.join(record_dir, 'train', 'train.tfrecord')
eval_record_path = os.path.join(record_dir, 'eval', 'eval.tfrecord')

# Compute statistics for each split separately so they can be compared below.
train_stats = tfdv.generate_statistics_from_tfrecord(data_location=train_record_path)
eval_stats = tfdv.generate_statistics_from_tfrecord(data_location=eval_record_path)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [12]:
# lhs: left-hand side, rhs: right-hand side
# Parts of a dataset that are judged to be anomalous are highlighted.
tfdv.visualize_statistics(lhs_statistics=train_stats, lhs_name='Trainset',
                          rhs_statistics=eval_stats, rhs_name='Validset')

In [20]:
# Check, using the schema as the reference, whether the statistics generated
# from the dataset contain anomalies.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)

In [21]:
tfdv.display_anomalies(anomalies)

In [22]:
# The schema can also be serialized and saved to a file.
# NOTE(review): the file name "complaints_schem" looks truncated — confirm the intended name.
tfdv.write_schema_text(schema, "metadata/complaints_schem")

In [23]:
# Updating the schema: the settings of a feature can be adjusted.
sub_issue_feat = tfdv.get_feature(schema, "sub_issue")
# Constrain the feature to be present in at least 90% of the examples.
sub_issue_feat.presence.min_fraction = 0.9

In [37]:
# A "domain" enumerates a feature's value space, i.e. which values the feature takes in the dataset.
state_domain = tfdv.get_domain(schema, "state")
# Remove one specific value from the domain.
state_domain.value.remove("AK")

In [39]:
# The schema was updated so that the Alaska ("AK") value is no longer allowed in the
# 'state' column, so that anomaly is now detected in the eval dataset.
updated_anomalies = tfdv.validate_statistics(eval_stats, schema=schema)
tfdv.display_anomalies(updated_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'state',Unexpected string values,Examples contain values missing from the schema: AK (<1%).


### Data skew & drift
데이터 스큐: 두 데이터셋 간 통계 분포의 큰 차이를 감지. 이는 통계학에서의 skew(왜도)의 정의(분포의
비대칭 정도)와 다르다. TFDV의 내장 스큐 비교기는 학습 통계(`statistics`)와 서빙 통계(`serving_statistics`)
간 차이에 대한 L-infinity norm을 산출한다. 스큐의 임계값 초과 여부는 TFDV가 이상치 검사로 다루는 한 종류이다.

L-infinity norm: 두 벡터 간 차이를, 차이 벡터 원소들의 절댓값 중 최댓값으로 정의.
```Python
import numpy as np

v1, v2 = np.array([5, 7, -1]), np.array([3, -5, 13])
diff = v1 - v2  # [2, 12, -14]
l_infinity_norm = max(np.abs(diff))  # 14
```

스큐 비교기에서 두 벡터는 두 데이터셋의 요약 통계이다.


In [40]:
company_feat = tfdv.get_feature(schema, 'company')

In [43]:
# Configure the feature so that exceeding the threshold of 0.01 is reported as an anomaly.
company_feat.skew_comparator.infinity_norm.threshold = 0.01
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=eval_stats)  # Argument required for detecting skew anomalies.

In [48]:
skew_anomalies.anomaly_info

{'state': description: "Examples contain values missing from the schema: AK (<1%). "
severity: ERROR
short_description: "Unexpected string values"
reason {
  type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
  short_description: "Unexpected string values"
  description: "Examples contain values missing from the schema: AK (<1%). "
}
path {
  step: "state"
}
}

#### Data drift

In [55]:
span_dir = os.path.join(data_dir, "span")
# Build <span_dir>/export-{i}/ver{i}.csv for i = 1..N, where N is the number of
# entries in span_dir (assumes every entry is an export-{i} directory — TODO confirm).
span_files = [os.path.join(span_dir, f"export-{i}", f"ver{i}.csv") for i
              in range(1, len(os.listdir(span_dir)) + 1)]

# One statistics object per CSV, in the same order as span_files (index 0 is ver1).
span_stats = [tfdv.generate_statistics_from_csv(file) for file in span_files]
# The schema is inferred from the third entry (ver3); this needs at least three spans.
span_schema = tfdv.infer_schema(span_stats[2])

In [56]:
span_schema.feature

[name: "LatD"
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"LatM\""
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"LatS\""
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"NS\""
type: BYTES
domain: " \"NS\""
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"LonD\""
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"LonM\""
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"LonS\""
type: INT
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"EW\""
type: BYTES
domain: " \"EW\""
presence {
  min_fraction: 1.0
  min_count: 1
}
shape {
  dim {
    size: 1
  }
}
, name: " \"City\""
type: BYTES
presence {
  min_fraction: 1.0
  min_count

In [57]:
# Check the "LatD" feature for drift.
# The comparator must be configured on `span_schema`: that is the schema passed
# to `validate_statistics` below and the one that contains the "LatD" feature.
# Setting it on the complaints `schema` would leave the threshold unused here.
lat_d_feat = tfdv.get_feature(span_schema, "LatD")
lat_d_feat.drift_comparator.infinity_norm.threshold = 0.01
drift_anomalies = tfdv.validate_statistics(
    statistics=span_stats[2],  # The ver3 dataset (index 2) ...
    schema=span_schema,
    previous_statistics=span_stats[1]  # ... is compared against ver2 (index 1) for drift.
)

### 이외 살펴볼 문제: 편향(bias)
편향: 데이터가 현실을 제대로 반영하지 못하는 문제
선택 편향: 편향된 표본을 선택

참조: [구글 머신러닝 개발자 과정](https://developers.google.com/machine-learning/crash-course/fairness/types-of-bias)

### Slicing
SQL에서 각 행을 `GROUP BY species = "mammal"` 이런 식으로 묶을 수 있듯이,
slicer 함수를 정의하여 각 example들도 슬라이싱별로 묶을 수 있다.

In [8]:
# Slice function keyed on the "state" feature with the value b"CA".
slice_by_california_fn = slicing_util.get_feature_value_slicer(
    features={"state": [b"CA"]}  # Values for the key feature must be given as a list of bytes.
)

# Pass the slice function through the statistics options so that statistics
# are generated with it.
# NOTE(review): raw_path points directly under complaints_dir, whereas the earlier
# statistics cell reads complaints_dir/records/consumer-complaints.tfrecord — confirm
# that this is the intended file.
slice_options = tfdv.StatsOptions(slice_functions=[slice_by_california_fn])
slice_stats = tfdv.generate_statistics_from_tfrecord(
    data_location=raw_path,
    stats_options=slice_options
)

In [23]:
def get_slice_keys(stats):
    """Return the names of the slices contained in a statistics object.

    :param stats: object exposing a ``datasets`` iterable whose items have a
        ``name`` attribute.
    :return: list with the ``name`` of every entry of ``stats.datasets``,
        in their original order.
    """
    # Read from the argument: the previous version ignored ``stats`` and always
    # used the notebook-level ``slice_stats`` variable.
    return [dataset.name for dataset in stats.datasets]


def display_slice_keys(stats):
    """Print the slice names available in ``stats``.

    :param stats: statistics object that is handed to ``get_slice_keys``.
    """
    # The previous version called ``get_sliced_stats(stats)`` without its
    # required ``slice_key`` argument, which raises TypeError; listing the keys
    # is what this helper is meant to do.
    print(get_slice_keys(stats))


def get_sliced_stats(stats, slice_key):
    """Extract the statistics of one slice into its own statistics list.

    :param stats: statistics object that is assumed to be sliced.
    :param slice_key: one of the names found in `stats.datasets`.
    :return: a new `DatasetFeatureStatisticsList` whose only dataset is a copy
        of the first dataset named `slice_key`.
    :raises IndexError: if no dataset in `stats.datasets` is named `slice_key`.
    """
    matching = [dataset for dataset in stats.datasets
                if dataset.name == slice_key]
    selected = matching[0]

    # Wrap the selected dataset in a fresh container instead of returning a
    # reference into the original statistics.
    single_slice = stats_pb2.DatasetFeatureStatisticsList()
    single_slice.datasets.add().CopyFrom(selected)
    return single_slice


def compare_slices(stats, slice_key1, slice_key2):
    """Visualize the statistics of two slices side by side.

    :param stats: statistics object that is assumed to be sliced.
    :param slice_key1: name of the slice shown on the left-hand side.
    :param slice_key2: name of the slice shown on the right-hand side.
    """
    left, right = (get_sliced_stats(stats, key)
                   for key in (slice_key1, slice_key2))
    tfdv.visualize_statistics(lhs_statistics=left, rhs_statistics=right)

In [24]:
ca_stats = get_sliced_stats(slice_stats, "state_CA")

In [25]:
compare_slices(slice_stats, *get_slice_keys(slice_stats))

## 파이프라인 일부로서 TFDV
지금까지는 TFDV 라이브러리를 사용하여 독립실행형 데이터 검증을 실시했다. 이제 이 과정을 TFX 컴포넌트로서
파이프라인을 구축한다. 데이터가 통과하는 컴포넌트 순서는 다음과 같다. 각 컴포넌트는 대화형 컨텍스트
객체가 관리하며 완전한 파이프라인은 각 컨텍스트를 오케스트레이션 도구로 자동 관리한다.
1. ExampleGen
2. StatisticsGen
3. SchemaGen
4. ExampleValidator

In [28]:
context = InteractiveContext()



In [33]:
# First, the data has to be ingested from the source (a TFRecord in this case).
example_gen = ImportExampleGen(
    input_base=os.path.join(complaints_dir, "records"))
context.run(example_gen)
context.show(example_gen)

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7ff5482c8790.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/home/hakjun/projects/pipeline/data/complaints/records['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:24762731,xor_checksum:1673334870,sum_checksum:1673334870"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7ff5482c8790.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/home/hakjun/projects/pipeline/data/complaints/records
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:24762731,xor_checksum:1673334870,sum_checksum:1673334870"


In [38]:
example_artifact = example_gen.outputs["examples"]

In [35]:
# Next, generate the statistics used for validation.
statistics_gen = StatisticsGen(examples=example_artifact)
context.run(statistics_gen)
context.show(statistics_gen)  # Shows information about the object itself, not the statistics content.

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7ff5482c8790.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"
.exec_properties,['stats_options_json']None['exclude_splits'][]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7ff5482c8790.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2) at 0x7ff5482c8af0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ImportExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3
.span,0
.split_names,"[""train"", ""eval""]"

0,1
['stats_options_json'],
['exclude_splits'],[]


In [40]:
# Statistics are an artifact produced by the component's executor, so they
# have to be inspected through the component's `outputs` attribute.
# The channel is kept in a variable because the SchemaGen and ExampleValidator
# cells further down take it as their `statistics` input.
statistics_artifact = statistics_gen.outputs["statistics"]
context.show(statistics_artifact)

In [49]:
# A schema is needed for data validation and also for folding data
# preprocessing into the model graph.
# This component generates a new schema only when the pipeline has none yet.
# If needed, adjust the schema right after it is generated so that the adjusted
# version can be used throughout the rest of the pipeline.
# NOTE(review): `infer_feature_shape=True` presumably asks SchemaGen to infer
# feature shapes from the statistics — confirm against the TFX documentation.
schema_gen = SchemaGen(statistics=statistics_artifact, infer_feature_shape=True)
context.run(schema_gen)
context.show(schema_gen)

0,1
.inputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"
.outputs,['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7ff55cc5bf40.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4
.exec_properties,['infer_feature_shape']1['exclude_splits'][]

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3
.span,0
.split_names,"[""train"", ""eval""]"

0,1
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7ff55cc5bf40.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
['infer_feature_shape'],1
['exclude_splits'],[]


In [52]:
# Take the "schema" output channel of SchemaGen and render it in the notebook;
# the same channel is passed to ExampleValidator as its `schema` input.
schema_artifact = schema_gen.outputs["schema"]
context.show(schema_artifact)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'company',BYTES,required,,-
'company_response',STRING,required,,'company_response'
'consumer_disputed',FLOAT,required,,-
'issue',STRING,required,,'issue'
'product',STRING,required,,'product'
'state',STRING,required,,'state'
'sub_issue',STRING,required,,'sub_issue'
'sub_product',STRING,required,,'sub_product'
'timely_response',STRING,required,,'timely_response'
'zip_code',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'company_response',"'Company believes complaint caused principally by actions of third party outside the control or direction of the company', 'Company believes complaint is the result of an isolated error', 'Company believes complaint relates to a discontinued policy or procedure', 'Company believes complaint represents an opportunity for improvement to better serve consumers', 'Company believes it acted appropriately as authorized by contract or law', 'Company believes the complaint is the result of a misunderstanding', 'Company can\'t verify or dispute the facts in the complaint', 'Company chooses not to provide a public response', 'Company disputes the facts presented in the complaint', 'Company has responded to the consumer and the CFPB and chooses not to provide a public response', 'nan'"
'issue',"'APR or interest rate', 'Account opening, closing, or management', 'Account terms and changes', 'Adding money', 'Advertising and marketing', 'Advertising, marketing or disclosures', 'Application processing delay', 'Application, originator, mortgage broker', 'Applied for loan/did not receive money', 'Arbitration', 'Balance transfer', 'Balance transfer fee', 'Bankruptcy', 'Billing disputes', 'Billing statement', 'Can\'t contact lender', 'Can\'t repay my loan', 'Can\'t stop charges to bank account', 'Cash advance', 'Cash advance fee', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Closing/Cancelling account', 'Communication tactics', 'Cont\'d attempts collect debt not owed', 'Convenience checks', 'Credit card protection / Debt protection', 'Credit decision / Underwriting', 'Credit determination', 'Credit line increase/decrease', 'Credit monitoring or identity protection', 'Credit reporting company\'s investigation', 'Customer service / Customer relations', 'Customer service/Customer relations', 'Dealing with my lender or servicer', 'Delinquent account', 'Deposits and withdrawals', 'Disclosure verification of debt', 'Disclosures', 'Excessive fees', 'False statements or representation', 'Fees', 'Forbearance / Workout plans', 'Fraud or scam', 'Getting a loan', 'Identity theft / Fraud / Embezzlement', 'Improper contact or sharing of info', 'Improper use of my credit report', 'Incorrect exchange rate', 'Incorrect information on credit report', 'Incorrect/missing disclosures or info', 'Late fee', 'Lender repossessed or sold the vehicle', 'Lender sold the property', 'Loan modification,collection,foreclosure', 'Loan servicing, payments, escrow account', 'Lost or stolen check', 'Lost or stolen money order', 'Making/receiving payments, sending money', 'Managing the line of credit', 'Managing the loan or lease', 'Managing, opening, or closing account', 'Money was not available when promised', 'Other', 'Other fee', 'Other service 
issues', 'Other transaction issues', 'Overdraft, savings or rewards features', 'Overlimit fee', 'Payment to acct not credited', 'Payoff process', 'Privacy', 'Problems caused by my funds being low', 'Problems when you are unable to pay', 'Received a loan I didn\'t apply for', 'Rewards', 'Sale of account', 'Settlement process and costs', 'Shopping for a line of credit', 'Shopping for a loan or lease', 'Taking out the loan or lease', 'Taking/threatening an illegal action', 'Transaction issue', 'Unable to get credit report/credit score', 'Unauthorized transactions/trans. issues', 'Unexpected/Other fees', 'Unsolicited issuance of credit card', 'Using a debit or ATM card', 'Wrong amount charged or received', 'Lender damaged or destroyed vehicle'"
'product',"'Bank account or service', 'Consumer Loan', 'Credit card', 'Credit reporting', 'Debt collection', 'Money transfers', 'Mortgage', 'Other financial service', 'Payday loan', 'Prepaid card', 'Student loan', 'Virtual currency'"
'state',"'AA', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MH', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'WA', 'WI', 'WV', 'WY', 'nan', 'AE', 'VT'"
'sub_issue',"'Account status', 'Account terms', 'Account terms and changes', 'Attempted to collect wrong amount', 'Attempted to/Collected exempt funds', 'Billing dispute', 'Called after sent written cease of comm', 'Called outside of 8am-9pm', 'Can\'t decrease my monthly payments', 'Can\'t get flexible payment options', 'Can\'t qualify for a loan', 'Can\'t temporarily postpone payments', 'Contacted employer after asked not to', 'Contacted me after I asked not to', 'Contacted me instead of my attorney', 'Debt is not mine', 'Debt resulted from identity theft', 'Debt was discharged in bankruptcy', 'Debt was paid', 'Don\'t agree with fees charged', 'Frequent or repeated calls', 'Having problems with customer service', 'Impersonated an attorney or official', 'Inadequate help over the phone', 'Indicated committed crime not paying', 'Indicated shouldn\'t respond to lawsuit', 'Information is not mine', 'Investigation took too long', 'Keep getting calls about my loan', 'Need information about my balance/terms', 'No notice of investigation status/result', 'Not disclosed as an attempt to collect', 'Not given enough info to verify debt', 'Personal information', 'Problem cancelling or closing account', 'Problem getting my free annual report', 'Problem getting report or credit score', 'Problem with fraud alerts', 'Problem with statement of dispute', 'Public record', 'Qualify for a better loan than offered', 'Received bad information about my loan', 'Received marketing offer after opted out', 'Receiving unwanted marketing/advertising', 'Reinserted previously deleted info', 'Report improperly shared by CRC', 'Report shared with employer w/o consent', 'Right to dispute notice not received', 'Seized/Attempted to seize property', 'Sued w/o proper notification of suit', 'Sued where didn\'t live/sign for debt', 'Talked to a third party about my debt', 'Threatened arrest/jail if do not pay', 'Threatened to sue on too old debt', 'Threatened to take legal action', 'Trouble with how 
payments are handled', 'Used obscene/profane/abusive language', 'nan'"
'sub_product',"'(CD) Certificate of deposit', 'Auto', 'Cashing a check without an account', 'Check cashing', 'Checking account', 'Conventional adjustable mortgage (ARM)', 'Conventional fixed mortgage', 'Credit card', 'Credit repair', 'Debt settlement', 'Domestic (US) money transfer', 'Electronic Benefit Transfer / EBT card', 'FHA mortgage', 'Federal student loan', 'Federal student loan servicing', 'Foreign currency exchange', 'General purpose card', 'Gift or merchant card', 'Government benefit payment card', 'Home equity loan or line of credit', 'I do not know', 'ID prepaid card', 'Installment loan', 'International money transfer', 'Medical', 'Mobile wallet', 'Money order', 'Mortgage', 'Non-federal student loan', 'Other (i.e. phone, health club, etc.)', 'Other bank product/service', 'Other mortgage', 'Other special purpose card', 'Pawn loan', 'Payday loan', 'Payroll card', 'Personal line of credit', 'Refund anticipation check', 'Reverse mortgage', 'Savings account', 'Title loan', 'Transit card', 'Traveler’s/Cashier’s checks', 'VA mortgage', 'Vehicle lease', 'Vehicle loan', 'nan'"
'timely_response',"'No', 'Yes'"


In [51]:
# Dataset validation: check the statistics against the configured (and possibly
# updated) schema. The resulting anomaly verdicts, which depend on settings such
# as feature properties and skew/drift thresholds, can then be inspected.
example_validator = ExampleValidator(statistics=statistics_artifact,
                                     schema=schema_artifact)
context.run(example_validator)
context.show(example_validator)

0,1
.inputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7ff55cc5bf40.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 
'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4"
.outputs,"['anomalies'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleAnomalies' (1 artifact) at 0x7ff55cc2f2e0.type_nameExampleAnomalies._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleAnomalies' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8) at 0x7ff55cc5baf0.type<class 'tfx.types.standard_artifacts.ExampleAnomalies'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8.span0.split_names[""train"", ""eval""]"
.exec_properties,['exclude_splits'][]['custom_validation_config']None

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7ff54812e760.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7ff55cc5bf40.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3) at 0x7ff5481292e0.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/StatisticsGen/statistics/3
.span,0
.split_names,"[""train"", ""eval""]"

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4) at 0x7ff55caaf160.type<class 'tfx.types.standard_artifacts.Schema'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/SchemaGen/schema/4

0,1
['anomalies'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleAnomalies' (1 artifact) at 0x7ff55cc2f2e0.type_nameExampleAnomalies._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleAnomalies' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8) at 0x7ff55cc5baf0.type<class 'tfx.types.standard_artifacts.ExampleAnomalies'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleAnomalies
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleAnomalies' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8) at 0x7ff55cc5baf0.type<class 'tfx.types.standard_artifacts.ExampleAnomalies'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleAnomalies' (uri: /tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8) at 0x7ff55cc5baf0.type<class 'tfx.types.standard_artifacts.ExampleAnomalies'>.uri/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleAnomalies'>
.uri,/tmp/tfx-interactive-2023-01-11T06_57_02.948885-ch4kwyky/ExampleValidator/anomalies/8
.span,0
.split_names,"[""train"", ""eval""]"

0,1
['exclude_splits'],[]
['custom_validation_config'],


In [53]:
# Take the 'anomalies' output channel of ExampleValidator and render it in the
# notebook to inspect the validation result.
anomalies_artifact = example_validator.outputs['anomalies']
context.show(anomalies_artifact)