In [3]:
# Import libraries
import os
import requests
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split
# Report the library versions this notebook was executed with.
for label, version in (
    ('TF version:', tf.__version__),
    ('TFDV version:', tfdv.version.__version__),
):
    print(label, version)

TF version: 2.11.0
TFDV version: 1.13.0.dev20230115


In [4]:
## Download data
# Dataset page: http://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008#
# The download is skipped when the CSV is already present in the working directory.
if not os.path.isfile('diabetic_data.csv'):
    url = 'https://docs.google.com/uc?export=download&id=1k5-1caezQ3zWJbKaiMULTGq-3sz6uThC'
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error: otherwise the error page would be saved as the
    # CSV and the isfile() guard above would prevent any later re-download.
    r.raise_for_status()
    # Context manager guarantees the file is flushed and closed before it is read.
    with open('diabetic_data.csv', 'wb') as f:
        f.write(r.content)

In [5]:
# Read the downloaded CSV into a DataFrame (no column is used as the index)
# and preview the first rows.
df = pd.read_csv(
    'diabetic_data.csv',
    index_col=False,
)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
# List every column name in the loaded DataFrame.
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [7]:
# Set aside the rows whose race is 'Asian'; they are reused later as serving data.
# Every other row (including the '?' placeholder) goes to the train/eval pool.
is_asian = df['race'] == 'Asian'
df_with_Asian = df[is_asian]
df_without_Asian = df[~is_asian]

In [8]:
# Preview the first rows of the held-out 'Asian' subset.
df_with_Asian.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
200,2655870,716805,Asian,Male,[60-70),?,6,25,7,1,...,No,No,No,No,No,No,No,No,No,NO
384,3628932,1660932,Asian,Female,[50-60),?,6,25,7,6,...,No,Steady,No,No,No,No,No,No,Yes,>30
414,3783912,100533195,Asian,Male,[70-80),?,6,1,7,8,...,No,No,No,No,No,No,No,No,No,>30
649,5122188,6490224,Asian,Male,[50-60),?,6,25,1,2,...,No,No,No,No,No,No,No,No,Yes,>30
766,5929704,90056529,Asian,Male,[60-70),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [9]:
# Relative frequency of each 'diabetesMed' value in the non-Asian subset
# (normalize=True returns fractions instead of raw counts).
df_without_Asian['diabetesMed'].value_counts(normalize=True)

Yes    0.770215
No     0.229785
Name: diabetesMed, dtype: float64

In [10]:
# Relative frequency of each 'diabetesMed' value in the Asian subset,
# for comparison with the non-Asian distribution.
df_with_Asian['diabetesMed'].value_counts(normalize=True)

Yes    0.74103
No     0.25897
Name: diabetesMed, dtype: float64

In [11]:
# Hold out 25% of the non-Asian rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
TRAIN_DATA, EVAL_DATA = train_test_split(
    df_without_Asian,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Serving data: the held-out Asian rows with the 'diabetesMed' column removed.
SERVING_DATA = df_with_Asian.drop(columns='diabetesMed')

In [13]:
# Compute TFDV statistics for the training DataFrame.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=TRAIN_DATA)

In [15]:
# Visualize the training statistics to inspect feature distributions
# and decide whether any transformations are required.
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [16]:
# Infer a schema from the training statistics and render it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'encounter_id',INT,required,,-
'patient_nbr',INT,required,,-
'race',STRING,required,,'race'
'gender',STRING,required,,'gender'
'age',STRING,required,,'age'
'weight',STRING,required,,'weight'
'admission_type_id',INT,required,,-
'discharge_disposition_id',INT,required,,-
'admission_source_id',INT,required,,-
'time_in_hospital',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'?', 'AfricanAmerican', 'Caucasian', 'Hispanic', 'Other'"
'gender',"'Female', 'Male', 'Unknown/Invalid'"
'age',"'[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'"
'weight',"'>200', '?', '[0-25)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '[25-50)', '[50-75)', '[75-100)'"
'payer_code',"'?', 'BC', 'CH', 'CM', 'CP', 'DM', 'HM', 'MC', 'MD', 'MP', 'OG', 'OT', 'PO', 'SI', 'SP', 'UN', 'WC'"
'medical_specialty',"'?', 'AllergyandImmunology', 'Anesthesiology', 'Anesthesiology-Pediatric', 'Cardiology', 'Cardiology-Pediatric', 'DCPTEAM', 'Dentistry', 'Emergency/Trauma', 'Endocrinology', 'Endocrinology-Metabolism', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'Hospitalist', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Neurophysiology', 'Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'OutreachServices', 'Pathology', 'Pediatrics', 'Pediatrics-AllergyandImmunology', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-InfectiousDiseases', 'Pediatrics-Neurology', 'Pediatrics-Pulmonology', 'Perinatology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Proctology', 'Psychiatry', 'Psychiatry-Addictive', 'Psychiatry-Child/Adolescent', 'Psychology', 'Pulmonology', 'Radiologist', 'Radiology', 'Resident', 'Rheumatology', 'Speech', 'SportsMedicine', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-Thoracic', 'Surgery-Vascular', 'SurgicalSpecialty', 'Urology'"
'max_glu_serum',"'>200', '>300', 'None', 'Norm'"
'A1Cresult',"'>7', '>8', 'None', 'Norm'"
'metformin',"'Down', 'No', 'Steady', 'Up'"
'repaglinide',"'Down', 'No', 'Steady', 'Up'"


In [17]:
# Compute TFDV statistics for the evaluation DataFrame.
eval_stats = tfdv.generate_statistics_from_dataframe(dataframe=EVAL_DATA)

In [18]:
# Side-by-side view of evaluation (left) and training (right) statistics
# to spot distribution differences between the two splits.
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [19]:
# Validate the evaluation statistics against the schema inferred from the
# training data and show any anomalies found.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'glimepiride-pioglitazone',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).
'medical_specialty',Unexpected string values,"Examples contain values missing from the schema: Dermatology (<1%), Surgery-PlasticwithinHeadandNeck (<1%)."
'payer_code',Unexpected string values,Examples contain values missing from the schema: FR (<1%).


In [20]:
# Relax the minimum fraction of values that must come from the domain for feature medical_specialty.
medical_specialty = tfdv.get_feature(schema, 'medical_specialty')
medical_specialty.distribution_constraints.min_domain_mass = 0.9

# Add new value to the domain of feature glimepiride-pioglitazone. (medicine)
# The membership check keeps this cell idempotent: re-running it must not append
# a duplicate entry to the schema (which is later written to disk).
glimepiride_pioglitazone = tfdv.get_domain(schema, 'glimepiride-pioglitazone')
if 'Steady' not in glimepiride_pioglitazone.value:
    glimepiride_pioglitazone.value.append('Steady')

# Add new value to the domain of feature payer_code (same idempotent guard).
payer_code = tfdv.get_domain(schema, 'payer_code')
if 'FR' not in payer_code.value:
    payer_code.value.append('FR')

# Validate eval stats after updating the schema
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

In [21]:
# Compute statistics for the serving data, then validate them against the
# schema derived from the training data.
serving_stats = tfdv.generate_statistics_from_dataframe(dataframe=SERVING_DATA)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
)
tfdv.display_anomalies(serving_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'diabetesMed',Column dropped,Column is completely missing
'race',Unexpected string values,Examples contain values missing from the schema: Asian (~100%).


In [22]:
# Add new value to the domain of feature race.
# Guarded so that re-running the cell does not append 'Asian' a second time.
race = tfdv.get_domain(schema, 'race')
if 'Asian' not in race.value:
    race.value.append('Asian')

# Validate serving stats after updating the schema
updated_serving_anomalies = tfdv.validate_statistics(serving_stats, schema)
tfdv.display_anomalies(updated_serving_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'diabetesMed',Column dropped,Column is completely missing


In [23]:
# All features are by default in both TRAINING and SERVING environments.
# Each environment is added only if absent, so re-running the cell does not
# accumulate duplicate entries in the schema.
for env in ('TRAINING', 'SERVING'):
    if env not in schema.default_environment:
        schema.default_environment.append(env)

# Specify that 'diabetesMed' feature is not in SERVING environment
# (the column is dropped from SERVING_DATA, so it must not be expected there).
diabetes_med = tfdv.get_feature(schema, 'diabetesMed')
if 'SERVING' not in diabetes_med.not_in_environment:
    diabetes_med.not_in_environment.append('SERVING')

# Validate the serving statistics under the SERVING environment.
serving_anomalies_with_env = tfdv.validate_statistics(
    serving_stats, schema, environment='SERVING')

tfdv.display_anomalies(serving_anomalies_with_env)

In [24]:
# Skew comparator on 'glimepiride-pioglitazone': L-infinity threshold of 0.01.
glimepiride_pioglitazone = tfdv.get_feature(schema, 'glimepiride-pioglitazone')
glimepiride_pioglitazone.skew_comparator.infinity_norm.threshold = 0.01

# Drift comparator on 'medical_specialty': L-infinity threshold of 0.001.
medical_specialty = tfdv.get_feature(schema, 'medical_specialty')
medical_specialty.drift_comparator.infinity_norm.threshold = 0.001

# Validate the training statistics, supplying the evaluation statistics as the
# "previous" span and the serving statistics for the comparators configured above.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
    serving_statistics=serving_stats,
)

tfdv.display_anomalies(skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'medical_specialty',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.00244668 (up to six significant digits), above the threshold 0.001. The feature value with maximum difference is: InternalMedicine"


In [25]:
from tensorflow.python.lib.io import file_io
from google.protobuf import text_format
# Directory the schema file is written to.
OUTPUT_DIR='./'
# Create the output directory (and any missing parents) through TensorFlow's
# public filesystem API rather than the private tensorflow.python module.
tf.io.gfile.makedirs(OUTPUT_DIR)
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
# Persist the curated schema in text format.
tfdv.write_schema_text(schema, schema_file)

# Show the first lines of the schema file just written (IPython shell escape;
# {schema_file} is interpolated from the Python variable).
!head {schema_file}


feature {
  name: "encounter_id"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
