In [6]:
pip install tensorflow-data-validation==1.4.0


Collecting tensorflow-data-validation==1.4.0
  Downloading tensorflow_data_validation-1.4.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 4.7 MB/s 
Collecting absl-py<0.13,>=0.9
  Downloading absl_py-0.12.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 39.9 MB/s 
Collecting numpy<1.20,>=1.16
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 402 kB/s 
Collecting tfx-bsl<1.5,>=1.4.0
  Downloading tfx_bsl-1.4.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 1.2 MB/s 
Collecting tensorflow-metadata<1.5,>=1.4
  Downloading tensorflow_metadata-1.4.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 3.9 MB/s 
Collecting tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,<3,>=1.15.2
  Downloading tensorflow-2.7.1-cp37-cp

# Import Libraries

In [1]:
# Import packages
import os
import pandas as pd
import tensorflow as tf
import tempfile, urllib, zipfile
import tensorflow_data_validation as tfdv


from tensorflow.python.lib.io import file_io
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList, DatasetFeatureStatistics

# Set TF's logger to only display errors to avoid internal warnings being shown
tf.get_logger().setLevel('ERROR')

#Read and Split the Dataset

In [2]:
# Read CSV data into a dataframe and recognize the missing data that is encoded with '?' string as NaN
df = pd.read_csv('diabetic_data.csv', header=0, na_values = '?')

# Preview the dataset
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
def prepare_data_splits_from_dataframe(df):
    
    # 70% of records for generating the training set
    train_len = int(len(df) * 0.7)
    
    # Remaining 30% of records for generating the evaluation and serving sets
    eval_serv_len = len(df) - train_len
    
    # Half of the 30%, which makes up 15% of total records, for generating the evaluation set
    eval_len = eval_serv_len // 2
    
    # Remaining 15% of total records for generating the serving set
    serv_len = eval_serv_len - eval_len 
 
    # Sample the train, validation and serving sets. We specify a random state for repeatable outcomes.
    train_df = df.iloc[:train_len].sample(frac=1, random_state=48).reset_index(drop=True)
    eval_df = df.iloc[train_len: train_len + eval_len].sample(frac=1, random_state=48).reset_index(drop=True)
    serving_df = df.iloc[train_len + eval_len: train_len + eval_len + serv_len].sample(frac=1, random_state=48).reset_index(drop=True)
 
    # Serving data emulates the data that would be submitted for predictions, so it should not have the label column.
    serving_df = serving_df.drop(['readmitted'], axis=1)

    return train_df, eval_df, serving_df

In [4]:
# Split the datasets
train_df, eval_df, serving_df = prepare_data_splits_from_dataframe(df)
print('Training dataset has {} records\nValidation dataset has {} records\nServing dataset has {} records'.format(len(train_df),len(eval_df),len(serving_df)))



Training dataset has 3894 records
Validation dataset has 835 records
Serving dataset has 835 records


#Removing Irrelevant Features


In [5]:
# Define features to remove
features_to_remove = {'encounter_id', 'patient_nbr'}

# Collect features to whitelist while computing the statistics
approved_cols = [col for col in df.columns if (col not in features_to_remove)]

# Instantiate a StatsOptions class and define the feature_whitelist property
stats_options = tfdv.StatsOptions(feature_allowlist=approved_cols)

# Review the features to generate the statistics
print(stats_options.feature_allowlist)

['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


#Generate Training Statistics

In [6]:
train_stats = tfdv.generate_statistics_from_dataframe(train_df, stats_options)

# get the number of features used to compute statistics
print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples used: {train_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")

Number of features used: 48
Number of examples used: 3894
First feature: race
Last feature: readmitted


#Visualize Training Statistics

In [7]:
tfdv.visualize_statistics(train_stats)

#Infer the training set schema

In [8]:
# Infer the data schema by using the training statistics that you generated
schema = tfdv.infer_schema(train_stats)

# Display the data schema
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'race',STRING,optional,single,'race'
'gender',STRING,required,,'gender'
'age',STRING,required,,'age'
'weight',STRING,optional,single,'weight'
'admission_type_id',INT,required,,-
'discharge_disposition_id',INT,required,,-
'admission_source_id',INT,required,,-
'time_in_hospital',INT,required,,-
'payer_code',FLOAT,optional,,-
'medical_specialty',STRING,optional,single,'medical_specialty'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic', 'Other'"
'gender',"'Female', 'Male'"
'age',"'[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'"
'weight',"'[0-25)', '[100-125)', '[125-150)', '[25-50)', '[50-75)', '[75-100)'"
'medical_specialty',"'Anesthesiology-Pediatric', 'Cardiology', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology/Oncology', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Otolaryngology', 'Pediatrics', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-Neurology', 'Pediatrics-Pulmonology', 'PhysicalMedicineandRehabilitation', 'Podiatry', 'Psychiatry', 'Psychiatry-Child/Adolescent', 'Psychology', 'Pulmonology', 'Radiology', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic', 'Urology'"
'max_glu_serum',"'>200', '>300', 'None', 'Norm'"
'A1Cresult',"'>7', '>8', 'None', 'Norm'"
'metformin',"'Down', 'No', 'Steady', 'Up'"
'repaglinide',"'Down', 'No', 'Steady', 'Up'"
'nateglinide','No'


In [9]:
# Check number of features
print(f"Number of features in schema: {len(schema.feature)}")

# Check domain name of 2nd feature
print(f"Second feature in schema: {list(schema.feature)[1].domain}")

Number of features in schema: 48
Second feature in schema: gender


#Calculate, Visualize and Fix Evaluation Anomalies

In [10]:
# Compare Training and Evaluation Statistics
# HINT: Remember to use the evaluation dataframe and to pass the stats_options (that you defined before) as an argument
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df, stats_options=stats_options)

# Compare evaluation data with training data 
# HINT: Remember to use both the evaluation and training statistics with the lhs_statistics and rhs_statistics arguments
# HINT: Assign the names of 'EVAL_DATASET' and 'TRAIN_DATASET' to the lhs and rhs protocols
tfdv.visualize_statistics(lhs_statistics=eval_stats, rhs_statistics=train_stats,lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')
                          

In [11]:
# get the number of features used to compute statistics
print(f"Number of features: {len(eval_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples: {eval_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {eval_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {eval_stats.datasets[0].features[-1].path.step[0]}")

Number of features: 48
Number of examples: 835
First feature: race
Last feature: readmitted


#Detecting Anomalies

In [12]:
def calculate_and_display_anomalies(statistics, schema):

    # HINTS: Pass the statistics and schema parameters into the validation function 
    anomalies = tfdv.validate_statistics(statistics, schema)
    
    # HINTS: Display input anomalies by using the calculated anomalies
    tfdv.display_anomalies(anomalies)


calculate_and_display_anomalies(eval_stats, schema=schema)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'medical_specialty',Unexpected string values,Examples contain values missing from the schema: InfectiousDiseases (<1%).
'glyburide-metformin',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).


#Fix evaluation anomalies in the schema

In [13]:
# Get the domain associated with the input feature, glimepiride-pioglitazone, from the schema
glyburide_metformin_domain = tfdv.get_domain(schema, 'glyburide-metformin') 

# HINT: Append the missing value 'Steady' to the domain
glyburide_metformin_domain.value.append('Down')

# Get the domain associated with the input feature, medical_specialty, from the schema
medical_specialty_domain = tfdv.get_domain(schema, 'medical_specialty') 

# HINT: Append the missing value 'Neurophysiology' to the domain
medical_specialty_domain.value.append('AllergyandImmunology')
medical_specialty_domain.value.append('Rheumatology')
medical_specialty_domain.value.append('Surgery-Maxillofacial')

# HINT: Re-calculate and re-display anomalies with the new schema
calculate_and_display_anomalies(eval_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'medical_specialty',Unexpected string values,Examples contain values missing from the schema: InfectiousDiseases (<1%).
'glyburide-metformin',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).


#Check anomalies in the serving set
Let's create a new StatsOptions that is aware of the information provided by the schema and use it when generating statistics from the serving DataFrame.



In [14]:
# Define a new statistics options by the tfdv.StatsOptions class for the serving data by passing the previously inferred schema
options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True, feature_allowlist=approved_cols)

# Generate serving dataset statistics
# HINT: Remember to use the serving dataframe and to pass the newly defined statistics options
serving_stats = tfdv.generate_statistics_from_dataframe(serving_df, stats_options=options)

# HINT: Calculate and display anomalies using the generated serving statistics
calculate_and_display_anomalies(serving_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'metformin-rosiglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'readmitted',Column dropped,Column is completely missing
'A1Cresult',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'nateglinide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'acarbose',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'glyburide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'metformin-pioglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'metformin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'acetohexamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'troglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"


#Detecting anomalies with environments

In [18]:
# All features are by default in both TRAINING and SERVING environments.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# Specify that 'readmitted' feature is not in SERVING environment.
# HINT: Append the 'SERVING' environmnet to the not_in_environment attribute of the feature
tfdv.get_feature(schema, 'readmitted').not_in_environment.append('SERVING')

# HINT: Calculate anomalies with the validate_statistics function by using the serving statistics, 
# inferred schema and the SERVING environment parameter.
serving_anomalies_with_env = tfdv.validate_statistics(serving_stats, schema, environment='SERVING')

# Display anomalies
tfdv.display_anomalies(serving_anomalies_with_env)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'max_glu_serum',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'pioglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'metformin-rosiglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'A1Cresult',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'diabetesMed',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'tolbutamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'glipizide-metformin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'miglitol',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'number_outpatient',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"
'number_diagnoses',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.998802"


#Check for Data Drift and Skew

In [21]:
# Calculate skew for the diabetesMed feature
diabetes_med = tfdv.get_feature(schema, 'diabetesMed')
diabetes_med.skew_comparator.infinity_norm.threshold = 0.03 # domain knowledge helps to determine this threshold

# Calculate drift for the payer_code feature
payer_code = tfdv.get_feature(schema, 'payer_code')
payer_code.drift_comparator.infinity_norm.threshold = 0.03 # domain knowledge helps to determine this threshold

# Calculate anomalies
skew_drift_anomalies = tfdv.validate_statistics(train_stats, schema,
                                          previous_statistics=eval_stats,
                                          serving_statistics=serving_stats)

# Display anomalies
tfdv.display_anomalies(skew_drift_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'diabetesMed',High Linfty distance between training and serving,"The Linfty distance between training and serving is 0.0638318 (up to six significant digits), above the threshold 0.03. The feature value with maximum difference is: Yes"


#Display Stats for Data Slices

In [23]:
def split_datasets(dataset_list):
    '''
    split datasets.

            Parameters:
                    dataset_list: List of datasets to split

            Returns:
                    datasets: sliced data
    '''
    datasets = []
    for dataset in dataset_list.datasets:
        proto_list = DatasetFeatureStatisticsList()
        proto_list.datasets.extend([dataset])
        datasets.append(proto_list)
    return datasets


def display_stats_at_index(index, datasets):
    '''
    display statistics at the specified data index

            Parameters:
                    index : index to show the anomalies
                    datasets: split data

            Returns:
                    display of generated sliced data statistics at the specified index
    '''
    if index < len(datasets):
        print(datasets[index].datasets[0].name)
        tfdv.visualize_statistics(datasets[index])

In [26]:
def sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe, schema):
  
    # Set the StatsOptions
    slice_stats_options = tfdv.StatsOptions(schema=schema,slice_functions=[slice_fn],infer_type_from_schema=True,feature_allowlist=approved_cols)
    
    # Convert Dataframe to CSV since `slice_functions` works only with `tfdv.generate_statistics_from_csv`
    CSV_PATH = 'slice_sample.csv'
    dataframe.to_csv(CSV_PATH)
    
    # Calculate statistics for the sliced dataset
    sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options=slice_stats_options)
    
    # Split the dataset using the previously defined split_datasets function
    slice_info_datasets = split_datasets(sliced_stats)
    
    return slice_info_datasets

In [27]:
# Generate slice function for the `medical_speciality` feature
slice_fn = slicing_util.get_feature_value_slicer(features={'medical_specialty': None})

# Generate stats for the sliced dataset
slice_datasets = sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe=train_df, schema=schema)

# Print name of slices for reference
print(f'Statistics generated for:\n')
print('\n'.join([sliced.datasets[0].name for sliced in slice_datasets]))

# Display at index 10, which corresponds to the slice named `medical_specialty_Gastroenterology`
display_stats_at_index(10, slice_datasets)





Statistics generated for:

All Examples
medical_specialty_Family/GeneralPractice
medical_specialty_InternalMedicine
medical_specialty_Cardiology
medical_specialty_Orthopedics-Reconstructive
medical_specialty_Nephrology
medical_specialty_Psychiatry
medical_specialty_Surgery-General
medical_specialty_Gastroenterology
medical_specialty_Neurology
medical_specialty_Pediatrics-Endocrinology
medical_specialty_Surgery-Cardiovascular/Thoracic
medical_specialty_Psychiatry-Child/Adolescent
medical_specialty_Obsterics&Gynecology-GynecologicOnco
medical_specialty_Pulmonology
medical_specialty_Hematology/Oncology
medical_specialty_Pediatrics-CriticalCare
medical_specialty_Pediatrics
medical_specialty_Surgery-Neuro
medical_specialty_ObstetricsandGynecology
medical_specialty_Urology
medical_specialty_Otolaryngology
medical_specialty_Endocrinology
medical_specialty_PhysicalMedicineandRehabilitation
medical_specialty_Gynecology
medical_specialty_Anesthesiology-Pediatric
medical_specialty_Pediatrics-Hema

#Freeze the schema

In [None]:
# Create output directory
OUTPUT_DIR = "output"
file_io.recursive_create_dir(OUTPUT_DIR)

# Use TensorFlow text output format pbtxt to store the schema
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')

# write_schema_text function expect the defined schema and output path as parameters
tfdv.write_schema_text(schema, schema_file) 