In [1]:
!pip install tensorflow_data_validation

Collecting tensorflow_data_validation
  Downloading tensorflow_data_validation-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
Collecting apache-beam[gcp]<3,>=2.47 (from tensorflow_data_validation)
  Downloading apache_beam-2.53.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyfarmhash<0.4,>=0.2.2 (from tensorflow_data_validation)
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tfx-bsl<1.15,>=1.14.0 (from tensorflow_data_validation)
  Downloading tfx_bsl-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylin

In [2]:
import os
import pandas as pd
import tensorflow as tf
import tempfile, urllib, zipfile
import tensorflow_data_validation as tfdv

from tensorflow.python.lib.io import file_io
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

In [3]:
df = pd.read_csv("/content/diabetic_data.csv",header=0, na_values = '?')
df.head(2)

  df = pd.read_csv("/content/diabetic_data.csv",header=0, na_values = '?')


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30


In [4]:
# Splitting the data

def prepare_data_splits_from_dataframe(df) :
  train_len = int(len(df) * 0.7)
  eval_serv_len = len(df) - train_len
  eval_len = eval_serv_len // 2
  serv_len = eval_serv_len - eval_len

  train_df = df.iloc[: train_len].sample(frac = 1, random_state = 48).reset_index(drop=True)
  eval_df = df.iloc[train_len: train_len + eval_len].sample(frac = 1, random_state = 48).reset_index(drop = True)
  serving_df = df.iloc[train_len+eval_len : train_len + eval_len + serv_len].sample(frac=1, random_state = 48).reset_index(drop = True)

  serving_df = serving_df.drop(['readmitted'], axis = 1)

  return train_df, eval_df, serving_df


In [5]:
# Split the datasets
train_df, eval_df, serving_df = prepare_data_splits_from_dataframe(df)

In [6]:
# Features to remove
features_to_remove = {'encounter_id', 'patient_nbr'}

# Create a list of approved features
approved_cols = [col for col in df.columns if col not in features_to_remove]

# Create a new DataFrame with only the approved features
train_df = train_df[approved_cols]
eval_df = eval_df[approved_cols]

cols = [col for col in serving_df.columns if col not in features_to_remove]
serving_df = serving_df[cols]
# Compute statistics for the filtered DataFrame
stats_options = tfdv.StatsOptions()
train_stats = tfdv.generate_statistics_from_dataframe(train_df, stats_options=stats_options)

In [7]:
# TEST CODE

# get the number of features used to compute statistics
print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples used: {train_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")

Number of features used: 48
Number of examples used: 34758
First feature: race
Last feature: readmitted


In [8]:
# visulaise
tfdv.visualize_statistics(train_stats)

In [9]:
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'race',STRING,optional,single,'race'
'gender',STRING,required,,'gender'
'age',STRING,required,,'age'
'weight',STRING,optional,single,'weight'
'admission_type_id',INT,required,,-
'discharge_disposition_id',INT,required,,-
'admission_source_id',INT,required,,-
'time_in_hospital',INT,required,,-
'payer_code',STRING,optional,single,'payer_code'
'medical_specialty',STRING,optional,single,'medical_specialty'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic', 'Other'"
'gender',"'Female', 'Male', 'Unknown/Invalid'"
'age',"'[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'"
'weight',"'>200', '[0-25)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '[25-50)', '[50-75)', '[75-100)'"
'payer_code',"'BC', 'CH', 'CM', 'CP', 'DM', 'HM', 'MC', 'MD', 'OT', 'PO', 'SI', 'SP', 'UN', 'WC'"
'medical_specialty',"'AllergyandImmunology', 'Anesthesiology', 'Anesthesiology-Pediatric', 'Cardiology', 'Dentistry', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'Pediatrics', 'Pediatrics-AllergyandImmunology', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-InfectiousDiseases', 'Pediatrics-Neurology', 'Pediatrics-Pulmonology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Proctology', 'Psychiatry', 'Psychiatry-Addictive', 'Psychiatry-Child/Adolescent', 'Psychology', 'Pulmonology', 'Radiology', 'Rheumatology', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic', 'Surgery-Vascular', 'Urology'"
'max_glu_serum',"'>200', '>300', 'None', 'Norm'"
'A1Cresult',"'>7', '>8', 'None', 'Norm'"
'metformin',"'Down', 'No', 'Steady', 'Up'"
'repaglinide',"'Down', 'No', 'Steady', 'Up'"


In [10]:
# testing 2.0

print(f"No of features in schema : {len(schema.feature)}")
print(f"Second feature in schema : {list(schema.feature)[1].domain}")

No of features in schema : 48
Second feature in schema : gender


In [11]:
# Evaluation statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df, stats_options = stats_options)

# Comparing
tfdv.visualize_statistics(
    lhs_statistics = eval_stats,
    rhs_statistics = train_stats,
    lhs_name = 'EVAL_DATASET',
    rhs_name = 'TRAIN_DATASET'
)

In [12]:
print(f"No of features : {len(eval_stats.datasets[0].features)}")
print(f"No of examples : {eval_stats.datasets[0].num_examples}")
print(f"First feature: {eval_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {eval_stats.datasets[0].features[-1].path.step[0]}")

No of features : 48
No of examples : 7448
First feature: race
Last feature: readmitted


In [13]:
# Detecting anamoly
def calculate_and_display_anomalies(statistics, schema):
  anomalies = tfdv.validate_statistics(statistics, schema)
  tfdv.display_anomalies(anomalies)

In [14]:
calculate_and_display_anomalies(eval_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'medical_specialty',Unexpected string values,"Examples contain values missing from the schema: Obstetrics (<1%), Radiologist (<1%), SurgicalSpecialty (<1%)."
'payer_code',Unexpected string values,Examples contain values missing from the schema: OG (~1%).
'glipizide-metformin',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).
'nateglinide',Unexpected string values,Examples contain values missing from the schema: Up (<1%).
'miglitol',Unexpected string values,Examples contain values missing from the schema: Down (<1%).


In [15]:
# anomalies correction

glimepride = tfdv.get_domain(schema, 'glimepiride-pioglitazone')
glimepride.value.append('Steady')

medical = tfdv.get_domain(schema, 'medical_specialty')
medical.value.append('Neurophysiology')

calculate_and_display_anomalies(eval_stats, schema= schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'payer_code',Unexpected string values,Examples contain values missing from the schema: OG (~1%).
'medical_specialty',Unexpected string values,"Examples contain values missing from the schema: Obstetrics (<1%), Radiologist (<1%), SurgicalSpecialty (<1%)."
'miglitol',Unexpected string values,Examples contain values missing from the schema: Down (<1%).
'glipizide-metformin',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).
'nateglinide',Unexpected string values,Examples contain values missing from the schema: Up (<1%).


In [16]:
# checking anomalies in serving set
serving_stats = tfdv.generate_statistics_from_dataframe(serving_df, stats_options = stats_options)
calculate_and_display_anomalies(serving_stats, schema= schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'change',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'chlorpropamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'diabetesMed',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'num_lab_procedures',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'num_medications',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glimepiride-pioglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'A1Cresult',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'pioglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'miglitol',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'citoglipton',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"


In [17]:
# relaxes the min fraction
prayer_code = tfdv.get_feature(schema, 'payer_code')
prayer_code.distribution_constraints.min_domain_mass = 0.9

medical_specialty = tfdv.get_feature(schema, 'medical_specialty')
medical_specialty.distribution_constraints.min_domain_mass = 0.9

calculate_and_display_anomalies(serving_stats, schema = schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'metformin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glimepiride',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'nateglinide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866 Examples contain values missing from the schema: Up (<1%)."
'readmitted',Column dropped,Column is completely missing
'repaglinide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glipizide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'troglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'num_procedures',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'examide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glyburide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"


In [18]:
# Changing domain
def modify_domain_of_features(features_list, schema, to_domain_name):
  for feature in features_list:
    tfdv.set_domain(schema, feature, to_domain_name)
  return schema

In [19]:
domain_change_features = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
                          'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
                          'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
                          'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
                          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']


# Infer new schema by using your modify_domain_of_features function
# and the defined domain_change_features feature list
schema = modify_domain_of_features(domain_change_features, schema, 'metformin')

# Display new schema
tfdv.display_schema(schema)



Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'race',STRING,optional,single,'race'
'gender',STRING,required,,'gender'
'age',STRING,required,,'age'
'weight',STRING,optional,single,'weight'
'admission_type_id',INT,required,,-
'discharge_disposition_id',INT,required,,-
'admission_source_id',INT,required,,-
'time_in_hospital',INT,required,,-
'payer_code',STRING,optional,single,'payer_code'
'medical_specialty',STRING,optional,single,'medical_specialty'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic', 'Other'"
'gender',"'Female', 'Male', 'Unknown/Invalid'"
'age',"'[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'"
'weight',"'>200', '[0-25)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '[25-50)', '[50-75)', '[75-100)'"
'payer_code',"'BC', 'CH', 'CM', 'CP', 'DM', 'HM', 'MC', 'MD', 'OT', 'PO', 'SI', 'SP', 'UN', 'WC'"
'medical_specialty',"'AllergyandImmunology', 'Anesthesiology', 'Anesthesiology-Pediatric', 'Cardiology', 'Dentistry', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'Pediatrics', 'Pediatrics-AllergyandImmunology', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-InfectiousDiseases', 'Pediatrics-Neurology', 'Pediatrics-Pulmonology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Proctology', 'Psychiatry', 'Psychiatry-Addictive', 'Psychiatry-Child/Adolescent', 'Psychology', 'Pulmonology', 'Radiology', 'Rheumatology', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic', 'Surgery-Vascular', 'Urology', 'Neurophysiology'"
'max_glu_serum',"'>200', '>300', 'None', 'Norm'"
'A1Cresult',"'>7', '>8', 'None', 'Norm'"
'metformin',"'Down', 'No', 'Steady', 'Up'"
'repaglinide',"'Down', 'No', 'Steady', 'Up'"


In [20]:
# TEST CODE

# check that the domain of some features are now switched to `metformin`
print(f"Domain name of 'chlorpropamide': {tfdv.get_feature(schema, 'chlorpropamide').domain}")
print(f"Domain values of 'chlorpropamide': {tfdv.get_domain(schema, 'chlorpropamide').value}")
print(f"Domain name of 'repaglinide': {tfdv.get_feature(schema, 'repaglinide').domain}")
print(f"Domain values of 'repaglinide': {tfdv.get_domain(schema, 'repaglinide').value}")
print(f"Domain name of 'nateglinide': {tfdv.get_feature(schema, 'nateglinide').domain}")
print(f"Domain values of 'nateglinide': {tfdv.get_domain(schema, 'nateglinide').value}")

Domain name of 'chlorpropamide': metformin
Domain values of 'chlorpropamide': ['Down', 'No', 'Steady', 'Up']
Domain name of 'repaglinide': metformin
Domain values of 'repaglinide': ['Down', 'No', 'Steady', 'Up']
Domain name of 'nateglinide': metformin
Domain values of 'nateglinide': ['Down', 'No', 'Steady', 'Up']


In [21]:
calculate_and_display_anomalies(serving_stats, schema = schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'tolazamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glipizide-metformin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glimepiride-pioglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'number_inpatient',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'acetohexamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'num_procedures',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'diabetesMed',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'insulin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'number_outpatient',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'A1Cresult',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"


In [22]:
# Environments
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

In [24]:
tfdv.get_feature(schema, 'readmitted').not_in_environment.append('SERVING')
serving_anomalies_with_env  = tfdv.validate_statistics(serving_stats, schema, environment = 'SERVING')

In [25]:
tfdv.display_anomalies(serving_anomalies_with_env)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'chlorpropamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'num_medications',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'rosiglitazone',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'glimepiride',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'acetohexamide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'number_inpatient',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'change',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'citoglipton',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'metformin',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"
'nateglinide',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.999866"


In [28]:
# Calculate skew for the diabetesMed feature
diabetes_med = tfdv.get_feature(schema, 'diabetesMed')
diabetes_med.skew_comparator.infinity_norm.threshold = 0.035 # domain knowledge helps to determine this threshold

# Calculate drift for the payer_code feature
payer_code = tfdv.get_feature(schema, 'payer_code')
payer_code.drift_comparator.infinity_norm.threshold = 0.03 # domain knowledge helps to determine this threshold

# Calculate anomalies
skew_drift_anomalies = tfdv.validate_statistics(train_stats, schema,
                                          previous_statistics=eval_stats,
                                          serving_statistics=serving_stats)

# Display anomalies
tfdv.display_anomalies(skew_drift_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'payer_code',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.104036 (up to six significant digits), above the threshold 0.03. The feature value with maximum difference is: SP"


In [29]:
# Splitting
def split_datasets(dataset_list):
  datasets = []
  for dataset in dataset_list.datasets :
    proto_list = DatasetFeatureStatisticsList()
    proto_list.datasets.extend([dataset])
    datasets.append(proto_list)
  return datasets

def display_stats_at_index(index, datasets):
  if index < len(datasets):
    print(datasets[index].datasets[0].name)
    tfdv.visualize_statistics(datasets[index])

In [32]:
def sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe, schema):
    '''
    generate statistics for the sliced data.

            Parameters:
                    slice_fn : slicing definition
                    approved_cols: list of features to pass to the statistics options
                    dataframe: pandas dataframe to slice
                    schema: the schema

            Returns:
                    slice_info_datasets: statistics for the sliced dataset
    '''
    # Set the StatsOptions
    slice_stats_options = tfdv.StatsOptions(schema=schema,
                                            slice_functions=[slice_fn],
                                            infer_type_from_schema=True,
                                             feature_allowlist=approved_cols)

    # Convert Dataframe to CSV since `slice_functions` works only with `tfdv.generate_statistics_from_csv`
    CSV_PATH = 'slice_sample.csv'
    dataframe.to_csv(CSV_PATH)

    # Calculate statistics for the sliced dataset
    sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options=slice_stats_options)

    # Split the dataset using the previously defined split_datasets function
    slice_info_datasets = split_datasets(sliced_stats)

    return slice_info_datasets

In [33]:
# Generate slice function for the `medical_speciality` feature
slice_fn = slicing_util.get_feature_value_slicer(features={'medical_specialty': None})

# Generate stats for the sliced dataset
slice_datasets = sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe=train_df, schema=schema)

# Print name of slices for reference
print(f'Statistics generated for:\n')
print('\n'.join([sliced.datasets[0].name for sliced in slice_datasets]))

# Display at index 10, which corresponds to the slice named `medical_specialty_Gastroenterology`
display_stats_at_index(10, slice_datasets)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Statistics generated for:

All Examples
medical_specialty_Psychiatry
medical_specialty_Surgery-Neuro
medical_specialty_InternalMedicine
medical_specialty_Orthopedics-Reconstructive
medical_specialty_Pediatrics-CriticalCare
medical_specialty_Family/GeneralPractice
medical_specialty_Nephrology
medical_specialty_Anesthesiology
medical_specialty_Cardiology
medical_specialty_Surgery-General
medical_specialty_Pulmonology
medical_specialty_Orthopedics
medical_specialty_Pediatrics
medical_specialty_Urology
medical_specialty_Emergency/Trauma
medical_specialty_PhysicalMedicineandRehabilitation
medical_specialty_ObstetricsandGynecology
medical_specialty_Surgery-Cardiovascular/Thoracic
medical_specialty_InfectiousDiseases
medical_specialty_Psychiatry-Child/Adolescent
medical_specialty_Hematology/Oncology
medical_specialty_Surgery-Plastic
medical_specialty_Otolaryngology
medical_specialty_Gastroenterology
medical_specialty_Neurology
medical_specialty_Osteopath
medical_specialty_Endocrinology
medica

In [34]:
Output_dir = "output"
file_io.recursive_create_dir(Output_dir)
schema_file = os.path.join(Output_dir, 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)