### Install all dependencies and prepare the environment

In [4]:
!pip install python-snappy

Collecting python-snappy
[?25l  Downloading https://files.pythonhosted.org/packages/df/bd/a1040e2e04df42fb07e080e74f6464f6a5898bdefe6d08a210e3c3278fb9/python_snappy-0.6.0-cp37-cp37m-manylinux2010_x86_64.whl (55kB)
[K     |████████████████████████████████| 61kB 4.5MB/s eta 0:00:011
[?25hInstalling collected packages: python-snappy
Successfully installed python-snappy-0.6.0


In [1]:
import sys

# Confirm that we're using Python 3.
# Compare with == rather than `is`: identity comparison against an int literal
# depends on interpreter int caching and raises a SyntaxWarning on Python 3.8+.
assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'

In [2]:
import tensorflow as tf

print('Installing TensorFlow Data Validation')
!pip install -q tensorflow_data_validation[visualization]

Installing TensorFlow Data Validation
[K     |████████████████████████████████| 788kB 7.8MB/s 
[K     |████████████████████████████████| 368kB 39.3MB/s 
[31mERROR: jupyter-console 5.2.0 has requirement prompt-toolkit<2.0.0,>=1.0.0, but you'll have prompt-toolkit 3.0.18 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement ipython~=5.5.0, but you'll have ipython 7.22.0 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.25.1 which is incompatible.[0m
[?25h

In [3]:
import pandas as pd
import tensorflow_data_validation as tfdev

### Simple data-analysis

In [7]:
# Load the pollution CSV into a DataFrame.
csv_path = '/content/1.2 pollution_small.csv'
dataset = pd.read_csv(csv_path)

In [8]:
# Show the DataFrame dimensions as (rows, columns).
dataset.shape

(2188, 5)

In [9]:
# The first 1600 rows (by position) form the training split.
training_data = dataset.iloc[:1600]
# Summary statistics of the training split.
training_data.describe()

Unnamed: 0,pm10,no2,so2,soot
count,1600.0,1600.0,1600.0,1600.0
mean,49.656494,30.980519,16.229981,21.551956
std,35.211906,12.400788,10.621896,12.127354
min,6.38,9.74,4.01,6.0
25%,28.345,22.5675,9.7775,14.4
50%,38.835,28.715,13.275,18.63
75%,58.05,36.37,19.2825,24.0725
max,277.25,138.01,123.13,107.65


In [11]:
# Every row from position 1600 onwards forms the test split.
test_data = dataset.iloc[1600:]
# Summary statistics of the test split.
test_data.describe()

Unnamed: 0,pm10,no2,so2,soot
count,588.0,588.0,588.0,588.0
mean,44.648248,37.296922,13.60517,18.44131
std,28.992087,10.94005,5.098944,6.596459
min,11.9,15.07,4.99,8.0
25%,28.3375,29.2175,10.1225,14.41
50%,35.555,35.815,12.345,17.09
75%,50.8125,43.8725,15.855,20.9625
max,273.77,106.03,38.03,87.21


### Data Analysis and validation using tfdev (tensorflow_data_validation)

#### Generate training data statistics

In [12]:
# Compute statistics from the training split only, so that anything derived
# from them describes the training data rather than training and test rows combined.
train_stats = tfdev.generate_statistics_from_dataframe(dataframe=training_data)

#### Inferring the schema

In [13]:
# Infer a schema from the statistics held in train_stats.
schema = tfdev.infer_schema(train_stats)

In [14]:
# Render the inferred schema as a table.
tfdev.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Date',BYTES,required,,-
'pm10',FLOAT,required,,-
'no2',FLOAT,required,,-
'so2',FLOAT,required,,-
'soot',FLOAT,required,,-


#### Calculate test set statistics

In [15]:
# Compute statistics for the test split.
test_stats = tfdev.generate_statistics_from_dataframe(test_data)

In [16]:
schema = tfdev.infer_schema(statistics=test_stats)

In [17]:
tfdev.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Date',BYTES,required,,-
'pm10',FLOAT,required,,-
'no2',FLOAT,required,,-
'so2',FLOAT,required,,-
'soot',FLOAT,required,,-


### Compare test statistics with the schema 

#### checking for anomalies in new data

> Any difference between the test set statistics and the schema is called an anomaly.

In [19]:
# Check the test-set statistics against the schema.
anomalies = tfdev.validate_statistics(test_stats, schema)

#### Display all detected anomalies

* integer larger than 10
* STRING type when expected INT type
* FLOAT type when expected INT type
* integer smaller than 0

In [20]:
# Render the anomalies found for the test set (if any) as a table.
tfdev.display_anomalies(anomalies=anomalies)

  pd.set_option('max_colwidth', -1)


### New data WITH Anomalies

In [22]:
# Work on an independent copy so test_data itself is never modified.
test_set_copy = test_data.copy(deep=True)

In [23]:
# Build the frame without the 'soot' column directly from test_data.
# Unlike an in-place drop, this cell can be re-run without a KeyError,
# and it still fails loudly if test_data has no 'soot' column.
test_set_copy = test_data.drop(columns='soot')

#### Statistics based on data with anomalies

In [24]:
# Compute statistics for the test set that lacks the 'soot' column.
test_set_copy_stats = tfdev.generate_statistics_from_dataframe(test_set_copy)

In [25]:
# Validate the modified test-set statistics against the schema.
anomalies_new = tfdev.validate_statistics(test_set_copy_stats, schema)

In [26]:
# Render the anomalies found for the modified test set.
tfdev.display_anomalies(anomalies=anomalies_new)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'soot',Column dropped,Column is completely missing


### Prepare the schema for serving

In [30]:
# Register both environments on the schema. The membership check keeps the
# cell idempotent: re-running it does not append duplicate environment names.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

#### Removing a target column from the serving schema

In [31]:
# Mark 'soot' as not expected in the SERVING environment. The membership
# check avoids appending 'SERVING' again when the cell is re-run.
soot_feature = tfdev.get_feature(schema, 'soot')
if 'SERVING' not in soot_feature.not_in_environment:
    soot_feature.not_in_environment.append('SERVING')

#### Checking for anomalies between the SERVING environment and the new test set

In [32]:
# Validate the statistics of the 'soot'-less test set in the SERVING environment.
serving_env_anomalies = tfdev.validate_statistics(
    statistics=test_set_copy_stats,
    schema=schema,
    environment='SERVING',
)

In [33]:
# Render the anomalies found for the SERVING environment.
tfdev.display_anomalies(anomalies=serving_env_anomalies)

  pd.set_option('max_colwidth', -1)


### Freezing the schema

In [34]:
# Write the schema to disk in text format.
schema_output_path = 'pollution_schema.pbtxt'
tfdev.write_schema_text(schema, schema_output_path)