In [1]:
import tempfile
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as tft_beam

from __future__ import print_function
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

In [2]:
dataset =  pd.read_csv("pollution-small.csv")

In [3]:
dataset.head()

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25
3,1/4/2009,72.0,17.3,46.44,34.38
4,1/5/2009,81.0,25.64,56.56,45.59


In [4]:
fratures = dataset.drop("Date",axis=1)

In [5]:
fratures.head()

Unnamed: 0,pm10,no2,so2,soot
0,98.67,14.1,44.38,34.81
1,52.33,14.1,29.75,33.06
2,74.67,20.5,36.25,39.25
3,72.0,17.3,46.44,34.38
4,81.0,25.64,56.56,45.59


In [6]:
dict_features = list(fratures.to_dict("index").values())

In [7]:
dict_features[:2]

[{'no2': 14.1, 'pm10': 98.67, 'so2': 44.38, 'soot': 34.81},
 {'no2': 14.1, 'pm10': 52.33, 'so2': 29.75, 'soot': 33.06}]

In [8]:
data_model = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'no2':  tf.compat.v1.FixedLenFeature([],tf.float32),
        'so2':  tf.compat.v1.FixedLenFeature([],tf.float32),
        'pm10': tf.compat.v1.FixedLenFeature([],tf.float32),
        'soot': tf.compat.v1.FixedLenFeature([],tf.float32),
    }
    )
)

Instructions for updating:
from_feature_spec is a deprecated, use schema_utils.schema_from_feature_spec


In [9]:
data_model

{'_schema': feature {
  name: "no2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pm10"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "so2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "soot"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
}

In [10]:
def preprocessing_fn(inputs):
    
    no2 = inputs['no2']
    pm10 = inputs['pm10']
    so2 = inputs['so2']
    soot = inputs['soot']
    
    no2_normalized = no2 - tft.mean(no2)
    so2_normalized = so2 - tft.mean(so2)
    
    pm10_normalized = tft.scale_to_0_1(pm10)
    soot_normalized = tft.scale_by_min_max(soot)
    
    return {
        "no2_normalized":no2_normalized,
        "so2_normalized":so2_normalized,
        "pm10_normalized":pm10_normalized,
        "soot_normalized":soot_normalized
    }

In [11]:
def data_transform():
    
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = ((dict_features, data_model) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        
    transformed_data, transformed_metadata = transformed_dataset
    
    for i in range(len(transformed_data)):
        print("Raw: ", dict_features[i])
        print("Transformed:", transformed_data[i])

In [12]:
data_transform()

NameError: name 'data_metadata' is not defined