### Installing dependencies and setting up the environment

### Importing important libraries

In [1]:
try:
  import colab
  !pip install --upgrade pip
except:
  pass



In [2]:
!pip install -q -U tensorflow_transform==0.24.1

In [3]:
import pprint
import tempfile
import pandas as pd

import tensorflow as tf
import tensorflow_transform as tft

import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

### dataset preprocessing
#### Loading the dataset

In [6]:
# Location of the pollution CSV. This is an absolute path; change it here if
# the file lives somewhere else in your environment.
POLLUTION_CSV_PATH = '/content/1.2 pollution_small.csv'

# Read the raw measurements into a DataFrame.
dataset = pd.read_csv(POLLUTION_CSV_PATH)

In [7]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25
3,1/4/2009,72.0,17.3,46.44,34.38
4,1/5/2009,81.0,25.64,56.56,45.59


#### Dropping the date column

In [8]:
# Keep only the measurement columns; the original `dataset` is left unchanged.
features = dataset.drop(columns=['Date'])

In [9]:
# Preview the first rows after removing the 'Date' column.
features.head()

Unnamed: 0,pm10,no2,so2,soot
0,98.67,14.1,44.38,34.81
1,52.33,14.1,29.75,33.06
2,74.67,20.5,36.25,39.25
3,72.0,17.3,46.44,34.38
4,81.0,25.64,56.56,45.59


#### Converting the DataFrame to a list of Python dictionaries

In [10]:
# One dictionary per row, mapping column name -> value. The 'records'
# orientation builds this list directly, without an intermediate dict keyed by
# the index, so it also works when index labels are not unique.
dict_features = features.to_dict(orient='records')

In [11]:
# Show the first two records to check the row format.
dict_features[:2]

[{'no2': 14.1, 'pm10': 98.67, 'so2': 44.38, 'soot': 34.81},
 {'no2': 14.1, 'pm10': 52.33, 'so2': 29.75, 'soot': 33.06}]

#### Defining the dataset metadata

In [16]:
# Every raw column is declared as a scalar (shape []) float32 feature.
raw_feature_spec = {
    column: tf.io.FixedLenFeature([], tf.float32)
    for column in ('no2', 'so2', 'pm10', 'soot')
}

# Wrap the schema derived from the feature spec in a DatasetMetadata object.
data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(raw_feature_spec))

In [17]:
# Display the metadata object to inspect the schema it holds.
data_metadata

{'_schema': feature {
  name: "no2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pm10"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "so2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "soot"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
}

### The preprocessing function

In [18]:
def preprocessing_fn(inputs):
  """Build the transformed features from the raw pollution columns.

  Args:
    inputs: Mapping with the keys 'no2', 'so2', 'pm10' and 'soot'
      (presumably the tensors tf.Transform supplies for each raw column).

  Returns:
    A dict with four entries:
      'no2_normalized' / 'so2_normalized': the column minus its `tft.mean`.
      'pm10_normalized' / 'soot_normalized': the column passed through
        `tft.scale_to_0_1`.
  """

  def _subtract_mean(column):
    # Centre the column on its dataset-wide mean.
    return column - tft.mean(column)

  return {
      'no2_normalized': _subtract_mean(inputs['no2']),
      'so2_normalized': _subtract_mean(inputs['so2']),
      'pm10_normalized': tft.scale_to_0_1(inputs['pm10']),
      'soot_normalized': tft.scale_to_0_1(inputs['soot']),
  }

### Putting everything together

TensorFlow Transform uses **Apache Beam** in the background to perform scalable data transforms. In the function below we will use Beam's direct runner.

Arguments to provide to the runner:

    dict_features - Our dataset, converted into a list of Python dictionaries (one per row).
    data_metadata - The metadata (schema) that we created for the dataset.
    preprocessing_fn - The main preprocessing function. It is called to perform the preprocessing operations on each column.


The `|` (pipe) operator is a special syntax used in Apache Beam. It is used to stack operations and apply transforms to our data.

```
result = data_to_pass | where_to_pass_the_data
```

Let's break down our case:

**result**  -> `transformed_dataset, transform_fn`

**data_to_pass** -> `(dict_features, data_metadata)`

**where_to_pass_the_data** -> `tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)` 

```
transformed_dataset, transform_fn = ((dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

```

If you want to learn more about the syntax, we recommend this link: 
https://beam.apache.org/documentation/programming-guide/#applying-transforms

LINKS:
> more about Apache Beam: https://beam.apache.org/ 

In [23]:
def data_transform():
  """Analyze and transform `dict_features`, printing each raw/transformed pair.

  Applies `preprocessing_fn` to the module-level `dict_features` (described by
  `data_metadata`) through `tft_beam.AnalyzeAndTransformDataset`, then prints
  every raw row followed by its transformed counterpart. Returns nothing.
  """
  # TemporaryDirectory deletes the scratch files when the block exits; a bare
  # tempfile.mkdtemp() would leave a new directory behind on every call.
  with tempfile.TemporaryDirectory() as temp_dir:
    with tft_beam.Context(temp_dir=temp_dir):
      # The transform function (second element) is not needed here.
      transformed_dataset, _ = (
          (dict_features, data_metadata)
          | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

      # The transformed dataset is a (rows, metadata) pair; only rows are shown.
      transformed_data, _ = transformed_dataset

      for raw_row, transformed_row in zip(dict_features, transformed_data):
        print('Raw :', raw_row)
        print('Transformed :', transformed_row)

In [24]:
# Run the pipeline; prints each raw row followed by its transformed version.
data_transform()













INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/622d7297f9594ac9813bc2e1b4f45805/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/622d7297f9594ac9813bc2e1b4f45805/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


'Counter' object has no attribute 'name'


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/cbec00d97914407f824cfcf8d22b35dd/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/cbec00d97914407f824cfcf8d22b35dd/saved_model.pb










INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/2b319ee628a144b6b2ae6c3d0072a75d/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpkelahmvm/tftransform_tmp/2b319ee628a144b6b2ae6c3d0072a75d/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Raw : {'pm10': 98.67, 'no2': 14.1, 'so2': 44.38, 'soot': 34.81}
Transformed : {'no2_normalized': -18.577978, 'pm10_normalized': 0.34071696, 'so2_normalized': 28.855408, 'soot_normalized': 0.2834235}
Raw : {'pm10': 52.33, 'no2': 14.1, 'so2': 29.75, 'soot': 33.06}
Transformed : {'no2_normalized': -18.577978, 'pm10_normalized': 0.16963857, 'so2_normalized': 14.225407, 'soot_normalized': 0.26620758}
Raw : {'pm10': 74.67, 'no2': 20.5, 'so2': 36.25, 'soot': 39.25}
Transformed : {'no2_normalized': -12.1779785, 'pm10_normalized': 0.25211355, 'so2_normalized': 20.725407, 'soot_normalized': 0.32710278}
Raw : {'pm10': 72.0, 'no2': 17.3, 'so2': 46.44, 'soot': 34.38}
Transformed : {'no2_normalized': -15.377979, 'pm10_normalized': 0.24225645, 'so2_normalized': 30.915405, 'soot_normalized': 0.2791933}
Raw : {'pm10': 81.0, 'no2': 25.64, 'so2': 56.56, 'soot': 45.59}
Transformed : {'no2_normalized': -7.037979, 'pm10_normalized': 0.2754827, 'so2_normalized': 41.035408, 'soot_normalized': 0.38947365}
Raw 