# House Price - Data Processing with TF.Transform

This notebook contains all the code required to create and export a data processing pipeline using Apache Beam and TF.Transform.

Wherever possible, the code in this notebook is written to support both running in the cloud on GCP and running locally.

## Set global parameters

In [1]:
# --- User-tunable settings: edit these before running the notebook ---

# Project id (change to your own project id).
PROJECT = 'gcp-playground'

# Bucket name (change to your own bucket).
BUCKET = 'gcs-cloudml'

# Region (placeholder value -- change to your own region).
REGION = 'region'

# Directory where the output is stored, either locally or on GCS.
ROOT_DIR = 'houseprice_tft'

# Runner selection: True -> DirectRunner, False -> DataflowRunner.
RUN_LOCAL = True

## Import required modules

In [2]:
import os

# Export the global parameters as environment variables in one call.
# Environment values must be strings, hence str() around the boolean flag.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
    'ROOT_DIR': ROOT_DIR,
    'RUN_LOCAL': str(RUN_LOCAL),
})

In [3]:
import os

import tensorflow as tf
import apache_beam as beam
import tensorflow_transform as tft

from tensorflow_transform.beam import impl
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

In [10]:
from tensorflow_transform.tf_metadata.schema_utils import schema_from_feature_spec as Schema

## Create raw data metadata

In [11]:
# Names of the input columns handled as categorical features.
CATEGORICAL_FEATURE_NAMES = [
    'POSTED_BY',
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_OR_RK',
    'READY_TO_MOVE',
    'RESALE',
    'ADDRESS',
]
# NOTE(review): single-element list; which feature the value 24 applies to
# is not visible in this part of the notebook -- confirm against its usage.
CATEGORICAL_FEATURE_MAX_VALUES = [24]

# Names of the input columns handled as numeric features.
NUMERIC_FEATURE_NAMES = [
    'BHK_NO.',
    'SQUARE_FT',
    'LONGITUDE',
    'LATITUDE',
]
# NOTE(review): presumably the bucket count used when bucketizing a feature;
# confirm against its usage further down the notebook.
BUCKET_FEATURE_BUCKET_COUNT = [100]

# Name of the label column.
TARGET_FEATURE_NAME = "TARGET(PRICE_IN_LACS)"

In [18]:
def create_raw_metadata():
    """Build the tf.Transform metadata describing the raw input data.

    Every column is declared as a scalar (shape ``[]``) fixed-length feature:

    * ``TARGET_FEATURE_NAME``              -> ``tf.float32``
    * each of ``CATEGORICAL_FEATURE_NAMES`` -> ``tf.string``
    * each of ``NUMERIC_FEATURE_NAMES``     -> ``tf.float32``

    The feature spec is converted to a schema with
    ``schema_utils.schema_from_feature_spec`` (imported in this notebook as
    ``Schema``) instead of the ``dataset_schema.ColumnSchema`` /
    ``dataset_schema.Schema`` helpers, which are deprecated in tf.Transform.

    Returns:
        dataset_metadata.DatasetMetadata: metadata wrapping the raw schema.
    """
    # Target first, then categorical, then numeric: the same insertion order
    # as before, so a name listed twice is resolved the same way.
    raw_feature_spec = {
        TARGET_FEATURE_NAME: tf.io.FixedLenFeature([], tf.float32),
    }
    raw_feature_spec.update({
        column_name: tf.io.FixedLenFeature([], tf.string)
        for column_name in CATEGORICAL_FEATURE_NAMES
    })
    raw_feature_spec.update({
        column_name: tf.io.FixedLenFeature([], tf.float32)
        for column_name in NUMERIC_FEATURE_NAMES
    })

    # Wrap the schema derived from the feature spec in a DatasetMetadata.
    return dataset_metadata.DatasetMetadata(Schema(raw_feature_spec))

In [19]:
# Sanity check: print the schema of the metadata returned by create_raw_metadata().
print(create_raw_metadata().schema)

feature {
  name: "ADDRESS"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "BHK_NO."
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "BHK_OR_RK"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "LATITUDE"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "LONGITUDE"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "POSTED_BY"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "READY_TO_MOVE"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "RERA"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "RESALE"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "SQUARE_FT"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "