In [None]:
# !pip install -U tfx
# !pip install apache-beam==2.39.0
# !pip install pandas-tfrecords

In [None]:
import apache_beam as beam
from google.protobuf import text_format
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import pandas_tfrecords as pdtfr
import pathlib
import pprint
import tempfile
import tensorflow as tf
from tensorflow_metadata.proto.v0 import schema_pb2
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow.python.lib.io import file_io
from tfx_bsl.public import tfxio
from tfx_bsl.coders.example_coder import RecordBatchToExamples

# Report the installed versions of the Beam / TF / TF Transform packages so a
# run can be matched to the environment it was produced in
print(
    f'Beam: {beam.__version__}',
    f'TF: {tf.__version__}',
    f'Transform: {tft.__version__}',
    sep='\n')

Beam: 2.39.0
TF: 2.9.1
Transform: 1.9.0


In [None]:
# Mount Google Drive at /content/drive; every project path below is rooted
# under this mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Google Drive, and the data directory below it.
# DATA_ROOT is also used as the tf.Transform temp directory in transform_data().
DATA_DIR = '/content/drive/My Drive/Stroke Prediction ML System'
DATA_ROOT = f'{DATA_DIR}/data'

# Directory of the reduced dataset, plus the training / testing / serving
# directories (these three sit directly under DATA_DIR, not under DATA_ROOT)
DATA_DIR_SELECT = f'{DATA_ROOT}/select'
TRAINING_ROOT = f'{DATA_DIR}/training'
TESTING_ROOT = f'{DATA_DIR}/testing'
SERVING_ROOT = f'{DATA_DIR}/serving'

# CSV file of each split.
# NOTE(review): the file names spell 'stoke' (not 'stroke'); presumably this
# matches the files on disk -- verify before "correcting" the spelling.
TRAINING_DATA = f'{TRAINING_ROOT}/stoke_prediction_training_dataset.csv'
TESTING_DATA = f'{TESTING_ROOT}/stoke_prediction_testing_dataset.csv'
SERVING_DATA = f'{SERVING_ROOT}/stoke_prediction_serving_dataset.csv'

# We will create two pipelines. One for schema generation and one for training.
SCHEMA_PIPELINE_NAME = 'stroke-mlops-schema'
PIPELINE_NAME = 'stroke-mlops'

# Output directory to store artifacts generated from each pipeline.
SCHEMA_PIPELINE_ROOT = os.path.join(DATA_DIR, 'pipeline', SCHEMA_PIPELINE_NAME)
PIPELINE_ROOT = os.path.join(DATA_DIR, 'pipeline', PIPELINE_NAME)

# Path to a SQLite DB file to use as an MLMD storage (one per pipeline).
SCHEMA_METADATA_PATH = os.path.join(DATA_DIR, 'metadata', SCHEMA_PIPELINE_NAME, 'metadata.db')
METADATA_PATH = os.path.join(DATA_DIR, 'metadata', PIPELINE_NAME, 'metadata.db')

# Output directory where created models from the pipeline will be exported.
# Built from the same components as SERVING_ROOT (DATA_DIR + 'serving').
SERVING_MODEL_DIR = os.path.join(DATA_DIR, 'serving')

# Folder holding the curated schema file (schema.pbtxt is read from here)
SCHEMA_FOLDER = os.path.join(DATA_DIR, 'schema/schema_output')

# File name prefixes of the transformed TFRecord outputs (a shard suffix such
# as '-00000-of-00001' is appended when written) and the directory that
# receives them together with the transform function
TRANSFORMED_TRAIN_DATA = 'train_transformed'
TRANSFORMED_TEST_DATA = 'test_transformed'
OUTPUT_DIR = f'{DATA_ROOT}/transformed'

# Set random seed
RANDOM_SEED = 0

In [None]:
# Numeric column scaled with tft.scale_by_min_max in preprocessing_fn
SCALE_MINMAX_FEATURE_KEYS = [
    'age',
]

# Numeric columns standardised with tft.scale_to_z_score in preprocessing_fn
SCALE_Z_FEATURE_KEYS = [
    'avg_glucose_level',
    'bmi',
]

# String columns replaced by integer vocabulary indices in preprocessing_fn
VOCAB_FEATURE_KEYS = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Every column of the CSV files, in file order (label included); this list is
# passed to the CSV reader as its column names
FEATURE_KEYS = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
    'stroke',
]

# Column holding the prediction target
LABEL_KEY = 'stroke'

In [None]:
def load_schema(input_path):
  """Read a text-format schema file into a Schema proto.

  Args:
    input_path: Path of the text-format (pbtxt) schema file.

  Returns:
    A schema_pb2.Schema message populated from the file contents.
  """
  pbtxt_contents = file_io.read_file_to_string(input_path)
  # text_format.Parse fills the message it is handed and returns that message
  return text_format.Parse(pbtxt_contents, schema_pb2.Schema())

# Curated schema used later to parse the raw CSV files
curated_schema = load_schema(f'{SCHEMA_FOLDER}/schema.pbtxt')

In [None]:
# Display the parsed schema proto to review the feature definitions it declares
curated_schema

feature {
  name: "Residence_type"
  type: BYTES
  domain: "Residence_type"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "age"
  type: FLOAT
  float_domain {
    name: "age"
    min: 0.0
    max: 100.0
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "avg_glucose_level"
  type: FLOAT
  float_domain {
    name: "avg_glucose_level"
    min: 25.0
    max: 300.0
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "bmi"
  type: FLOAT
  float_domain {
    name: "bmi"
    min: 0.0
    max: 200.0
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  skew_comparator {
    infinity_norm {
      threshold: 0.01
    }
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "ever_married"
  type: BYTES
  domain: "ever_married"
  presence {
    min_fraction: 1.0

In [None]:
def preprocessing_fn(inputs):
  """Preprocess raw input columns into transformed columns.

  Args:
    inputs: Dict mapping each raw column name to its tensor, as supplied by
      tf.Transform.

  Returns:
    Dict mapping column names to output tensors:
      * SCALE_MINMAX_FEATURE_KEYS are scaled with tft.scale_by_min_max,
      * SCALE_Z_FEATURE_KEYS are scaled with tft.scale_to_z_score,
      * VOCAB_FEATURE_KEYS are replaced by vocabulary indices,
      * the label and any remaining FEATURE_KEYS column are passed through
        unchanged.
  """

  features_dict = {}

  # Transform using scaling of min_max function
  for feature in SCALE_MINMAX_FEATURE_KEYS:
    features_dict[feature] = tft.scale_by_min_max(inputs[feature])

  # Transform using scaling to z score
  for feature in SCALE_Z_FEATURE_KEYS:
    features_dict[feature] = tft.scale_to_z_score(inputs[feature])

  # Transform using the vocabulary computed from the column itself
  for feature in VOCAB_FEATURE_KEYS:
    features_dict[feature] = tft.compute_and_apply_vocabulary(inputs[feature])

  # No change in the label
  features_dict[LABEL_KEY] = inputs[LABEL_KEY]

  # Columns that appear in FEATURE_KEYS but in none of the transform lists
  # ('hypertension' and 'heart_disease' with the current lists) were
  # previously dropped from the output without notice. Forward them unchanged
  # so that they remain available downstream.
  for feature in FEATURE_KEYS:
    if feature not in features_dict and feature in inputs:
      features_dict[feature] = inputs[feature]

  return features_dict

In [None]:
def _make_csv_tfxio(file_pattern):
  """Build a CsvTFXIO that reads `file_pattern` using the curated schema."""
  # All columns are listed in order through FEATURE_KEYS because the schema
  # doesn't specify the order of columns in the csv. The header line of the
  # file is skipped.
  return tfxio.CsvTFXIO(
      file_pattern=file_pattern,
      skip_header_lines=1,
      telemetry_descriptors=[],
      column_names=FEATURE_KEYS,
      schema=curated_schema)


def _write_transformed_examples(transformed_data, tag, file_path_prefix):
  """Encode transformed RecordBatches as Examples and write them as TFRecord."""
  # A single shard pins the output name to '<prefix>-00000-of-00001', which is
  # the file name get_dataset() reads back; otherwise the shard count is left
  # to the runner.
  return (
      transformed_data
      | f'Encode{tag}Data' >>
      beam.FlatMapTuple(lambda batch, _: RecordBatchToExamples(batch))
      | f'Write{tag}Data' >> beam.io.WriteToTFRecord(
          file_path_prefix, num_shards=1))


def transform_data(train_data_file, test_data_file, working_dir, temp_dir=None):
  """
  Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 values indices, by creating a vocabulary for each
  category. The transform is analyzed on the training data only and then
  applied unchanged to the test data.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
    temp_dir: Directory for tf.Transform temporary files. Defaults to
      DATA_ROOT, the location that was previously hard-coded.
  """

  if temp_dir is None:
    temp_dir = DATA_ROOT

  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=temp_dir):
      # Create a TFXIO to read the stroke training data with the schema
      train_csv_tfxio = _make_csv_tfxio(train_data_file)

      # Read in raw data and convert using CSV TFXIO
      raw_train_data = (
          pipeline
          | 'ReadTrainCsv' >> train_csv_tfxio.BeamSource())

      # Combining data and schema into a dataset tuple.  Note that we already
      # used the schema to read the CSV data, but we also need it to interpret
      # the raw data
      raw_train_dataset = (
          raw_train_data, train_csv_tfxio.TensorAdapterConfig())

      # The TFXIO output format is chosen for improved performance
      transformed_train_dataset, transform_fn = (
          raw_train_dataset | tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=True))

      # Transformed metadata is not necessary for encoding
      transformed_train_data, _ = transformed_train_dataset

      # Encode the transformed RecordBatches and write them to working_dir
      _ = _write_transformed_examples(
          transformed_train_data, 'Train',
          os.path.join(working_dir, TRANSFORMED_TRAIN_DATA))

      # Now apply the transform function fitted on the training data to the
      # test data, which is read the same way (header line skipped)
      test_csv_tfxio = _make_csv_tfxio(test_data_file)

      raw_test_data = (
          pipeline
          | 'ReadTestCsv' >> test_csv_tfxio.BeamSource())

      raw_test_dataset = (raw_test_data, test_csv_tfxio.TensorAdapterConfig())

      # The TFXIO output format is chosen for improved performance
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn)
          | tft_beam.TransformDataset(output_record_batches=True))

      # Transformed metadata is not necessary for encoding
      transformed_test_data, _ = transformed_test_dataset

      _ = _write_transformed_examples(
          transformed_test_data, 'Test',
          os.path.join(working_dir, TRANSFORMED_TEST_DATA))

      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))

In [None]:
# Run the Beam pipeline: writes the transformed train/test TFRecord files and
# the transform function into OUTPUT_DIR
transform_data(TRAINING_DATA, TESTING_DATA, OUTPUT_DIR)



Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.


INFO:tensorflow:Assets written to: /content/drive/My Drive/Stroke Prediction ML System/data/tftransform_tmp/35624ee710a84b31ac1cf908b41f2b6f/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Stroke Prediction ML System/data/tftransform_tmp/35624ee710a84b31ac1cf908b41f2b6f/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: /content/drive/My Drive/Stroke Prediction ML System/data/tftransform_tmp/d52fb9a7ed8144038335208fbdec8522/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Stroke Prediction ML System/data/tftransform_tmp/d52fb9a7ed8144038335208fbdec8522/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


In [None]:
# Wrap the transform artifacts written to OUTPUT_DIR and display the feature
# spec of the transformed data
tf_transform_output = tft.TFTransformOutput(OUTPUT_DIR)
tf_transform_output.transformed_feature_spec()

{'Residence_type': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
 'age': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'avg_glucose_level': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'bmi': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'ever_married': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
 'gender': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
 'smoking_status': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
 'stroke': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
 'work_type': FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None)}

In [None]:
def get_dataset(working_dir, filebase):
  """Load a transformed TFRecord file into pandas.

  Args:
    working_dir: Directory that holds the transformed TFRecord files.
    filebase: File name prefix of the dataset to load, e.g.
      TRANSFORMED_TRAIN_DATA.

  Returns:
    The result of pdtfr.tfrecords_to_pandas for that file (used as a pandas
    DataFrame by the caller).
  """
  # Only the file '<filebase>-00000-of-00001' is read, i.e. the output is
  # assumed to have been written as exactly one shard.
  data_path = os.path.join(working_dir, filebase + '-00000-of-00001')

  return pdtfr.tfrecords_to_pandas(file_paths=data_path)

# Load the transformed training split and preview its first rows
train_transformed_dataset = get_dataset(OUTPUT_DIR, TRANSFORMED_TRAIN_DATA)
train_transformed_dataset.head()

Unnamed: 0,Residence_type,age,avg_glucose_level,bmi,ever_married,gender,smoking_status,stroke,work_type
0,0.0,0.694824,2.859288,0.439707,0.0,0.0,0.0,0.0,0.0
1,0.0,0.353027,-0.931756,-0.095751,0.0,0.0,3.0,0.0,0.0
2,1.0,0.743652,-0.565362,-0.554715,0.0,0.0,3.0,0.0,0.0
3,1.0,0.018066,-0.252118,-1.11567,1.0,1.0,1.0,0.0,2.0
4,0.0,0.487305,0.2303,1.026161,0.0,1.0,0.0,0.0,3.0
