In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
# tf.config.run_functions_eagerly(True)

from pyspark.sql import SparkSession


# Reuse the active Spark session for this notebook, or start one if none exists.
spark = (
    SparkSession.builder
    .appName("Cerebro Iris")
    .getOrCreate()
)

...

# Cerebro backend running on the Spark context above with a single worker.
backend = SparkBackend(
    spark_context=spark.sparkContext,
    num_workers=1,
)

# Local store rooted at the experiments directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
store = LocalStore(
    prefix_path='/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments'
)

from pyspark.ml.feature import OneHotEncoderEstimator

# Load the cleaned Iris dataset; the first row is the header and column types
# are inferred from the file contents.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = spark.read.csv(
    "/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/Iris_clean.csv",
    header=True,
    inferSchema=True,
)

# One-hot encode the class label. dropLast=False keeps one slot per category
# instead of dropping the last one.
encoder = OneHotEncoderEstimator(
    inputCols=["Species"],
    outputCols=["Species_OHE"],
    dropLast=False,
)

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)

feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns = ['Species_OHE']

# Initialize input DataFrames.
# NOTE: the sample_libsvm_data.txt link from the Cerebro quickstart
# (https://apache.googlesource.com/spark/+/master/data/mllib/sample_libsvm_data.txt)
# does not apply here; this notebook uses the Iris CSV loaded above.

# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same split and therefore comparable metrics.
train_df, test_df = encoded.randomSplit([0.8, 0.2], seed=42)

# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    """Build a SparkEstimator around a small Keras classifier.

    The network takes one scalar input per entry in ``feature_columns``,
    concatenates them, and applies two ReLU hidden layers (128 and 1024
    units) followed by a 3-unit softmax output.

    Args:
        params: Mapping of hyperparameter values; ``params['lr']`` and
            ``params['batch_size']`` are read.

    Returns:
        A SparkEstimator configured with the model, an Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    # One single-value input tensor per feature column.
    inputs = [tf.keras.Input(shape=(1,)) for _ in feature_columns]
    merged = tf.keras.layers.Concatenate()(inputs)
    hidden = tf.keras.layers.Dense(128, activation=tf.nn.relu)(merged)
    hidden = tf.keras.layers.Dense(1024, activation=tf.nn.relu)(hidden)
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(hidden)
    model = tf.keras.Model(inputs, output)

    # `lr` is a deprecated alias of `learning_rate`; use the supported name.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    loss = 'categorical_crossentropy'

    estimator = SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy'],
        # Coerce defensively: a sampled batch size may arrive as a float
        # such as 16.0 — TODO confirm against the search-space samplers.
        batch_size=int(params['batch_size']))

    return estimator

# Hyperparameter search space: a single candidate value per dimension.
search_space = dict(
    lr=hp_choice([0.01]),
    batch_size=hp_choice([16]),
)

# Instantiate the random-search model selection object: one model trained for
# 20 epochs, evaluated on 'accuracy'. validation=0.2 is presumably the
# fraction of the training data held out for validation — confirm.
model_selection = RandomSearch(
    backend=backend,
    store=store,
    estimator_gen_fn=estimator_gen_fn,
    search_space=search_space,
    num_models=1,
    num_epochs=20,
    validation=0.2,
    evaluation_metric='accuracy',
    feature_columns=feature_columns,
    label_columns=label_columns,
)

# Run model selection on the training split and keep the returned model object.
model = model_selection.fit(train_df)

# Training history of the best model.
model_history = model.get_history()

# Run inference on the test split, writing predictions to 'label_predicted',
# and show the true label next to the prediction for the first 10 rows.
predictor = model.set_output_columns(['label_predicted'])
output_df = predictor.transform(test_df)
output_df.select('Species', 'label_predicted').show(n=10)

# # Access all models.
# all_models = model.get_all_models()
# all_model_training_history = model.get_all_model_history()

# # Convert the best model to Keras and perform inference using numpy data.
# keras_model = model.keras()
# pred = keras_model.predict([np.ones([1, 692], dtype=np.float32)])
# # Save the keras checkpoint file.
# keras_model.save(ckpt_path)

# # Convert all the model to Keras.
# all_models_keras = [m.keras() for m in all_models]


21/11/22 06:45:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/22 06:45:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/11/22 06:45:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


CEREBRO => Time: 2021-11-22 06:45:46, Running 1 Workers
CEREBRO => Time: 2021-11-22 06:45:49, Preparing Data
CEREBRO => Time: 2021-11-22 06:45:50, Num Partitions: 1
CEREBRO => Time: 2021-11-22 06:45:50, Writing DataFrames
CEREBRO => Time: 2021-11-22 06:45:50, Train Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data
CEREBRO => Time: 2021-11-22 06:45:50, Val Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_val_data


                                                                                

CEREBRO => Time: 2021-11-22 06:45:51, Train Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-22 06:45:55, Val Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-22 06:45:58, Train Rows: 93
CEREBRO => Time: 2021-11-22 06:45:58, Val Rows: 19
CEREBRO => Time: 2021-11-22 06:45:58, Initializing Workers
CEREBRO => Time: 2021-11-22 06:45:58, Initializing Data Loaders
CEREBRO => Time: 2021-11-22 06:45:58, Launching Model Selection Workload


2021-11-22 06:45:58.920273: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:45:58.920444: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[Stage 10:>                                                         (0 + 1) / 1]2021-11-22 06:45:59.095196: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:45:59.181146: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow w

Train on 6 steps
Epoch 4/4
2021-11-22 06:46:24.213675: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_17 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_17 is not found


2021-11-22 06:46:24.214068: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_17 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_17 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-22 06:46:24, Model: model_0_1

2021-11-22 06:46:52.356789: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_38 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_38 is not found


2021-11-22 06:46:52.357190: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_38 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_38 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-22 06:46:52, Model: model_0_1637592359, Mode: VALID, Ini

Train on 6 steps
Epoch 11/11
2021-11-22 06:47:20.781459: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_59 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_59 is not found


2021-11-22 06:47:20.781842: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_59 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_59 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-22 06:47:20, Model: model_0

2021-11-22 06:47:48.876042: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_80 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_80 is not found


2021-11-22 06:47:48.876440: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_80 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_80 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-22 06:47:48, Model: model_0_1637592359, Mode: VALID, Ini

Train on 6 steps
Epoch 18/18
2021-11-22 06:48:17.618872: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_101 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_101 is not found


2021-11-22 06:48:17.619261: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_101 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_101 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-22 06:48:17, Model: mod

CEREBRO => Time: 2021-11-22 06:48:40, Terminating Workers


[Stage 11:>                                                         (0 + 1) / 1]2021-11-22 06:48:44.577813: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:48:44.578291: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-22 06:48:44.721075: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 12:>                                                         (0 + 1) / 1]2021-11-22 06:48:47.355123: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:48:47.355629: I tensorflow/core/platform/cpu_feature_gua

+-------+--------------------+
|Species|     label_predicted|
+-------+--------------------+
|      0|[0.99996840953826...|
|      0|[0.99999892711639...|
|      2|[0.00283555034548...|
|      0|[0.99995791912078...|
|      0|[0.99995791912078...|
|      0|[0.99995791912078...|
|      0|[0.99990284442901...|
|      0|[0.99999284744262...|
|      0|[0.99998354911804...|
|      0|[0.99995231628417...|
+-------+--------------------+
only showing top 10 rows



                                                                                

In [2]:
# Display the recorded metrics, keyed by model name (per-epoch loss/accuracy lists).
model.metrics

{'model_0_1637592359': {'train_loss': [3.5598298708597818,
   1.1557079950968425,
   0.8183964689572653,
   0.55611935009559,
   0.5155076410155743,
   0.48213007384523127,
   0.35561879443654715,
   0.4118161859223619,
   0.25939790980676963,
   0.38293483707820997,
   0.1640534773662997,
   0.2704889229886855,
   0.1097771948358665,
   0.28366075836432475,
   0.12651007315920046,
   0.44658533505329007,
   1.1701089578370254,
   0.3829073728993535,
   0.39633384958142415,
   0.41608361599598237],
  'train_accuracy': [0.15625,
   0.3333333432674408,
   0.5729166865348816,
   0.65625,
   0.65625,
   0.6770833134651184,
   0.8645833134651184,
   0.8020833134651184,
   0.9375,
   0.8125,
   0.9479166865348816,
   0.8958333134651184,
   0.9583333134651184,
   0.875,
   0.9375,
   0.84375,
   0.5729166865348816,
   0.6979166865348816,
   0.8229166865348816,
   0.8020833134651184],
  'val_loss': [1.0147390365600586,
   0.8646733462810516,
   0.522622138261795,
   0.5134254693984985,
   0.48

In [3]:
import numpy as np
# Collect the test split to the driver as NumPy arrays.
feature_matrix = np.array(test_df.select(feature_columns).collect())
label_array = np.array(test_df.select(label_columns).collect())

# The Keras model has one input per feature column, so split the feature
# matrix into a list of (n_rows, 1) column arrays.
x = [feature_matrix[:, i:i + 1] for i in range(feature_matrix.shape[1])]

# Remove the singleton axis introduced by the single label column.
y = label_array.squeeze(axis=1)

y

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [4]:
# Unwrap the Keras model held by the best model and evaluate it on the
# NumPy test arrays built above.
best_model = model.get_best_model()
keras_model = best_model.getModel()
keras_model.evaluate(x, y)



2021-11-22 06:48:47.947852: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


[0.25163865089416504, 1.0]

In [5]:
# Inspect the column metadata stored on the best model.
# NOTE(review): _get_metadata is underscore-prefixed (private API) — confirm whether a public accessor exists.
model.get_best_model()._get_metadata()

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

In [6]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
# tf.config.run_functions_eagerly(True)

from pyspark.sql import SparkSession


# Reuse the active Spark session for this notebook, or start one if none exists.
spark = (
    SparkSession.builder
    .appName("Cerebro Iris")
    .getOrCreate()
)

...

# Cerebro backend running on the Spark context above with a single worker.
backend = SparkBackend(
    spark_context=spark.sparkContext,
    num_workers=1,
)

# Local store rooted at the experiments directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
store = LocalStore(
    prefix_path='/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments'
)

from pyspark.ml.feature import OneHotEncoderEstimator

# Load the cleaned Iris dataset; the first row is the header and column types
# are inferred from the file contents.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = spark.read.csv(
    "/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/Iris_clean.csv",
    header=True,
    inferSchema=True,
)

# One-hot encode the class label. dropLast=False keeps one slot per category
# instead of dropping the last one.
encoder = OneHotEncoderEstimator(
    inputCols=["Species"],
    outputCols=["Species_OHE"],
    dropLast=False,
)

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)

feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns = ['Species_OHE']

# Initialize input DataFrames.
# NOTE: the sample_libsvm_data.txt link from the Cerebro quickstart
# (https://apache.googlesource.com/spark/+/master/data/mllib/sample_libsvm_data.txt)
# does not apply here; this notebook uses the Iris CSV loaded above.

# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same split and therefore comparable metrics.
train_df, test_df = encoded.randomSplit([0.8, 0.2], seed=42)

# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    """Build a SparkEstimator around a per-feature-embedding Keras classifier.

    Each entry in ``feature_columns`` gets its own scalar input and its own
    16-unit ReLU layer; the results are concatenated and fed to a 3-unit
    softmax output.

    Args:
        params: Mapping of hyperparameter values; ``params['lr']`` and
            ``params['batch_size']`` are read.

    Returns:
        A SparkEstimator configured with the model, an Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    # One single-value input tensor per feature column.
    inputs = [tf.keras.Input(shape=(1,)) for _ in feature_columns]
    embeddings = [
        tf.keras.layers.Dense(16, activation=tf.nn.relu)(tensor)
        for tensor in inputs
    ]
    combined = tf.keras.layers.Concatenate()(embeddings)
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(combined)
    model = tf.keras.Model(inputs, output)

    # `lr` is a deprecated alias of `learning_rate`; use the supported name.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    loss = 'categorical_crossentropy'

    estimator = SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy'],
        # Coerce defensively: a quantized-uniform sample may arrive as a
        # float such as 32.0 — TODO confirm against the search-space samplers.
        batch_size=int(params['batch_size']))

    return estimator

# Hyperparameter search space: three candidate learning rates and a batch
# size drawn via hp_quniform(16, 64, 16).
search_space = dict(
    lr=hp_choice([0.01, 0.001, 0.0001]),
    batch_size=hp_quniform(16, 64, 16),
)

# Instantiate the TPE (Tree of Parzen Estimators, a.k.a. HyperOpt) model
# selection object: one model trained for 10 epochs, evaluated on 'loss'.
# validation=0.25 is presumably the fraction of the training data held out
# for validation — confirm.
model_selection = TPESearch(
    backend=backend,
    store=store,
    estimator_gen_fn=estimator_gen_fn,
    search_space=search_space,
    num_models=1,
    num_epochs=10,
    validation=0.25,
    evaluation_metric='loss',
    feature_columns=feature_columns,
    label_columns=label_columns,
)

# Prepare the training data through the backend by hand (instead of calling
# fit) and keep only the third element of the result, the column metadata.
_, _, metadata, _ = model_selection.backend.prepare_data(
    model_selection.store,
    train_df,
    model_selection.validation,
    label_columns=model_selection.label_cols,
    feature_columns=model_selection.feature_cols,
)

# Start the workers, then set up data loaders for the feature and label columns.
model_selection.backend.initialize_workers()

input_columns = model_selection.feature_cols + model_selection.label_cols
model_selection.backend.initialize_data_loaders(model_selection.store, None, input_columns)

CEREBRO => Time: 2021-11-22 06:48:48, Running 1 Workers
CEREBRO => Time: 2021-11-22 06:48:48, Num Partitions: 1
CEREBRO => Time: 2021-11-22 06:48:48, Writing DataFrames
CEREBRO => Time: 2021-11-22 06:48:48, Train Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data
CEREBRO => Time: 2021-11-22 06:48:48, Val Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_val_data
CEREBRO => Time: 2021-11-22 06:48:49, Train Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-22 06:48:52, Val Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-22 06:48:54, Train Rows: 94
CEREBRO => Time: 2021-11-22 06:48:54, Val Rows: 23


In [7]:
# Display the column metadata returned by prepare_data.
metadata

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

In [8]:
from cerebro.backend.spark.util import _get_metadata

# Compute column metadata directly from the training DataFrame.
# NOTE(review): _get_metadata is underscore-prefixed (private API) — confirm whether a public accessor exists.
_get_metadata(train_df)

{'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species': {'spark_data_type': pyspark.sql.types.IntegerType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species_OHE': {'spark_data_type': pyspark.ml.linalg.SparseVector,
  'is_sparse_vector_only': True,
  'shape': 3,
  'intermediate_format': 'custom_spar

In [9]:
from petastorm import make_reader

from petastorm.tf_utils import make_petastorm_dataset

# Read back the intermediate training data from the store location and print
# every record as it appears to TensorFlow.
# NOTE(review): absolute, machine-specific path — consider deriving it from the store.
train_data_url = 'file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data'

with make_reader(train_data_url) as reader:
    dataset = make_petastorm_dataset(reader)
    for ele in dataset:
        print(ele)

petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.0>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.2>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.6>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=3.6>, Species_OHE=<tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)
petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.1>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.1>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.3>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=3.0>, Species_OHE=<tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)
petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.2>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.2>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=5.8>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.0>, Species_OHE=<tf.Tensor: sh

In [10]:
# Peek at the first 10 rows of the training split.
train_df.head(10)

[Row(SepalLengthCm=4.3, SepalWidthCm=3.0, PetalLengthCm=1.1, PetalWidthCm=0.1, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=2.9, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.0, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.5, SepalWidthCm=2.3, PetalLengthCm=1.3, PetalWidthCm=0.3, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.2, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.6, PetalLengthCm=1.0, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.7, SepalWidthCm=3.2, PetalLengthCm=1.6, PetalWidthCm=0.2, Species=0, 

[Stage 25:>                                                         (0 + 1) / 1]