In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
# tf.config.run_functions_eagerly(True)

from pyspark.sql import SparkSession


# --- Configuration -----------------------------------------------------------
# Root of the local cerebro-system checkout. The experiment store and the Iris
# CSV are both resolved relative to it, so this is the only path to edit when
# running the notebook on another machine.
PROJECT_DIR = '/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system'
# Fixed seed so the train/test split is the same on every Restart & Run All.
SPLIT_SEED = 42

spark = SparkSession \
    .builder \
    .appName("Cerebro Iris") \
    .getOrCreate()

# Cerebro backend with a single worker on the Spark context, plus a local
# store for intermediate data and model artifacts.
backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1)
store = LocalStore(prefix_path=PROJECT_DIR + '/experiments')

try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

df = spark.read.csv(PROJECT_DIR + "/Iris_clean.csv", header=True, inferSchema=True)

# One-hot encode the integer `Species` column into `Species_OHE`.
# dropLast=False keeps a vector slot for every category instead of dropping
# the last one.
encoder = OneHotEncoderEstimator(
    inputCols=["Species"],
    outputCols=["Species_OHE"],
    dropLast=False,
)

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)

feature_columns=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns=['Species_OHE']

# Initialize input DataFrames: 80/20 train/test split of the encoded Iris data.
train_df, test_df = encoded.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    """Build a SparkEstimator for one hyperparameter configuration.

    The Keras model has one scalar input per entry of the module-level
    ``feature_columns`` list. Each input goes through its own
    Dense(16) -> Dense(32) ReLU stack; the branches are concatenated and fed
    to a 3-unit softmax output layer.

    Args:
        params: Dictionary of hyperparameter values. ``params['lr']`` is used
            as the Adam learning rate and ``params['batch_size']`` as the
            estimator batch size.

    Returns:
        A SparkEstimator wrapping the model, an Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    feature_inputs = [tf.keras.Input(shape=(1,)) for _ in feature_columns]
    hidden_16 = [
        tf.keras.layers.Dense(16, activation=tf.nn.relu)(tensor)
        for tensor in feature_inputs
    ]
    hidden_32 = [
        tf.keras.layers.Dense(32, activation=tf.nn.relu)(tensor)
        for tensor in hidden_16
    ]
    combined = tf.keras.layers.Concatenate()(hidden_32)
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(combined)
    model = tf.keras.Model(feature_inputs, output)

    # `lr` is only a deprecated alias; `learning_rate` is the documented keyword.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])

    # NOTE(review): quantized-uniform samplers commonly return floats (e.g. 32.0);
    # confirm for cerebro.tune.hp_quniform. int() is a no-op when the value is
    # already an integer.
    batch_size = int(params['batch_size'])

    return SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        batch_size=batch_size)

# Define dictionary containing the parameter search space.
# Hyperparameter search space: the learning rate is chosen from three fixed
# values, the batch size is sampled with hp_quniform(16, 64, 16).
search_space = dict(
    lr=hp_choice([0.01, 0.001, 0.0001]),
    batch_size=hp_quniform(16, 64, 16),
)

# TPE (Tree-structured Parzen Estimator, a.k.a. HyperOpt) model selection object.
model_selection = TPESearch(
    backend=backend, store=store,
    estimator_gen_fn=estimator_gen_fn, search_space=search_space,
    num_models=1, num_epochs=10,
    validation=0.25, evaluation_metric='loss',
    feature_columns=feature_columns, label_columns=label_columns,
)

# Run model selection on the training split; the result exposes the best model.
model = model_selection.fit(train_df)

# Training history of the best model, kept for inspection.
model_history = model.get_history()

# Inference with the best model on the held-out Spark DataFrame; predictions
# are written to the `label_predicted` column.
output_df = (
    model
    .set_output_columns(['label_predicted'])
    .transform(test_df)
)
output_df.select('Species', 'label_predicted').show(n=10)

# # Access all models.
# all_models = model.get_all_models()
# all_model_training_history = model.get_all_model_history()

# # Convert the best model to Keras and perform inference using numpy data.
# keras_model = model.keras()
# pred = keras_model.predict([np.ones([1, 692], dtype=np.float32)])
# # Save the keras checkpoint file.
# keras_model.save(ckpt_path)

# # Convert all the model to Keras.
# all_models_keras = [m.keras() for m in all_models]


21/11/21 23:06:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/21 23:06:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/11/21 23:06:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


CEREBRO => Time: 2021-11-21 23:07:00, Running 1 Workers
CEREBRO => Time: 2021-11-21 23:07:03, Preparing Data
CEREBRO => Time: 2021-11-21 23:07:03, Num Partitions: 1
CEREBRO => Time: 2021-11-21 23:07:03, Writing DataFrames
CEREBRO => Time: 2021-11-21 23:07:03, Train Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data
CEREBRO => Time: 2021-11-21 23:07:03, Val Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_val_data


                                                                                

CEREBRO => Time: 2021-11-21 23:07:05, Train Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-21 23:07:08, Val Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-21 23:07:11, Train Rows: 89
CEREBRO => Time: 2021-11-21 23:07:11, Val Rows: 27
CEREBRO => Time: 2021-11-21 23:07:11, Initializing Workers
CEREBRO => Time: 2021-11-21 23:07:11, Initializing Data Loaders
CEREBRO => Time: 2021-11-21 23:07:11, Launching Model Selection Workload


2021-11-21 23:07:11.653309: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 23:07:11.653492: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[Stage 10:>                                                         (0 + 1) / 1]2021-11-21 23:07:11.862584: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 23:07:12.051972: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow w

Train on 3 steps
Epoch 4/4
2021-11-21 23:07:37.438778: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_17 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_17 is not found


2021-11-21 23:07:37.439172: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_17 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_17 is not found


	 [[{{node PyFunc}}]]
CEREBRO => Time: 2021-11-21 23:07:37, Model: model_0_1

CEREBRO => Time: 2021-11-21 23:08:01, Model: model_0_1637564831, Mode: TRAIN, Initialization Time: 0.8204758167266846, Training Time: 0.32283592224121094, Finalization Time: 0.19846820831298828
2021-11-21 23:08:05.381015: W tensorflow/core/framework/op_kernel.cc:1751] Invalid argument: ValueError: callback pyfunc_38 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_38 is not found


2021-11-21 23:08:05.381408: W tensorflow/core/kernels/data/generator_dataset_op.cc:107] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_38 is not found
Traceback (most recent call last):

  File "/Users/zijian/.pyenv/versions/nocerebro/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 238, in __call__
    raise Val

CEREBRO => Time: 2021-11-21 23:08:32, Terminating Workers


[Stage 11:>                                                         (0 + 1) / 1]2021-11-21 23:08:35.735822: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 23:08:35.736363: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-21 23:08:35.913791: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 12:>                                                         (0 + 1) / 1]2021-11-21 23:08:38.619449: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 23:08:38.619853: I tensorflow/core/platform/cpu_feature_gua

+-------+-------------------+
|Species|    label_predicted|
+-------+-------------------+
|      0|0.30131030082702637|
|      0|0.29584527015686035|
|      0| 0.3063153028488159|
|      0| 0.2972525656223297|
|      1|0.26656144857406616|
|      1| 0.2630133628845215|
|      0| 0.2939305007457733|
|      1|0.26326262950897217|
|      0|   0.29635089635849|
|      1|0.26233184337615967|
+-------+-------------------+
only showing top 10 rows



                                                                                

In [4]:
# Debug: show the column metadata attached to the best model. `_get_metadata`
# is an underscore-prefixed (private) accessor, so it may change between
# Cerebro versions.
model.get_best_model()._get_metadata()

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

In [1]:
from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator

# Data storage for intermediate data and model artifacts.
from cerebro.storage import LocalStore, HDFSStore

# Model selection/AutoML methods.
from cerebro.tune import GridSearch, RandomSearch, TPESearch

# Utility functions for specifying the search space.
from cerebro.tune import hp_choice, hp_uniform, hp_quniform, hp_loguniform, hp_qloguniform

import tensorflow as tf
# tf.config.run_functions_eagerly(True)

from pyspark.sql import SparkSession


# --- Configuration -----------------------------------------------------------
# Root of the local cerebro-system checkout. The experiment store and the Iris
# CSV are both resolved relative to it, so this is the only path to edit when
# running the notebook on another machine.
PROJECT_DIR = '/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system'
# Fixed seed so the train/test split is the same on every Restart & Run All.
SPLIT_SEED = 42

spark = SparkSession \
    .builder \
    .appName("Cerebro Iris") \
    .getOrCreate()

# Cerebro backend with a single worker on the Spark context, plus a local
# store for intermediate data and model artifacts.
backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1)
store = LocalStore(prefix_path=PROJECT_DIR + '/experiments')

try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

df = spark.read.csv(PROJECT_DIR + "/Iris_clean.csv", header=True, inferSchema=True)

# One-hot encode the integer `Species` column into `Species_OHE`.
# dropLast=False keeps a vector slot for every category instead of dropping
# the last one.
encoder = OneHotEncoderEstimator(
    inputCols=["Species"],
    outputCols=["Species_OHE"],
    dropLast=False,
)

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)

feature_columns=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns=['Species_OHE']

# Initialize input DataFrames: 80/20 train/test split of the encoded Iris data.
train_df, test_df = encoded.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    """Build a SparkEstimator for one hyperparameter configuration.

    The Keras model has one scalar input per entry of the module-level
    ``feature_columns`` list. Each input goes through its own Dense(16) ReLU
    layer; the branches are concatenated and fed to a 3-unit softmax output
    layer.

    Args:
        params: Dictionary of hyperparameter values. ``params['lr']`` is used
            as the Adam learning rate and ``params['batch_size']`` as the
            estimator batch size.

    Returns:
        A SparkEstimator wrapping the model, an Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    feature_inputs = [tf.keras.Input(shape=(1,)) for _ in feature_columns]
    hidden_16 = [
        tf.keras.layers.Dense(16, activation=tf.nn.relu)(tensor)
        for tensor in feature_inputs
    ]
    combined = tf.keras.layers.Concatenate()(hidden_16)
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(combined)
    model = tf.keras.Model(feature_inputs, output)

    # `lr` is only a deprecated alias; `learning_rate` is the documented keyword.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])

    # NOTE(review): quantized-uniform samplers commonly return floats (e.g. 32.0);
    # confirm for cerebro.tune.hp_quniform. int() is a no-op when the value is
    # already an integer.
    batch_size = int(params['batch_size'])

    return SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        batch_size=batch_size)

# Define dictionary containing the parameter search space.
# Hyperparameter search space: the learning rate is chosen from three fixed
# values, the batch size is sampled with hp_quniform(16, 64, 16).
search_space = dict(
    lr=hp_choice([0.01, 0.001, 0.0001]),
    batch_size=hp_quniform(16, 64, 16),
)

# TPE (Tree-structured Parzen Estimator, a.k.a. HyperOpt) model selection object.
model_selection = TPESearch(
    backend=backend, store=store,
    estimator_gen_fn=estimator_gen_fn, search_space=search_space,
    num_models=1, num_epochs=10,
    validation=0.25, evaluation_metric='loss',
    feature_columns=feature_columns, label_columns=label_columns,
)

# Drive the backend by hand instead of calling fit(), so the metadata returned
# by prepare_data can be inspected. Only the third element of the returned
# 4-tuple is kept.
_, _, metadata, _ = model_selection.backend.prepare_data(
    model_selection.store,
    train_df,
    model_selection.validation,
    label_columns=model_selection.label_cols,
    feature_columns=model_selection.feature_cols,
)

model_selection.backend.initialize_workers()

# Data loaders are set up for the feature columns followed by the label columns.
model_selection.backend.initialize_data_loaders(
    model_selection.store,
    None,
    model_selection.feature_cols + model_selection.label_cols,
)

21/11/21 23:29:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/21 23:29:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/11/21 23:29:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


CEREBRO => Time: 2021-11-21 23:29:21, Running 1 Workers
CEREBRO => Time: 2021-11-21 23:29:24, Num Partitions: 1
CEREBRO => Time: 2021-11-21 23:29:24, Writing DataFrames
CEREBRO => Time: 2021-11-21 23:29:24, Train Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data
CEREBRO => Time: 2021-11-21 23:29:24, Val Data Path: file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_val_data


[Stage 4:>                                                          (0 + 1) / 1]                                                                                

{'PetalLengthCm': {'spark_data_type': <class 'pyspark.sql.types.DoubleType'>, 'is_sparse_vector_only': False, 'shape': 1, 'intermediate_format': 'nochange', 'max_size': 1}, 'PetalWidthCm': {'spark_data_type': <class 'pyspark.sql.types.DoubleType'>, 'is_sparse_vector_only': False, 'shape': 1, 'intermediate_format': 'nochange', 'max_size': 1}, 'SepalLengthCm': {'spark_data_type': <class 'pyspark.sql.types.DoubleType'>, 'is_sparse_vector_only': False, 'shape': 1, 'intermediate_format': 'nochange', 'max_size': 1}, 'SepalWidthCm': {'spark_data_type': <class 'pyspark.sql.types.DoubleType'>, 'is_sparse_vector_only': False, 'shape': 1, 'intermediate_format': 'nochange', 'max_size': 1}, 'Species_OHE': {'spark_data_type': <class 'pyspark.sql.types.ArrayType'>, 'is_sparse_vector_only': False, 'shape': 3, 'intermediate_format': 'nochange', 'max_size': 3}}
CEREBRO => Time: 2021-11-21 23:29:26, Train Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-21 23:29:29, Val Partitions: 1


                                                                                

CEREBRO => Time: 2021-11-21 23:29:32, Train Rows: 96
CEREBRO => Time: 2021-11-21 23:29:32, Val Rows: 30


[Stage 10:>                                                         (0 + 1) / 1]

In [7]:
# Display the metadata dict captured from backend.prepare_data(...).
metadata

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

[Stage 23:>                                                         (0 + 1) / 1]

In [8]:
# Debug: compute column metadata straight from the Spark DataFrame, for
# comparison with the `metadata` returned by prepare_data.
# `_get_metadata` is a private helper (leading underscore), so it may change
# between Cerebro versions.
from cerebro.backend.spark.util import _get_metadata

_get_metadata(train_df)

                                                                                

{'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species': {'spark_data_type': pyspark.sql.types.IntegerType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species_OHE': {'spark_data_type': pyspark.ml.linalg.SparseVector,
  'is_sparse_vector_only': True,
  'shape': 3,
  'intermediate_format': 'custom_spar

[Stage 23:>                                                         (0 + 1) / 1]

In [2]:
from petastorm import make_reader

from petastorm.tf_utils import make_petastorm_dataset

# Location of the intermediate training data (same path as the "Train Data Path"
# reported in the Cerebro log).
TRAIN_DATA_URL = 'file:///Users/zijian/Desktop/ucsd/cse234/project/cerebro-system/experiments/intermediate_train_data'
# A handful of rows is enough to check field names, dtypes and shapes; printing
# the whole dataset floods the cell output.
PREVIEW_ROWS = 5

with make_reader(TRAIN_DATA_URL) as reader:
    dataset = make_petastorm_dataset(reader)
    # take() limits iteration to the first PREVIEW_ROWS elements.
    for ele in dataset.take(PREVIEW_ROWS):
        print(ele)

2021-11-21 23:02:51.155902: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 23:02:51.156214: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[Stage 10:>                                                         (0 + 1) / 1]

petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.1>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.1>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.3>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=3.0>, Species_OHE=<tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)
petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.2>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.2>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=5.8>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.0>, Species_OHE=<tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)
petastorm_schema_view(PetalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=1.3>, PetalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=0.2>, SepalLengthCm=<tf.Tensor: shape=(), dtype=float64, numpy=4.4>, SepalWidthCm=<tf.Tensor: shape=(), dtype=float64, numpy=3.0>, Species_OHE=<tf.Tensor: sh

2021-11-21 23:02:51.604769: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 10:>                                                         (0 + 1) / 1]

In [7]:
# Peek at the first 10 rows of the training split (returned as a list of Row objects).
train_df.head(10)

                                                                                

[Row(SepalLengthCm=4.4, SepalWidthCm=2.9, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.0, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.5, SepalWidthCm=2.3, PetalLengthCm=1.3, PetalWidthCm=0.3, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.1, PetalLengthCm=1.5, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.2, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.4, PetalLengthCm=1.4, PetalWidthCm=0.3, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.6, PetalLengthCm=1.0, PetalWidthCm=0.2, Species=0, 

[Stage 13:>                                                         (0 + 1) / 1]