In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from keras_tuner import HyperParameters
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import SparkSession

import autokeras as ak

from cerebro.backend import SparkBackend
from cerebro.nas.hphpmodel import HyperHyperModel
from cerebro.storage import LocalStore

In [2]:
# --- Configuration ----------------------------------------------------------
# Root of the local cerebro-system checkout. Set the CEREBRO_PROJECT_ROOT
# environment variable to run this notebook on another machine; the default
# is the location the notebook was originally run from.
PROJECT_ROOT = os.environ.get(
    "CEREBRO_PROJECT_ROOT",
    "/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system",
)
EXPERIMENTS_DIR = os.path.join(PROJECT_ROOT, "experiments")
IRIS_CSV_PATH = os.path.join(PROJECT_ROOT, "Iris_clean.csv")
SPLIT_SEED = 42  # fixed seed so the train/test split is repeatable across runs

# --- Spark session and Cerebro resources ------------------------------------
spark = (
    SparkSession.builder
    .appName("NAS Iris")
    .getOrCreate()
)
sc = spark.sparkContext

backend = SparkBackend(spark_context=sc, num_workers=1)
store = LocalStore(prefix_path=EXPERIMENTS_DIR)

# --- Load the data and one-hot encode the label -----------------------------
df = spark.read.csv(IRIS_CSV_PATH, header=True, inferSchema=True)

# dropLast=False keeps one slot per category, so the encoded label is a
# length-3 vector in the rows displayed below.
encoder = OneHotEncoderEstimator(dropLast=False)
encoder.setInputCols(["Species"])
encoder.setOutputCols(["Species_OHE"])

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)
# Seeded split: without a seed every kernel restart trains and evaluates on a
# different partition of the data, so results cannot be compared between runs.
train_df, test_df = encoded.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns = ['Species_OHE']

# --- Search space: one input + structured-data block per feature column -----
input_node = [ak.StructuredDataInput() for _ in feature_columns]
embeddings = [ak.StructuredDataBlock()(node) for node in input_node]
# `embeddings` is already a list of nodes; pass it directly instead of
# wrapping it in a second list.
output_node = ak.Merge()(embeddings)
output_node = ak.ClassificationHead()(output_node)
am = HyperHyperModel(
    inputs=input_node, outputs=output_node
)

# Attach the Spark backend, the local store and the column roles to the model.
am.resource_bind(
    backend=backend,
    store=store,
    feature_columns=feature_columns,
    label_columns=label_columns,
    evaluation_metric='accuracy',
)

# Peek at the first training rows to confirm the encoded label column exists.
train_df.head(10)

21/11/21 20:36:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


CEREBRO => Time: 2021-11-21 20:36:21, Running 1 Workers


[Row(SepalLengthCm=4.3, SepalWidthCm=3.0, PetalLengthCm=1.1, PetalWidthCm=0.1, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=2.9, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.0, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.5, SepalWidthCm=2.3, PetalLengthCm=1.3, PetalWidthCm=0.3, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.1, PetalLengthCm=1.5, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.4, PetalLengthCm=1.4, PetalWidthCm=0.3, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.7, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, 

In [3]:
# Tuner settings, collected in one place so they are easy to tweak. Trials are
# ranked by validation loss and at most five trials are run (the recorded
# output below ends at "Trial 5 Complete").
tuner_settings = {
    "tuner": "randomsearch",
    "hyperparameters": None,
    "objective": "val_loss",
    "max_trials": 5,
    "overwrite": True,
}
am.tuner_bind(**tuner_settings)

# Run the search on the training split, five epochs per fit call; `rel` is the
# result object that the following cells query for history and models.
rel = am.fit(train_df, epochs=5)

Trial 5 Complete [00h 01m 11s]
val_loss: 0.6208969354629517

Best val_loss So Far: 0.6208969354629517
Total elapsed time: 00h 06m 01s


In [4]:
# Inspect best model training history.
# In the recorded run the returned object is a dict holding the keras_tuner
# Trial plus per-epoch train/val loss and accuracy lists (shown in the next cell).
# NOTE(review): the tuner reported a best val_loss of ~0.62, but every val_loss
# in the recorded history is >= 1.09 -- confirm which trial get_history() returns.
model_history = rel.get_history()

In [5]:
# Display the history object fetched from `rel.get_history()`.
model_history

{'trial': <keras_tuner.engine.trial.Trial at 0x179248ed0>,
 'train_loss': [1.1499926249186199,
  1.0678984920183818,
  1.091020663579305,
  1.0541109045346577,
  1.2112325032552083],
 'train_accuracy': [0.375,
  0.4791666567325592,
  0.46875,
  0.4583333432674408,
  0.3958333432674408],
 'val_loss': [1.151548147201538,
  1.1615432500839233,
  1.1486754417419434,
  1.1191351413726807,
  1.0912346839904785],
 'val_accuracy': [0.25, 0.1875, 0.15625, 0.15625, 0.15625]}

[Stage 11:>                                                         (0 + 1) / 1]

In [9]:
# Score the held-out split; predictions are written to 'label_predicted'.
predictor = rel.set_output_columns(['label_predicted'])
output_df = predictor.transform(test_df)

# Show the encoded label next to the prediction for the first ten rows.
# NOTE(review): in the recorded output 'label_predicted' is a single float per
# row, and those floats equal the first column of best_model.predict(x) shown
# further down -- it looks like only the first class probability is kept.
# Confirm how the output column shape is derived.
output_df.select('Species_OHE', 'label_predicted').show(n=10)

[Stage 11:>                 (0 + 1) / 1][Stage 16:>                 (0 + 1) / 1]2021-11-21 20:51:07.755889: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 20:51:07.756400: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-21 20:51:08.092117: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 11:>                 (0 + 1) / 1][Stage 17:>                 (0 + 1) / 1]2021-11-21 20:51:11.861301: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-21 20:51:11.861840: I tensorflow/core/platform/cpu_feature_gua

+-------------+-------------------+
|  Species_OHE|    label_predicted|
+-------------+-------------------+
|(3,[0],[1.0])|0.25515392422676086|
|(3,[0],[1.0])|0.25539514422416687|
|(3,[0],[1.0])|0.25300097465515137|
|(3,[0],[1.0])| 0.2515498101711273|
|(3,[2],[1.0])|0.23178207874298096|
|(3,[0],[1.0])|0.24540391564369202|
|(3,[0],[1.0])|0.24609287083148956|
|(3,[0],[1.0])|0.24646230041980743|
|(3,[0],[1.0])|0.23969466984272003|
|(3,[1],[1.0])|0.22828829288482666|
+-------------+-------------------+
only showing top 10 rows



2021-11-21 20:51:12.211978: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 11:>                                                         (0 + 1) / 1]

In [12]:
# Fetch the best-model wrapper from the search result, then unwrap the model
# object it carries (the later cells call .summary() and .predict() on it).
best_model_wrapper = rel.get_best_model()
best_model = best_model_wrapper.getModel()

In [14]:
# Print the layer-by-layer summary of the unwrapped best model.
best_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
multi_category_encoding_2 (Mult (None, 1)            0           input_3[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 256)          512         multi_category_encoding_2[0][0]  
______________________________________________________________________________________________

[Stage 11:>                                                         (0 + 1) / 1]

In [16]:
# Pull the test split onto the driver as numpy arrays, using the feature and
# label column lists that the tuner's model selection object holds.
selection = am.tuner.model_selection
feature_matrix = np.array(test_df.select(selection.feature_cols).collect())
label_tensor = np.array(test_df.select(selection.label_cols).collect())

# Split the feature matrix into one single-column array per feature; the list
# is passed to best_model.predict in a later cell.
x = [feature_matrix[:, col:col + 1] for col in range(feature_matrix.shape[1])]
# Drop the length-1 axis 1 so each row is the label vector itself
# (the recorded output shows one 3-element row per example).
y = label_tensor.squeeze(axis=1)

y

                                                                                

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

[Stage 11:>                                                         (0 + 1) / 1]

In [17]:
# Run the unwrapped model directly on the per-feature arrays built above.
# In the recorded output each row has three values; the first column matches
# the 'label_predicted' values produced by the Spark transform earlier.
best_model.predict(x)

array([[0.25515392, 0.40035924, 0.3444868 ],
       [0.25539514, 0.40188965, 0.34271517],
       [0.25300097, 0.40175608, 0.34524292],
       [0.25154984, 0.4032627 , 0.34518752],
       [0.23178208, 0.41577715, 0.3524408 ],
       [0.2454039 , 0.407093  , 0.34750307],
       [0.24609289, 0.40734434, 0.34656283],
       [0.24646229, 0.40723473, 0.346303  ],
       [0.23969468, 0.41001257, 0.35029277],
       [0.22828826, 0.4212397 , 0.350472  ],
       [0.24255905, 0.409957  , 0.34748393],
       [0.24561352, 0.41063094, 0.3437555 ],
       [0.21964864, 0.43475255, 0.34559873],
       [0.21666595, 0.42030928, 0.3630248 ],
       [0.21594015, 0.42867857, 0.35538125],
       [0.20978123, 0.43204698, 0.35817182],
       [0.21055676, 0.4282944 , 0.3611488 ],
       [0.21078843, 0.4452729 , 0.34393877],
       [0.20773639, 0.43979594, 0.35246766],
       [0.20402719, 0.4345966 , 0.36137617],
       [0.20338775, 0.4247656 , 0.37184665],
       [0.19934897, 0.44698316, 0.3536678 ],
       [0.

[Stage 11:>                                                         (0 + 1) / 1]

In [18]:
# Keep the best-model wrapper itself (not the unwrapped model) so its stored
# metadata can be inspected in the next cell.
spark_best_model = rel.get_best_model()

In [19]:
# Inspect the column metadata stored on the best-model wrapper.
# NOTE(review): this calls an underscore-prefixed accessor -- debugging use only.
# NOTE(review): in the recorded output 'Species_OHE' is reported as BinaryType
# with shape None, whereas _get_metadata(test_df) in a later cell reports a
# SparseVector of shape 3. This mismatch may be related to the scalar
# 'label_predicted' column -- TODO confirm.
spark_best_model._get_metadata()

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

[Stage 11:>                                                         (0 + 1) / 1]

In [22]:
# List the schema fields of the encoded DataFrame; the recorded output shows
# four DoubleType features, the IntegerType 'Species' and the VectorUDT
# 'Species_OHE' column.
encoded.schema.fields

[StructField(SepalLengthCm,DoubleType,true),
 StructField(SepalWidthCm,DoubleType,true),
 StructField(PetalLengthCm,DoubleType,true),
 StructField(PetalWidthCm,DoubleType,true),
 StructField(Species,IntegerType,true),
 StructField(Species_OHE,VectorUDT,true)]

[Stage 11:>                                                         (0 + 1) / 1]

In [23]:
from cerebro.backend.spark.util import _get_metadata

In [26]:
# Compute column metadata directly from the test split with the cerebro helper,
# for comparison with the metadata stored on the best-model wrapper.
# In the recorded output 'Species_OHE' is a SparseVector of shape 3 here.
_get_metadata(test_df)

                                                                                

{'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species': {'spark_data_type': pyspark.sql.types.IntegerType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species_OHE': {'spark_data_type': pyspark.ml.linalg.SparseVector,
  'is_sparse_vector_only': True,
  'shape': 3,
  'intermediate_format': 'custom_spar

[Stage 11:>                                                         (0 + 1) / 1]