In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from keras_tuner import HyperParameters

import autokeras as ak

from cerebro.nas.hphpmodel import HyperHyperModel

In [2]:
# All imports this cell needs, grouped up front instead of interleaved with
# the statements that use them.
import os

from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import SparkSession

from cerebro.backend import SparkBackend
from cerebro.storage import LocalStore

# --- Configuration -----------------------------------------------------------
# Directory that holds the Iris CSV and the Cerebro experiment store. The
# default is the original author's checkout, so behaviour is unchanged; set
# CEREBRO_PROJECT_DIR to run the notebook elsewhere without editing code.
PROJECT_DIR = os.environ.get(
    "CEREBRO_PROJECT_DIR",
    "/Users/zijian/Desktop/ucsd/cse234/project/cerebro-system",
)
STORE_PREFIX = os.path.join(PROJECT_DIR, "experiments")
IRIS_CSV = os.path.join(PROJECT_DIR, "Iris_clean.csv")
SEED = 100        # used for both the train/test split and the HyperHyperModel
NUM_WORKERS = 1   # number of Cerebro workers requested from the Spark backend

# --- Spark session and Cerebro runtime ---------------------------------------
spark = (
    SparkSession.builder
    .appName("NAS Iris")
    .getOrCreate()
)
sc = spark.sparkContext

backend = SparkBackend(spark_context=sc, num_workers=NUM_WORKERS)
store = LocalStore(prefix_path=STORE_PREFIX)

# --- Data: load, one-hot encode the label, split ------------------------------
df = spark.read.csv(IRIS_CSV, header=True, inferSchema=True)

# Species is read as an integer column; dropLast=False keeps one vector slot
# per category, so the encoded label has as many entries as there are species.
# NOTE(review): OneHotEncoderEstimator exists only in Spark 2.3-2.4; Spark 3.x
# renamed it to OneHotEncoder -- confirm the pinned pyspark version.
encoder = OneHotEncoderEstimator(dropLast=False)
encoder.setInputCols(["Species"])
encoder.setOutputCols(["Species_OHE"])

encoder_model = encoder.fit(df)
encoded = encoder_model.transform(df)
train_df, test_df = encoded.randomSplit([0.8, 0.2], SEED)

feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
label_columns = ['Species_OHE']

# --- Search space: one input per feature -> Merge -> DenseBlock -> head -------
input_node = [ak.StructuredDataInput() for _ in feature_columns]
concat = ak.Merge()(input_node)
output_node = ak.DenseBlock()(concat)
output_node = ak.ClassificationHead()(output_node)
am = HyperHyperModel(
    inputs=input_node, outputs=output_node, seed=SEED,
)

# Attach the Spark backend, the store and the column roles to the search model.
am.resource_bind(
    backend=backend,
    store=store,
    feature_columns=feature_columns,
    label_columns=label_columns,
    evaluation_metric='accuracy',
)

# Preview a few training rows, including the encoded label column.
train_df.head(10)

21/11/22 06:05:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


CEREBRO => Time: 2021-11-22 06:05:58, Running 1 Workers


[Row(SepalLengthCm=4.3, SepalWidthCm=3.0, PetalLengthCm=1.1, PetalWidthCm=0.1, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=2.9, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.0, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.4, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.1, PetalLengthCm=1.5, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.6, SepalWidthCm=3.2, PetalLengthCm=1.4, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.7, SepalWidthCm=3.2, PetalLengthCm=1.6, PetalWidthCm=0.2, Species=0, Species_OHE=SparseVector(3, {0: 1.0})),
 Row(SepalLengthCm=4.8, SepalWidthCm=3.0, PetalLengthCm=1.4, PetalWidthCm=0.1, Species=0, 

In [3]:
# Tuner configuration: random search, 20 trials, ranked by validation loss,
# overwriting any earlier results.
#
# hyperparameters is left as None here. To pin specific values instead, build
# a keras_tuner HyperParameters object and pass it in, e.g.:
#     cuz_hp = HyperParameters()
#     cuz_hp.Fixed('learning_rate', value=0.01)
#     cuz_hp.Fixed('optimizer', value='adam')
tuner_settings = dict(
    tuner="randomsearch",
    hyperparameters=None,
    objective="val_loss",
    max_trials=20,
    overwrite=True,
)
am.tuner_bind(**tuner_settings)

In [4]:
# Run the architecture search on the training split, 10 epochs per trial, and
# keep the returned result object for the inspection cells that follow.
# The recorded run of this cell reports a total elapsed time of 27m 33s for 20 trials.
rel = am.fit(train_df, epochs=10)

Trial 20 Complete [00h 01m 21s]
val_loss: 0.8462984561920166

Best val_loss So Far: 0.36613646149635315
Total elapsed time: 00h 27m 33s


In [5]:
# Fetch the training history recorded on the fitted search result.
# NOTE(review): the original comment called this the *best* model's history,
# but the val_loss values it displays (about 1.2-1.4) are well above the
# "Best val_loss So Far: 0.366" printed by am.fit -- confirm which trial
# get_history() actually reports.
model_history = rel.get_history()

In [6]:
# Display the history dict: a Trial object plus train/val loss and accuracy
# lists (ten entries each in the recorded output, matching epochs=10).
model_history

{'trial': <keras_tuner.engine.trial.Trial at 0x17f52d990>,
 'train_loss': [1.4132784207661946,
  1.3419185082117717,
  1.3790775537490845,
  1.2360303401947021,
  1.474471092224121,
  1.2787354389826457,
  1.155797004699707,
  1.1729756991068523,
  1.3275675773620605,
  1.2964895963668823],
 'train_accuracy': [0.2708333432674408,
  0.2604166567325592,
  0.3854166567325592,
  0.3125,
  0.28125,
  0.3541666567325592,
  0.4375,
  0.3854166567325592,
  0.375,
  0.34375],
 'val_loss': [1.383800983428955,
  1.3494577407836914,
  1.251705527305603,
  1.2998790740966797,
  1.2731083631515503,
  1.2475106716156006,
  1.2594752311706543,
  1.2106069326400757,
  1.2263107299804688,
  1.2092466354370117],
 'val_accuracy': [0.15625,
  0.15625,
  0.28125,
  0.15625,
  0.1875,
  0.21875,
  0.15625,
  0.25,
  0.15625,
  0.15625]}

In [7]:
# Score the held-out split with the fitted result, naming the prediction
# column `label_predicted`, then preview it next to the one-hot labels.
predictor = rel.set_output_columns(['label_predicted'])
output_df = predictor.transform(test_df)
preview = output_df.select('Species_OHE', 'label_predicted')
preview.show(n=10)

[Stage 11:>                 (0 + 1) / 1][Stage 14:>                 (0 + 1) / 1]2021-11-22 06:33:50.356946: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:33:50.357373: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-22 06:33:50.484527: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[Stage 11:>                 (0 + 1) / 1][Stage 15:>                 (0 + 1) / 1]2021-11-22 06:33:53.720567: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-22 06:33:53.720971: I tensorflow/core/platform/cpu_feature_gua

+-------------+--------------------+
|  Species_OHE|     label_predicted|
+-------------+--------------------+
|(3,[0],[1.0])|[0.37491688132286...|
|(3,[0],[1.0])|[0.38793587684631...|
|(3,[0],[1.0])|[0.38491761684417...|
|(3,[0],[1.0])|[0.38491761684417...|
|(3,[0],[1.0])|[0.39399600028991...|
|(3,[0],[1.0])|[0.38692891597747...|
|(3,[0],[1.0])|[0.38692891597747...|
|(3,[0],[1.0])|[0.38692888617515...|
|(3,[0],[1.0])|[0.38995245099067...|
|(3,[0],[1.0])|[0.39298385381698...|
+-------------+--------------------+
only showing top 10 rows



                                                                                

In [8]:
# Take the best model wrapper from the search result and unwrap the
# underlying model object it carries.
best_wrapper = rel.get_best_model()
best_model = best_wrapper.getModel()

In [9]:
# Print the layer-by-layer architecture of the best model.
best_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________________

In [10]:
# Bring the held-out split onto the driver as NumPy arrays, using the feature
# and label column lists recorded on the tuner's model-selection object.
selection = am.tuner.model_selection
x_matrix = np.array(test_df.select(selection.feature_cols).collect())
y_raw = np.array(test_df.select(selection.label_cols).collect())

# One array per feature column, each keeping a trailing axis of length 1, so
# every model input receives its own array.
n_feature_cols = x_matrix.shape[1]
x = [x_matrix[:, i, np.newaxis] for i in range(n_feature_cols)]

# Drop the singleton axis 1 so each row is just the one-hot label vector.
y = np.squeeze(y_raw, 1)

y

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [11]:
# Run the best model directly on the driver-side arrays. In the recorded
# output the leading rows match the label_predicted values that transform()
# showed for test_df, so this works as a sanity check that both paths agree.
best_model.predict(x)

2021-11-22 06:33:54.380870: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


array([[0.37491688, 0.30341023, 0.32167286],
       [0.38793588, 0.29485384, 0.31721035],
       [0.38491762, 0.29682648, 0.31825587],
       [0.38491762, 0.29682648, 0.31825587],
       [0.393996  , 0.2909125 , 0.3150915 ],
       [0.38692892, 0.29551122, 0.3175599 ],
       [0.38692892, 0.29551122, 0.3175599 ],
       [0.3869289 , 0.29551125, 0.31755987],
       [0.38995242, 0.2935394 , 0.31650814],
       [0.3929838 , 0.29156896, 0.31544715],
       [0.3929838 , 0.29156896, 0.31544715],
       [0.40314195, 0.2850128 , 0.3118453 ],
       [0.3990691 , 0.28763288, 0.31329796],
       [0.42264503, 0.27261984, 0.30473506],
       [0.3960228 , 0.2896001 , 0.3143771 ],
       [0.42574582, 0.27067238, 0.3035818 ],
       [0.41132367, 0.27978328, 0.30889308],
       [0.42781585, 0.2693757 , 0.3028085 ],
       [0.43923876, 0.26226836, 0.29849288],
       [0.44862646, 0.25648728, 0.29488617],
       [0.44758183, 0.25712797, 0.29529017],
       [0.44967154, 0.255847  , 0.2944815 ],
       [0.

In [12]:
# Score the best model on the held-out arrays.
# The result is a two-element list -- presumably [loss, accuracy]; TODO confirm
# against the metrics the model was compiled with.
best_model.evaluate(x, y)



[1.1142839193344116, 0.5]

In [13]:
# Show the configuration of the optimizer attached to the best model.
# The recorded output is plain SGD with learning_rate 1e-04 and no momentum.
best_model.optimizer.get_config()

{'name': 'SGD',
 'learning_rate': 1e-04,
 'decay': 0.0,
 'momentum': 0.0,
 'nesterov': False}

In [14]:
# Keep the best-model wrapper itself (the object whose getModel() was used
# earlier to obtain best_model) so its Spark-side metadata can be inspected.
spark_best_model = rel.get_best_model()

In [15]:
# Inspect the column metadata stored on the best-model wrapper.
# NOTE(review): _get_metadata is a private method (leading underscore), so
# this is debugging-only. The recorded output lists Species_OHE as BinaryType
# with shape None, which differs from what _get_metadata(test_df) reports
# further down (SparseVector, shape 3) -- worth confirming that is expected.
spark_best_model._get_metadata()

{'Species_OHE': {'spark_data_type': pyspark.sql.types.BinaryType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': None,
  'intermediate_format': 'nochange',
  'max_size': None}}

In [16]:
# List the declared Spark schema of the encoded DataFrame, for comparison
# with the column metadata shown in the surrounding cells.
encoded.schema.fields

[StructField(SepalLengthCm,DoubleType,true),
 StructField(SepalWidthCm,DoubleType,true),
 StructField(PetalLengthCm,DoubleType,true),
 StructField(PetalWidthCm,DoubleType,true),
 StructField(Species,IntegerType,true),
 StructField(Species_OHE,VectorUDT,true)]

In [17]:
from cerebro.backend.spark.util import _get_metadata

In [18]:
# Compute column metadata directly from test_df with Cerebro's private
# _get_metadata helper, to compare with the metadata stored on the best model.
_get_metadata(test_df)

[Stage 11:>                                                         (0 + 1) / 1]                                                                                

{'SepalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'SepalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalLengthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'PetalWidthCm': {'spark_data_type': pyspark.sql.types.DoubleType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species': {'spark_data_type': pyspark.sql.types.IntegerType,
  'is_sparse_vector_only': False,
  'shape': 1,
  'intermediate_format': 'nochange',
  'max_size': 1},
 'Species_OHE': {'spark_data_type': pyspark.ml.linalg.SparseVector,
  'is_sparse_vector_only': True,
  'shape': 3,
  'intermediate_format': 'custom_spar

[Stage 11:>                                                         (0 + 1) / 1]

In [21]:
# Print the architecture of every model kept on the search result.
for spmodel in rel.all_models:
    model = spmodel.getModel()
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line after every model.
    model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________________

[Stage 11:>                                                         (0 + 1) / 1]

In [23]:
# Peek at the oracle's private `_tried_so_far` set (debugging only).
# The recorded output holds 20 hex digests, matching max_trials=20 --
# presumably one hash per tried hyperparameter combination; TODO confirm.
am.tuner.oracle._tried_so_far

{'1d13b9daf356c746e74fb36a22be5cba',
 '1df78473d7eb0beadcebc4a6665853b8',
 '217b94f91ceb2d7196f3ce3e073be6ba',
 '46f5872d280a709859673af8d389b8ca',
 '4ea89569c6ecd802ce30aa61d9b74a92',
 '5b29e17452483172b48198030a343698',
 '67925b517d7d493145dfa536310c29d0',
 '70b015c7e186ce401125f73bccb722b0',
 '72a3195d68017860075009b1dacdcbf3',
 '7e69c12d0c876e0dcbf1df948001878e',
 '87f4a873b531a01281ff91149567ff7b',
 'b741a21bd822202fca8a9f39fb55ddbf',
 'ce4abdf3ec7cdf8b7830444fc366a589',
 'dc04fd1487b18abd83df22678963c725',
 'dfec8137a53ce41991f8931cef4e6d2d',
 'e600389267e922d0fdf7b860c0a73861',
 'e80967daa9e2578e6dae2ed580466650',
 'f19b7e6d721e1e1d94f9dd74db31e041',
 'f23dfd80a5340a38a7996bdb78dc4475',
 'fb3b254380b6fd14eed338431fb4a1a5'}

[Stage 11:>                                                         (0 + 1) / 1]