## Imports

In [1]:
import argparse
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from hops import hdfs
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from tensorflowonspark import TFCluster
from hops import util

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2007,application_1512575073636_0537,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## Constants

In [2]:
# HDFS root directory of the current project (project name supplied by hops.hdfs).
project_path = "/Projects/" + hdfs.project_name()

# Locations of the pre-processed HAR dataset splits.
TRAIN_FEATURES_PATH = project_path + "/HAR_Dataset/cleaned_data/train/features"
TRAIN_LABELS_PATH = project_path + "/HAR_Dataset/cleaned_data/train/labels"
# The test constants used to be verbatim copies of the train paths, so any
# evaluation on "test" data would silently have re-used the training set.
# NOTE(review): assumes the test split lives in a sibling "test" directory - confirm.
TEST_FEATURES_PATH = project_path + "/HAR_Dataset/cleaned_data/test/features"
TEST_LABELS_PATH = project_path + "/HAR_Dataset/cleaned_data/test/labels"

# `spark` is the session injected by the notebook kernel ("SparkSession available as 'spark'").
sc = spark.sparkContext
sql = SQLContext(sc)

## Parse Arguments

In [None]:
def parse_args(num_executors, argv=None):
    """Build the argument parser for the TensorFlowOnSpark job and parse the arguments.

    Args:
        num_executors: Default value for ``--cluster_size``.
        argv: Optional list of argument strings to parse. ``None`` (the default)
            keeps the previous behaviour of reading ``sys.argv[1:]``. Pass ``[]``
            to get pure defaults; this is useful in kernels whose own command
            line contains ``-f <connection file>``, which would otherwise be
            taken for ``-f/--features``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    def _str_to_bool(text):
        """Convert a command-line token to a bool; reject anything unrecognised."""
        value = text.strip().lower()
        if value in ("1", "true", "t", "yes", "y"):
            return True
        if value in ("0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError("expected a boolean value, got %r" % text)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', "--cluster", action='store_true', default=False)
    parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
    parser.add_argument("-f", "--features", help="HDFS path to features in parallelized format", default=TRAIN_FEATURES_PATH)
    parser.add_argument("-l", "--labels", help="HDFS path to labels in parallelized format", default=TRAIN_LABELS_PATH)
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default=project_path + "/HAR_Dataset/saved_model")
    # Previously the raw string was stored, so "-r false" produced the truthy
    # string "false". Accept both the bare flag ("-r") and an explicit boolean.
    parser.add_argument("-r", "--rdma", help="use rdma connection", type=_str_to_bool, nargs="?", const=True, default=False)
    parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default=project_path + "/HAR_Dataset/predictions")
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=100000)
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=100)
    return parser.parse_args(argv)

## TensorFlow Model

In [None]:
def map_fun(args, ctx):
    """Training/inference function run on every TensorFlowOnSpark node (parameter servers and workers).

    Parameter-server nodes start their TensorFlow server and block in
    ``server.join()``. Worker nodes build a softmax-regression graph
    (``NUM_FEATURES`` inputs, ``NUM_CLASSES`` outputs) that is fed from CSV part
    files, then either train it or run inference depending on ``args.mode``.

    Args:
        args: Parsed options. This function reads ``batch_size``, ``rdma``,
            ``mode``, ``epochs``, ``features``, ``labels``, ``output``,
            ``model`` and ``steps``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``cluster_spec``.
    """
    NUM_FEATURES = 8
    NUM_CLASSES = 7

    def print_log(worker_num, arg):
        """Print ``arg`` preceded by a line holding the number of the emitting node."""
        print("%d: " % worker_num)
        print(arg)

    # Imported locally because this function is handed to TFCluster.run and executed on the cluster nodes.
    from tensorflowonspark import TFNode
    from datetime import datetime
    import os
    import tensorflow as tf
    import time
    # Used to get TensorBoard logdir for TensorBoard that show up in HopsWorks
    from hops import tensorboard

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    print_log(worker_num, "task_index: {0}, job_name {1}, cluster_spec: {2}".format(task_index, job_name, cluster_spec))
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 10)

    batch_size = args.batch_size
    print_log(worker_num, "batch_size: {0}".format(batch_size))

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def read_csv_examples(feature_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
        """Build the input pipeline that reads CSV part files into batched tensors.

        Args:
            feature_dir: Directory holding ``part-*`` files with ``NUM_FEATURES`` float columns per line.
            label_dir: Directory holding ``part-*`` files with one int64 label per line.
            batch_size: Number of examples per returned batch.
            num_epochs: Passes over the input files; ``None`` repeats indefinitely.
            task_index: Accepted for call compatibility; not used.
            num_workers: Accepted for call compatibility; not used.

        Returns:
            The result of ``tf.train.batch``: a batched feature tensor and a batched label tensor.
        """
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of csv feature filenames.
        # Sorted so feature and label files are visited in the same order.
        tf_record_pattern = os.path.join(feature_dir, 'part-*')
        features = sorted(tf.gfile.Glob(tf_record_pattern))
        print_log(worker_num, "features: {0}".format(features))
        feature_queue = tf.train.string_input_producer(features, shuffle=False, capacity=1000, num_epochs=num_epochs,
                                                       name="feature_queue")

        # Setup queue of csv label filenames
        tf_record_pattern = os.path.join(label_dir, 'part-*')
        labels = sorted(tf.gfile.Glob(tf_record_pattern))
        print_log(worker_num, "labels: {0}".format(labels))
        label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000, num_epochs=num_epochs,
                                                     name="label_queue")

        # Setup reader for feature queue
        feature_reader = tf.TextLineReader(name="feature_reader")
        _, feat_csv = feature_reader.read(feature_queue)
        feature_defaults = [[1.0] for col in range(NUM_FEATURES)]
        feature = tf.stack(tf.decode_csv(feat_csv, feature_defaults), name="input_features")
        print_log(worker_num, "feature: {0}".format(feature))

        # Setup reader for label queue
        label_reader = tf.TextLineReader(name="label_reader")
        _, label_csv = label_reader.read(label_queue)
        label_defaults = [tf.constant([], dtype=tf.int64)]
        label = tf.stack(tf.decode_csv(label_csv, label_defaults), name="input_labels")
        print_log(worker_num, tf.shape(label))
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples.
        # A single enqueue thread is used on purpose: features and labels come from
        # two independent readers, and several concurrent threads could interleave
        # their reads and pair a feature row with the wrong label.
        return tf.train.batch([feature, label], batch_size, num_threads=1, name="batch_csv")

    if job_name == "ps":
        print_log(worker_num, "Parameter Server Joining")
        server.join()

    elif job_name == "worker":
        print_log(worker_num, "worker {0} starting".format(task_index))

        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            def build_graph(x):
                """Build the linear model ``x * W + b`` and return its logits."""
                # tf.Variable takes the initial value first; the name must be passed by keyword.
                W = tf.Variable(tf.zeros([NUM_FEATURES, NUM_CLASSES]), name="W_1")
                tf.summary.histogram("input_weights", W)  # for tensorboard
                b = tf.Variable(tf.zeros([NUM_CLASSES]), name="bias_weights")
                tf.summary.histogram("bias_weights", b)  # for tensorboard
                logits = tf.matmul(x, W) + b
                return logits

            def define_optimizer(logits, labels):
                """Define loss, training op, prediction and accuracy for the model.

                Returns:
                    Tuple ``(train_step, accuracy, cross_entropy, global_step, prediction)``.
                """
                # Global step to keep track of how long training has proceeded,
                # incremented by one for each gradient computation.
                global_step = tf.Variable(0, name="global_step", trainable=False)
                # Labels arrive as [batch, 1]; flatten once and use the flat view everywhere.
                flat_labels = tf.reshape(labels, [-1])
                # Define loss and optimizer
                cross_entropy = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flat_labels, logits=logits))
                tf.summary.scalar("loss", cross_entropy)  # for tensorboard
                train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy, global_step=global_step)
                prediction = tf.argmax(tf.nn.softmax(logits), 1, name="prediction")
                # Test trained model. Comparing against the flat labels avoids
                # broadcasting [batch] against [batch, 1] into a [batch, batch] matrix.
                correct_prediction = tf.equal(tf.argmax(logits, 1), flat_labels)
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
                tf.summary.scalar("acc", accuracy)  # for tensorboard
                return train_step, accuracy, cross_entropy, global_step, prediction

            # Placeholders or QueueRunner/Readers for input data
            num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            features = TFNode.hdfs_path(ctx, args.features)  # input csv files
            labels = TFNode.hdfs_path(ctx, args.labels)  # input csv files

            # Honour the configured batch size instead of a hard-coded 100.
            x, y = read_csv_examples(features, labels, batch_size, num_epochs, index, workers)
            logits = build_graph(x)
            training_step, accuracy, cross_entropy_loss, global_step, pred = define_optimizer(logits, y)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = tensorboard.logdir()
        print_log(worker_num, "tensorflow model path: {0}".format(logdir))

        # Only the chief worker writes TensorBoard summaries.
        if job_name == "worker" and task_index == 0:
            summary_writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                         logdir=logdir,
                                         init_op=init_op,
                                         summary_op=None,
                                         summary_writer=None,
                                         saver=saver,
                                         global_step=global_step,
                                         stop_grace_secs=300,
                                         save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                         logdir=logdir,
                                         summary_op=None,
                                         saver=saver,
                                         global_step=global_step,
                                         stop_grace_secs=300,
                                         save_model_secs=0)

        model_dir = TFNode.hdfs_path(ctx, args.model)

        # The predictions file is only needed (and therefore only created) for inference.
        output_file = None
        if args.mode == "inference":
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

        try:
            # The supervisor takes care of session initialization, restoring from
            # a checkpoint, and closing when done or an error occurs.
            with sv.managed_session(server.target) as sess:
                print_log(worker_num, "session ready, starting training")

                # Loop until the supervisor shuts down or maximum steps have completed.
                step = 0
                count = 0
                while not sv.should_stop() and step < args.steps:
                    # Run a training step asynchronously.
                    # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                    # perform *synchronous* training.

                    if args.mode == "train":
                        # Accuracy is fetched in the same run so that logging it does
                        # not pull (and discard) an extra batch from the input queue.
                        _, summary, step, acc = sess.run([training_step, summary_op, global_step, accuracy])
                        # Log the accuracy every 100 steps.
                        if (step % 100 == 0):
                            print_log(worker_num, "step: {0}, acc: {1}".format(step, acc))

                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        # Fetch the label *tensor* (y), not the HDFS path string, and
                        # write the evaluated predictions rather than the graph tensor.
                        label_batch, preds = sess.run([y, pred])
                        flat_label_batch = label_batch.ravel()
                        for i in range(len(flat_label_batch)):
                            count += 1
                            output_file.write("{0} {1}\n".format(flat_label_batch[i], preds[i]))
                        print("count: {0}".format(count))

                # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
                # run inference and request stop before the other workers even start/sync their sessions.
                if task_index == 0:
                    time.sleep(60)

                if sv.is_chief:
                    save_path = saver.save(sess, model_dir)
                    print_log(worker_num, "Model saved in file: {}".format(save_path))

                # Ask for all the services to stop.
                print("{0} stopping supervisor".format(datetime.now().isoformat()))
                sv.stop()
        finally:
            # Close (and thereby flush) the predictions file even when the loop is left
            # through an exception, e.g. when the input queue signals end of data.
            if output_file is not None:
                output_file.close()


## Spark Cluster Setup For Training

In [None]:
def spark_setup_cluster_training():
    """Start a TensorFlowOnSpark cluster running ``map_fun`` and shut it down afterwards.

    The cluster size and the TensorBoard switch come from ``parse_args``; the
    number of parameter servers comes from ``util.num_param_servers``. Input
    is read by TensorFlow itself (``InputMode.TENSORFLOW``).
    """
    num_executors = util.num_executors(spark)
    num_ps = util.num_param_servers(spark)
    args = parse_args(num_executors)
    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, num_ps, args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
    print("Finished, cluster shutdown")

In [None]:
# Launch the TensorFlowOnSpark job defined by spark_setup_cluster_training.
spark_setup_cluster_training()