## Imports

In [1]:
import argparse
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from hops import hdfs
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from tensorflowonspark import TFCluster
from hops import util

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1949,application_1512575073636_0477,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## Constants

In [2]:
# Root HDFS directory of the current project.
project_path = "/Projects/" + hdfs.project_name()

# Locations of the cleaned HAR dataset (read later with sc.textFile).
_train_dir = project_path + "/HAR_Dataset/cleaned_data/train"
TRAIN_FEATURES_PATH = _train_dir + "/features"
TRAIN_LABELS_PATH = _train_dir + "/labels"
# NOTE(review): the TEST_* paths resolve to the same train/ directory as the
# TRAIN_* paths above -- this looks like a copy-paste slip. Confirm whether a
# cleaned_data/test directory exists and should be used here instead.
TEST_FEATURES_PATH = _train_dir + "/features"
TEST_LABELS_PATH = _train_dir + "/labels"

# Driver-side Spark handles used by the cells below.
sc = spark.sparkContext
sql = SQLContext(sc)

## Read Data

In [3]:
def parse_args(num_executors, argv=None):
    """Build the configuration namespace for the TensorFlowOnSpark job.

    Args:
        num_executors: Default value for ``--cluster_size``.
        argv: Optional list of argument strings to parse. ``None`` (the
            default) keeps the original behaviour of reading ``sys.argv[1:]``.
            Pass ``[]`` from a notebook to get pure defaults regardless of how
            the interpreter process was launched.

    Returns:
        argparse.Namespace with the attributes ``cluster``, ``cluster_size``,
        ``tensorboard``, ``mode``, ``features``, ``labels``, ``model``,
        ``rdma``, ``output``, ``steps``, ``batch_size`` and ``epochs``.
    """
    def parse_bool(text):
        # Without a converter argparse stores the raw string, and any
        # non-empty string (including "False" or "0") is truthy.
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("0", "false", "f", "no", "n", "off", ""):
            return False
        raise argparse.ArgumentTypeError("expected a boolean value, got %r" % text)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', "--cluster", action='store_true', default=False)
    parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
    # TensorBoard is on by default, so "-tb" alone can never change anything;
    # "--no-tensorboard" is the switch that actually turns it off.
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true", default=True)
    parser.add_argument("--no-tensorboard", dest="tensorboard", help="do not launch tensorboard process", action="store_false")
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
    parser.add_argument("-f", "--features", help="HDFS path to features in parallelized format", default=TRAIN_FEATURES_PATH)
    parser.add_argument("-l", "--labels", help="HDFS path to labels in parallelized format", default=TRAIN_LABELS_PATH)
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default=project_path + "/HAR_Dataset/saved_model")
    parser.add_argument("-r", "--rdma", help="use rdma connection", type=parse_bool, default=False)
    parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default=project_path + "/HAR_Dataset/predictions")
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
    # argv=None makes argparse fall back to sys.argv[1:], as before.
    args = parser.parse_args(argv)
    return args

In [4]:
def read_data(sc, featuresFile, labelsFile):
    """Load features and labels from text files and pair them line by line.

    Each line of both files is a comma-separated record; feature values are
    parsed as floats and label values as ints.

    The rows are paired by their line index. The previous implementation
    repartitioned the features and then called ``zip``; ``repartition``
    shuffles rows, so features were no longer aligned with their labels, and
    ``zip`` additionally requires the same number of elements in every
    partition, which a shuffle does not guarantee.

    Args:
        sc: SparkContext used to read the files.
        featuresFile: Path of the features text file.
        labelsFile: Path of the labels text file.

    Returns:
        RDD of ``(features, labels)`` tuples in line order. Lines that exist
        in only one of the two files are dropped by the join.
    """
    def keyed_by_line(rdd):
        # zipWithIndex yields (row, index); flip it so the index is the key.
        return rdd.zipWithIndex().map(lambda pair: (pair[1], pair[0]))

    labels = keyed_by_line(
        sc.textFile(labelsFile).map(lambda ln: [int(x) for x in ln.split(',')]))
    features = keyed_by_line(
        sc.textFile(featuresFile).map(lambda ln: [float(x) for x in ln.split(',')]))

    # join gives (index, (features, labels)); sort to restore the file order.
    dataRDD = features.join(labels).sortByKey().values()
    return dataRDD

## TensorFlow Model

In [5]:
def map_fun(args, ctx):
    """TensorFlow program executed on each node of the TensorFlowOnSpark cluster.

    Parameter-server nodes start the server and block in ``server.join()``.
    Worker nodes build a single-layer softmax classifier
    (NUM_FEATURES inputs, NUM_CLASSES outputs) and then either train it or run
    inference on batches pulled from the Spark data feed.

    Args:
        args: Namespace of job options; this function reads ``tensorboard``,
            ``batch_size``, ``rdma``, ``mode`` and ``steps``.
        ctx: Node context supplied by TensorFlowOnSpark; this function reads
            ``worker_num``, ``job_name``, ``task_index``, ``cluster_spec`` and
            ``mgr``.
    """
    from tensorflowonspark import TFNode
    import numpy as np  # was imported as `numpy` while the code used `np.` (NameError)
    import tensorflow as tf
    import time
    from hops import tensorboard # Used to get TensorBoard logdir for TensorBoard that show up in HopsWorks

    NUM_FEATURES = 8
    NUM_CLASSES = 7

    def print_log(worker_num, arg):
        """Print a message prefixed by the number of the emitting node."""
        print("%d: " % worker_num)
        print(arg)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    print_log(worker_num, "task_index: {0}, job_name {1}, cluster_spec: {2}".format(task_index, job_name, cluster_spec))

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 2) * 5)

    print_log(worker_num, "tensorboard: {0}".format(args.tensorboard))

    batch_size = args.batch_size
    print_log(worker_num, "batch_size: {0}".format(batch_size))
    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        """Convert [(features, labels)] into a float32 matrix and a flat uint8 label vector."""
        features = []
        labels = []
        for item in batch:
            features.append(item[0])
            labels.append(item[1])
        xs = np.array(features)
        xs = xs.astype(np.float32)
        ys = np.array(labels)
        # Each label arrives as a one-element list; flatten to shape (len(batch),).
        ys = ys.reshape(len(ys))
        ys = ys.astype(np.uint8)
        return (xs, ys)

    if job_name == "ps":
        print_log(worker_num, "Parameter Server Joining")
        server.join()

    elif job_name == "worker":
        print_log(worker_num, "worker starting")
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
            global_step = tf.Variable(0)

            x = tf.placeholder(tf.float32, [None, NUM_FEATURES])
            y = tf.placeholder(tf.int64, [None])

            # The model
            W = tf.Variable(tf.zeros([NUM_FEATURES, NUM_CLASSES]))
            tf.summary.histogram("hidden_weights", W)
            b = tf.Variable(tf.zeros([NUM_CLASSES]))
            logits = tf.matmul(x, W) + b

            # Define loss and optimizer
            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))

            tf.summary.scalar("loss", cross_entropy)

            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy, global_step=global_step)

            prediction = tf.argmax(tf.nn.softmax(logits), 1, name="prediction")
            # Test trained model
            correct_prediction = tf.equal(tf.argmax(logits, 1), y)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()
            logdir = tensorboard.logdir()
            print_log(worker_num, "tensorflow model path: {0}".format(logdir))

            # Only the chief (task 0) writes summaries; it is the only node
            # that uses summary_writer below.
            if task_index == 0:
                summary_writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())

            if args.mode == "train":
                sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     summary_writer=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
            else:
                sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print_log(worker_num, "session ready, starting training")

            # Loop until the supervisor shuts down or maximum steps have completed.
            # Note: `step` only advances in train mode, so in inference mode the
            # loop ends when the supervisor or the data feed says stop.
            step = 0
            # TFNode.DataFeed handles SPARK input mode, will convert the RDD into TF formats
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {x: batch_xs, y: batch_ys}

                if len(batch_xs) > 0:
                    if args.mode == "train":
                        _, summary, step = sess.run([train_step, summary_op, global_step], feed_dict=feed)
                        # print accuracy every 100 steps
                        if (step % 100 == 0):
                            acc = sess.run(accuracy, feed_dict=feed)
                            print_log(worker_num, "step: {0}, acc: {1}".format(step, acc))

                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        # The former guard `len(batch_ys == batch_size)` took the
                        # length of an element-wise comparison, so it was true for
                        # every non-empty batch and its "skip incomplete batch"
                        # branch could never run. The placeholders accept any batch
                        # size, so partial batches are scored like full ones.
                        # NOTE(review): the feed appears to expect one result per
                        # input record, so skipping a batch would likely stall
                        # inference -- confirm against the TensorFlowOnSpark docs.
                        preds, acc = sess.run([prediction, accuracy], feed_dict=feed)
                        results = ["Label: {0}, Prediction: {1}".format(label, pred)
                                   for label, pred in zip(batch_ys, preds)]
                        print_log(worker_num, "len results: {}".format(len(results)))
                        tf_feed.batch_results(results)
                        print_log(worker_num, "acc: {0}".format(acc))

        if sv.should_stop() or step >= args.steps:
            print_log(worker_num, "terminating")
            tf_feed.terminate()

        # Ask for all the services to stop.
        print_log(worker_num, "stopping supervisor")
        sv.stop()

## Spark Cluster Setup For Training

In [6]:
def spark_setup_cluster_training():
    """Start a TensorFlowOnSpark cluster and run training or inference on it.

    Reads the executor and parameter-server counts from the Spark session,
    parses the job options, loads the (features, labels) RDD and feeds it to
    the cluster. In "train" mode the data is fed for ``args.epochs`` epochs;
    in any other mode the inference results are saved as text to
    ``args.output``. The cluster is shut down afterwards.
    """
    num_executors = util.num_executors(spark)
    num_ps = util.num_param_servers(spark)
    args = parse_args(num_executors)
    dataRDD = read_data(sc, args.features, args.labels)
    # Use the parsed --cluster_size (it defaults to num_executors) so the option
    # is honoured instead of being silently ignored.
    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
    if args.mode == "train":
        print("servers for training")
        cluster.train(dataRDD, args.epochs)
    else:
        print("servers for inference")
        labelRDD = cluster.inference(dataRDD)
        print("--------------------------------received labelRDD-------------------------")
        labelRDD.saveAsTextFile(args.output)
    cluster.shutdown()
    print("Finnished, cluster shutdown")

In [7]:
# Launch the TensorFlowOnSpark job defined above (runs until the cluster shuts down).
spark_setup_cluster_training()

KeyboardInterrupt: 