In [8]:
import sys
sys.path.append('../code/python')

import findspark
import pyspark

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col
from pyspark.sql.types import StringType, FloatType, IntegerType
from pyspark.sql import functions as F

# Spark UDF that rebuilds each incoming vector as a dense ml Vector from its toArray() values.
toDense = F.udf(lambda vec: Vectors.dense(vec.toArray()), returnType=VectorUDT())

import time

In [9]:
# Remove a leftover metastore_db directory (presumably from an earlier Spark session).
# -f keeps the cell quiet and successful when the directory does not exist yet.
! rm -rf metastore_db

rm: cannot remove ‘metastore_db’: No such file or directory


In [10]:
# Point findspark at the local Spark 2.2.0 installation.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
findspark.init('/users/snakanda/vista/spark-2.2.0-bin-hadoop2.7')

# SparkContext that is later passed to get_struct_df / get_hog_df and stopped in the last cell.
sc = pyspark.SparkContext(appName="vista")

In [11]:
from vista import Vista

In [12]:
# Bare expression: Jupyter shows the SparkContext's repr as this cell's output.
sc

In [13]:
def downstream_ml_func(features_df, results_dict, layer_index):
    """Fit a logistic regression on ``features_df`` and record per-split accuracy.

    The frame is split with a fixed seed into train / validation / test:
    first 60/40, then the 40% remainder is halved. An elastic-net logistic
    regression is fitted on the training split and its accuracy is measured
    on each of the three splits.

    Args:
        features_df: DataFrame with a "features" vector column and a "label" column.
        results_dict: dict that is updated in place with this run's metrics.
        layer_index: key under which the metrics are stored in ``results_dict``.

    Returns:
        ``results_dict``, where ``results_dict[layer_index]`` is a dict with the
        keys "train_acc", "valid_acc" and "test_acc".
    """
    split_seed = 2019
    train_df, holdout_df = features_df.randomSplit([0.6, 0.4], seed=split_seed)
    valid_df, test_df = holdout_df.randomSplit([0.5, 0.5], seed=split_seed)

    classifier = LogisticRegression(labelCol="label", featuresCol="features",
                                    elasticNetParam=0.5, regParam=0.05)
    model = classifier.fit(train_df)

    # The evaluator is configured identically for every split, so one instance is shared.
    accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                 metricName="accuracy")

    metrics = {}
    # Same evaluation order as before: train, then validation, then test.
    for metric_name, split_df in (("train_acc", train_df),
                                  ("valid_acc", valid_df),
                                  ("test_acc", test_df)):
        metrics[metric_name] = accuracy.evaluate(model.transform(split_df))

    results_dict[layer_index] = metrics
    return results_dict

## Foods Dataset

In [14]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): the meaning of the positional arguments is not shown in this notebook.
# 'alexnet' is presumably the CNN to use and 4 the number of layers to explore (it matches
# the four keys printed below) — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'alexnet', 4, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/foods/foods_wo_interactions.csv',
                  '/users/snakanda/vista/data/foods/images',
                  20129, 130, mem_sys_rsv=3)

# Run Vista on the Foods data (CSV without interaction features) and print the metrics
# stored under each key; downstream_ml_func files them under the layer_index it is given.
result = vista.run()
for k in [-1, -2, -3, -4]:
    print(k, result[k])
    
# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 133, 0.972117558402, deser




(-1, {'train_acc': 0.7921960072595281, 'test_acc': 0.8097278225806451, 'valid_acc': 0.7841049764793265})
(-2, {'train_acc': 0.7972281801682891, 'test_acc': 0.8079637096774194, 'valid_acc': 0.7897994553107205})
(-3, {'train_acc': 0.7972281801682891, 'test_acc': 0.8077116935483871, 'valid_acc': 0.7959891062144095})
(-4, {'train_acc': 0.7953308034977726, 'test_acc': 0.8059475806451613, 'valid_acc': 0.7974746224312949})


In [15]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): positional arguments are opaque here; 'alexnet' is presumably the CNN and
# 4 the number of layers to explore — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'alexnet', 4, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/foods/foods_with_interactions.csv',
                  '/users/snakanda/vista/data/foods/images',
                  20129, 130, mem_sys_rsv=3)

# Same AlexNet run on the Foods data, this time with the CSV that includes interaction features.
result = vista.run()
for k in [-1, -2, -3, -4]:
    print(k, result[k])

# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 133, 0.972117558402, deser
(-1, {'train_acc': 0.8091899026563273, 'test_acc': 0.8230846774193549, 'valid_acc': 0.8051497895518692})
(-2, {'train_acc': 0.8095198812077214, 'test_acc': 0.8203125, 'valid_acc': 0.804159445407279})
(-3, {'train_acc': 0.8105923114997525, 'test_acc': 0.8175403225806451, 'valid_acc': 0.8078732359494925})
(-4, {'train_acc': 0.8069625474344168, 'test_acc': 0.8162802419354839, 'valid_acc': 0.8125773706362961})


In [16]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): positional arguments are opaque here; 'resnet50' is presumably the CNN and
# 5 the number of layers to explore (five keys are printed below) — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'resnet50', 5, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/foods/foods_wo_interactions.csv',
                  '/users/snakanda/vista/data/foods/images',
                  20129, 130, mem_sys_rsv=3)

# ResNet50 run on the Foods data (CSV without interaction features); print the metrics per key.
result = vista.run()
for k in [-1, -2, -3, -4, -5]:
    print(k, result[k])
    
# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 140, 0.973514674302, deser
(-1, {'train_acc': 0.7987130836495627, 'test_acc': 0.8122479838709677, 'valid_acc': 0.8011884129735083})
(-2, {'train_acc': 0.8020128691635043, 'test_acc': 0.8084677419354839, 'valid_acc': 0.7994553107204754})
(-3, {'train_acc': 0.7986305890117142, 'test_acc': 0.8009072580645161, 'valid_acc': 0.7967318643228521})
(-4, {'train_acc': 0.7995380300280481, 'test_acc': 0.8051915322580645, 'valid_acc': 0.7922753156721961})
(-5, {'train_acc': 0.7950833195842271, 'test_acc': 0.8009072580645161, 'valid_acc': 0.8029215152265412})


In [17]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): positional arguments are opaque here; 'resnet50' is presumably the CNN and
# 5 the number of layers to explore — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'resnet50', 5, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/foods/foods_with_interactions.csv',
                  '/users/snakanda/vista/data/foods/images',
                  20129, 130, mem_sys_rsv=3)


# ResNet50 run on the Foods data with the CSV that includes interaction features.
result = vista.run()
for k in [-1, -2, -3, -4, -5]:
    print(k, result[k])
    
# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 140, 0.973514674302, deser
(-1, {'train_acc': 0.8141395809272397, 'test_acc': 0.8251008064516129, 'valid_acc': 0.8138153008170339})
(-2, {'train_acc': 0.8135621184623, 'test_acc': 0.8170362903225806, 'valid_acc': 0.8130725427085912})
(-3, {'train_acc': 0.8099323543969642, 'test_acc': 0.8130040322580645, 'valid_acc': 0.8145580589254766})
(-4, {'train_acc': 0.8118297310674806, 'test_acc': 0.819304435483871, 'valid_acc': 0.8021787571180985})
(-5, {'train_acc': 0.8072100313479624, 'test_acc': 0.8107358870967742, 'valid_acc': 0.814310472889329})


### Foods Structured Features Only

In [18]:
def get_struct_df(sc, data_file_path):
    """Load a headerless CSV of structured features as an (id, features, label) frame.

    The first CSV column is used as the record id, the last column as the
    label, and every column in between is assembled into one "features" vector.

    Args:
        sc: SparkContext used to build the SQLContext that reads the file.
        data_file_path: path of the CSV file (no header row; schema is inferred).

    Returns:
        DataFrame with exactly the columns "id" (cast to string), "features"
        (dense vector) and "label" (cast to integer).
    """
    csv_reader = pyspark.SQLContext(sc).read.format('csv').options(header='false', inferSchema='true')
    raw_df = csv_reader.load(data_file_path)
    columns = raw_df.schema.names

    # Everything between the first (id) and last (label) column is a feature.
    assembler = VectorAssembler(inputCols=columns[1:-1], outputCol="features")
    # toDense rebuilds each assembled vector as a dense vector.
    dense_df = assembler.transform(raw_df).withColumn("features", toDense("features"))

    id_as_string = dense_df[columns[0]].cast(StringType())
    label_as_int = dense_df[columns[-1]].cast(IntegerType())
    return (dense_df
            .withColumn("id", id_as_string)
            .withColumn("label", label_as_int)
            .select("id", "features", "label"))

In [19]:
# Baseline: train and evaluate on the CSV columns only (no image features), Foods data
# without interaction features. The key 0 is just the slot used in the fresh results dict.
features_df = get_struct_df(sc, '/users/snakanda/vista/data/foods/foods_wo_interactions.csv')
print(downstream_ml_func(features_df, {}, 0))

{0: {'train_acc': 0.7952549051801987, 'test_acc': 0.8086670045615814, 'valid_acc': 0.795352323838081}}


In [20]:
# Baseline: train and evaluate on the CSV columns only (no image features), Foods data
# with interaction features. The key 0 is just the slot used in the fresh results dict.
features_df = get_struct_df(sc, '/users/snakanda/vista/data/foods/foods_with_interactions.csv')
print(downstream_ml_func(features_df, {}, 0))

{0: {'train_acc': 0.8068959759559192, 'test_acc': 0.811587147030185, 'valid_acc': 0.8162255750680188}}


## Amazon Dataset

In [21]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): positional arguments are opaque here; 'alexnet' is presumably the CNN and
# 4 the number of layers to explore — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'alexnet', 4, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/amazon/amazon.csv',
                  '/users/snakanda/vista/data/amazon/images',
                  20129, 130, mem_sys_rsv=3)

# AlexNet run on the Amazon data; print the metrics stored under each key.
result = vista.run()
for k in [-1, -2, -3, -4]:
    print(k, result[k])
    
# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 133, 0.972117558402, deser
(-1, {'train_acc': 0.6541223075018568, 'test_acc': 0.6593489780469342, 'valid_acc': 0.676530612244898})
(-2, {'train_acc': 0.6602294297268301, 'test_acc': 0.6578349735049205, 'valid_acc': 0.6739795918367347})
(-3, {'train_acc': 0.6603944870842617, 'test_acc': 0.6563209689629069, 'valid_acc': 0.6762755102040816})
(-4, {'train_acc': 0.6499133448873483, 'test_acc': 0.6515266212465304, 'valid_acc': 0.6691326530612245})


In [22]:
# Start time of this run; it is only read by the commented-out runtime print at the end of the cell.
prev_time = time.time()
# mem_sys_rsv is an optional parameter. If not set, a default value of 3 will be used.
# NOTE(review): positional arguments are opaque here; 'resnet50' is presumably the CNN and
# 5 the number of layers to explore — confirm against Vista's constructor.
vista = Vista("vista-example", 150, 8, 1, 'resnet50', 5, 0, downstream_ml_func,
                  '/users/snakanda/vista/data/amazon/amazon.csv',
                  '/users/snakanda/vista/data/amazon/images',
                  20129, 130, mem_sys_rsv=3)

# ResNet50 run on the Amazon data.
# NOTE(review): the saved output of this cell is a Py4JJavaError — the fit aborted with
# java.io.FileNotFoundException "... (Too many open files)" while reading the images.
# Presumably the process ran into its open-file limit; consider raising it (e.g. `ulimit -n`)
# before re-running — TODO confirm the root cause.
result = vista.run()
for k in [-1, -2, -3, -4, -5]:
    print(k, result[k])
    
# print("Runtime (min): " + str((time.time()-prev_time)/60.0))

Vista Configs(join, cpu, np, heap, f_core, pers): b, 7, 238, 140, 0.973514674302, deser


Py4JJavaError: An error occurred while calling o9852.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 3896.0 failed 1 times, most recent failure: Lost task 1.0 in stage 3896.0 (TID 8343, localhost, executor driver): java.io.FileNotFoundException: /users/snakanda/vista/data/amazon/images/B000MD3NUE.jpg (Too many open files)
	at java.io.FileInputStream.open0(Native Method)
	at java.io.FileInputStream.open(FileInputStream.java:195)
	at java.io.FileInputStream.<init>(FileInputStream.java:138)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileInputStream.<init>(RawLocalFileSystem.java:106)
	at org.apache.hadoop.fs.RawLocalFileSystem.open(RawLocalFileSystem.java:202)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:143)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
	at org.apache.spark.input.PortableDataStream.open(PortableDataStream.scala:183)
	at org.apache.spark.input.PortableDataStream.toArray(PortableDataStream.scala:191)
	at vista.udf.VistaUDFs$$anonfun$getImagesDF$1.apply(VistaUDFs.scala:107)
	at vista.udf.VistaUDFs$$anonfun$getImagesDF$1.apply(VistaUDFs.scala:107)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.tensorframes.impl.DebugRowOps$$anonfun$39.apply(DebugRowOps.scala:477)
	at org.tensorframes.impl.DebugRowOps$$anonfun$39.apply(DebugRowOps.scala:475)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2119)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1026)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1008)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1128)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:517)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:487)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:278)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.GeneratedMethodAccessor168.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: /users/snakanda/vista/data/amazon/images/B000MD3NUE.jpg (Too many open files)
	at java.io.FileInputStream.open0(Native Method)
	at java.io.FileInputStream.open(FileInputStream.java:195)
	at java.io.FileInputStream.<init>(FileInputStream.java:138)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileInputStream.<init>(RawLocalFileSystem.java:106)
	at org.apache.hadoop.fs.RawLocalFileSystem.open(RawLocalFileSystem.java:202)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:143)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
	at org.apache.spark.input.PortableDataStream.open(PortableDataStream.scala:183)
	at org.apache.spark.input.PortableDataStream.toArray(PortableDataStream.scala:191)
	at vista.udf.VistaUDFs$$anonfun$getImagesDF$1.apply(VistaUDFs.scala:107)
	at vista.udf.VistaUDFs$$anonfun$getImagesDF$1.apply(VistaUDFs.scala:107)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.tensorframes.impl.DebugRowOps$$anonfun$39.apply(DebugRowOps.scala:477)
	at org.tensorframes.impl.DebugRowOps$$anonfun$39.apply(DebugRowOps.scala:475)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# Baseline: train and evaluate on the CSV columns only (no image features), Amazon data.
# The key 0 is just the slot used in the fresh results dict.
features_df = get_struct_df(sc, '/users/snakanda/vista/data/amazon/amazon.csv')
print(downstream_ml_func(features_df, {}, 0))

## HOG Features

In [None]:
def get_hog_df(sc, data_file_path):
    """Load a headerless CSV as a (features, label) frame.

    The last CSV column is used as the label and all preceding columns are
    assembled into one "features" vector.

    NOTE(review): unlike get_struct_df, the first column is not set aside as
    an id and therefore becomes a feature — presumably these CSVs carry no id
    column; confirm against the data files.

    Args:
        sc: SparkContext used to build the SQLContext that reads the file.
        data_file_path: path of the CSV file (no header row; schema is inferred).

    Returns:
        DataFrame with exactly the columns "features" (dense vector) and
        "label" (cast to integer).
    """
    csv_reader = pyspark.SQLContext(sc).read.format('csv').options(header='false', inferSchema='true')
    raw_df = csv_reader.load(data_file_path)
    columns = raw_df.schema.names

    assembler = VectorAssembler(inputCols=columns[:-1], outputCol="features")
    # toDense rebuilds each assembled vector as a dense vector.
    dense_df = assembler.transform(raw_df).withColumn("features", toDense("features"))

    label_as_int = dense_df[columns[-1]].cast(IntegerType())
    return dense_df.withColumn("label", label_as_int).select("features", "label")

In [None]:
# Train and evaluate on the Foods CSV (without interaction features) that includes the hog columns.
# The key 0 is just the slot used in the fresh results dict.
hog_df = get_hog_df(sc, '/users/snakanda/vista/data/foods/foods_wo_interactions_with_hog.csv')
print(downstream_ml_func(hog_df, {}, 0))

In [None]:
# Train and evaluate on the Foods CSV (with interaction features) that includes the hog columns.
# The key 0 is just the slot used in the fresh results dict.
hog_df = get_hog_df(sc, '/users/snakanda/vista/data/foods/foods_with_interactions_with_hog.csv')
print(downstream_ml_func(hog_df, {}, 0))

In [None]:
# Train and evaluate on the Amazon CSV that includes the hog columns.
# The key 0 is just the slot used in the fresh results dict.
hog_df = get_hog_df(sc, '/users/snakanda/vista/data/amazon/amazon_with_hog.csv')
print(downstream_ml_func(hog_df, {}, 0))

In [7]:
# Stop the SparkContext created at the top of the notebook.
# NOTE(review): this cell's saved execution count (7) is lower than the cells above it, so it
# was last run in an earlier pass; re-run the notebook top to bottom to get a consistent state.
sc.stop()