#### Initialize Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [2]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [3]:
df.select('latent_next_pitch', 'latent_pitch_type').show()

+-----------------+-----------------+
|latent_next_pitch|latent_pitch_type|
+-----------------+-----------------+
|              0.0|              0.0|
|              0.0|              0.0|
|              7.0|              0.0|
|              7.0|              7.0|
|              0.0|              7.0|
|              6.0|              0.0|
|              6.0|              6.0|
|              7.0|              2.0|
|              7.0|              7.0|
|              3.0|              7.0|
|              0.0|              2.0|
|              2.0|              0.0|
|              6.0|              3.0|
|              6.0|              6.0|
|              6.0|              6.0|
|              6.0|              6.0|
|              0.0|              6.0|
|              2.0|              7.0|
|              0.0|              3.0|
|              0.0|              0.0|
+-----------------+-----------------+
only showing top 20 rows



In [4]:
df.dtypes

[('_c0', 'int'),
 ('outs', 'double'),
 ('pfx_x', 'double'),
 ('pfx_z', 'double'),
 ('pitch_num', 'double'),
 ('px', 'double'),
 ('pz', 'double'),
 ('start_speed', 'double'),
 ('sz_bot', 'double'),
 ('sz_top', 'double'),
 ('x0', 'double'),
 ('y0', 'double'),
 ('z0', 'double'),
 ('batter_id', 'int'),
 ('inning', 'int'),
 ('p_throws', 'int'),
 ('pitcher_id', 'int'),
 ('stand', 'int'),
 ('top', 'int'),
 ('score_difference', 'double'),
 ('latent_pitch_type', 'double'),
 ('latent_next_pitch', 'double'),
 ('count_status', 'int'),
 ('base_status', 'int'),
 ('binned_score_difference', 'int')]

In [134]:
df.select('binned_score_difference').distinct().rdd.map(lambda r: r[0]).collect()

[-1, 1, 3, -5, 5, 4, -4, -2, 2, -3, 0]

In [135]:
df = df.na.drop(subset=["count_status"])

In [136]:
df.groupby('binned_score_difference').count().show()

+-----------------------+------+
|binned_score_difference| count|
+-----------------------+------+
|                     -1|200888|
|                      1|204003|
|                      3| 95738|
|                     -5|115029|
|                      5|120903|
|                      4| 66887|
|                     -4| 63171|
|                     -2|137364|
|                      2|142395|
|                     -3| 90426|
|                      0|450747|
+-----------------------+------+



In [137]:
df.groupby('latent_pitch_type').count().show()

+-----------------+------+
|latent_pitch_type| count|
+-----------------+------+
|              8.0| 24866|
|              0.0|611638|
|              7.0| 40498|
|              1.0|265031|
|              4.0|143130|
|              3.0|169526|
|              2.0|195344|
|             10.0|   916|
|              6.0| 87888|
|              5.0|142323|
|              9.0|  6391|
+-----------------+------+



In [138]:
df.groupby('latent_next_pitch').count().show()

+-----------------+------+
|latent_next_pitch| count|
+-----------------+------+
|              8.0| 29491|
|              0.0|588341|
|              7.0| 39122|
|              1.0|279071|
|              4.0|136854|
|              3.0|190917|
|              2.0|191363|
|             10.0|   854|
|              6.0| 90903|
|              5.0|134291|
|              9.0|  6344|
+-----------------+------+



In [139]:
df.groupby('count_status').count().show()

+------------+------+
|count_status| count|
+------------+------+
|           1|191344|
|           6| 15064|
|           3|184912|
|           5| 93422|
|           9| 24083|
|           4| 63691|
|           8|127543|
|           7| 91912|
|          10|100675|
|          11| 31991|
|           2|240205|
|           0|522709|
+------------+------+



In [140]:
df.columns

['_c0',
 'outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'top',
 'score_difference',
 'latent_pitch_type',
 'latent_next_pitch',
 'count_status',
 'base_status',
 'binned_score_difference']

In [5]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [6]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch']

In [7]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [8]:
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols =["latent_pitch_type", "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =["latent_pitch_typeH", "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
model = encoder.fit(df)
df = model.transform(df)

In [9]:
df.select('latent_pitch_typeH').take(5)

[Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {7: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {7: 1.0}))]

In [10]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch',
 'count_statusH',
 'base_statusH',
 'binned_score_differenceH',
 'latent_pitch_typeH',
 'pitch_numH']

In [11]:
df = df.select('outs','pfx_x','pfx_z','px','pz','start_speed','sz_bot','sz_top','x0',
               'y0','z0','batter_id','inning','p_throws','pitcher_id','stand','latent_pitch_typeH','pitch_numH',
               'base_statusH','binned_score_differenceH','count_statusH','latent_next_pitch')

In [146]:
df.select('latent_pitch_typeH','pitch_numH','base_statusH','binned_score_differenceH','count_statusH').take(4)

[Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {2: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {}), count_statusH=SparseVector(11, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {7: 1.0}), pitch_numH=SparseVector(14, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {}), count_statusH=SparseVector(11, {2: 1.0}))]

In [12]:
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-6]), r[-2], r[-3], r[-4], r[-5], r[-6]]).\
           toDF(['label','features', 'count_statusH','binned_score_differenceH','base_statusH', 'pitch_numH',
                 'latent_pitch_typeH'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|label|            features|  count_statusH|binned_score_differenceH| base_statusH|    pitch_numH|latent_pitch_typeH|
+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|  0.0|[1.0,6.08,9.83,-0...| (11,[0],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  0.0|[1.0,4.54,12.83,-...| (11,[2],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,-3.71,9.05,-...| (11,[0],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,4.87,-6.37,0...| (11,[2],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[7],[1.0])|
|  0.0|[0.0,1.64,-4.12,0...| (11,[3],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[3],[1.0])|     (9,[7],[1.0])|
|  6.0|[0.0,-2.47,9.54,-...| (11,[8],[1.0])|            

In [13]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
data = norm.transform(data)

In [151]:
data.select('features_norm').take(1)

[Row(features_norm=DenseVector([0.0, 0.0, 0.0, -0.0, 0.0, 0.0002, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.2178, 0.0, 0.0, 0.7818, 0.0]))]

In [14]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH','pitch_numH','base_statusH',
                                         'binned_score_differenceH','count_statusH'], outputCol = 'features_fin')

data = assembler.transform(data)

In [15]:
data = data.select('label', 'features_fin')
data.select('features_fin').take(1)

[Row(features_fin=SparseVector(67, {0: 0.0, 1: 0.0, 2: 0.0, 3: -0.0, 4: 0.0, 5: 0.0002, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0001, 10: 0.0, 11: 0.2178, 12: 0.0, 14: 0.7818, 16: 1.0, 26: 1.0, 39: 1.0, 50: 1.0, 56: 1.0}))]

In [232]:
from pyspark.ml.feature import PCA

pca = PCA(k=2, inputCol="features_norm", outputCol="pcaFeatures")
model = pca.fit(data)

In [233]:
data = model.transform(data)

#### Split data into training and test data

In [21]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

train = data.filter(data['label'] == 9.0).sample(1165/8285)\
.union(data.filter(data['label'] == 8.0).sample(1165/30932)).union(data.filter(data['label'] == 7.0).sample(1165/50315))\
.union(data.filter(data['label'] == 6.0).sample(1165/109587)).union(data.filter(data['label'] == 5.0).sample(1165/177058))\
.union(data.filter(data['label'] == 4.0).sample(1165/178452)).union(data.filter(data['label'] == 3.0).sample(1165/211755))\
.union(data.filter(data['label'] == 2.0).sample(1165/244465)).union(data.filter(data['label'] == 1.0).sample(1165/330376))\
.union(data.filter(data['label'] == 0.0).sample(1165/763599))

In [22]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_typeH',
 'pitch_numH',
 'base_statusH',
 'binned_score_differenceH',
 'count_statusH',
 'latent_next_pitch']

#### specify layers for the neural network: input layer of size 11 (features), two intermediate of size 5 and 4 and output of size 7 (classes)

# Currently not working so experimenting with other code
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

layers = [11, 5, 4, 4, 3 , 11]

# create the trainer and set its parameters
FNN = MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",\
                                         maxIter=100, layers=layers, blockSize=128, seed=1234)
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
# Chain indexers and forest in a Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, FNN, labelConverter])
# train the model
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)


In [23]:
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

layers = [67, 10, 5, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, featuresCol = 'features_fin',
                                         layers=layers, blockSize=128, seed=1234)

#### Train the model

In [24]:
model = trainer.fit(train)

Py4JJavaError: An error occurred while calling o387.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 2227.0 failed 1 times, most recent failure: Lost task 1.0 in stage 2227.0 (TID 41098, localhost, executor driver): java.io.IOException: Cannot allocate memory
	at java.io.FileInputStream.readBytes(Native Method)
	at java.io.FileInputStream.read(FileInputStream.java:255)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at net.jpountz.lz4.LZ4BlockInputStream.readFully(LZ4BlockInputStream.java:269)
	at net.jpountz.lz4.LZ4BlockInputStream.refill(LZ4BlockInputStream.java:245)
	at net.jpountz.lz4.LZ4BlockInputStream.read(LZ4BlockInputStream.java:157)
	at java.io.ObjectInputStream$PeekInputStream.read(ObjectInputStream.java:2663)
	at java.io.ObjectInputStream$PeekInputStream.readFully(ObjectInputStream.java:2679)
	at java.io.ObjectInputStream$BlockDataInputStream.readDoubles(ObjectInputStream.java:3376)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1962)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:220)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:298)
	at org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1312)
	at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:612)
	at org.apache.spark.storage.BlockManager.get(BlockManager.scala:815)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:875)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.optimization.LBFGS$CostFun.calculate(LBFGS.scala:261)
	at org.apache.spark.mllib.optimization.LBFGS$CostFun.calculate(LBFGS.scala:230)
	at breeze.optimize.CachedDiffFunction.calculate(CachedDiffFunction.scala:23)
	at breeze.optimize.LineSearch$$anon$1.calculate(LineSearch.scala:41)
	at breeze.optimize.LineSearch$$anon$1.calculate(LineSearch.scala:30)
	at breeze.optimize.StrongWolfeLineSearch.breeze$optimize$StrongWolfeLineSearch$$phi$1(StrongWolfe.scala:76)
	at breeze.optimize.StrongWolfeLineSearch$$anonfun$minimizeWithBound$1.apply$mcVI$sp(StrongWolfe.scala:149)
	at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
	at breeze.optimize.StrongWolfeLineSearch.minimizeWithBound(StrongWolfe.scala:148)
	at breeze.optimize.StrongWolfeLineSearch.minimize(StrongWolfe.scala:62)
	at breeze.optimize.LBFGS.determineStepSize(LBFGS.scala:76)
	at breeze.optimize.LBFGS.determineStepSize(LBFGS.scala:39)
	at breeze.optimize.FirstOrderMinimizer$$anonfun$infiniteIterations$1.apply(FirstOrderMinimizer.scala:64)
	at breeze.optimize.FirstOrderMinimizer$$anonfun$infiniteIterations$1.apply(FirstOrderMinimizer.scala:62)
	at scala.collection.Iterator$$anon$7.next(Iterator.scala:129)
	at breeze.util.IteratorImplicits$RichIterator$$anon$2.next(Implicits.scala:71)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:212)
	at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:854)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:249)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:205)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:183)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:205)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Cannot allocate memory
	at java.io.FileInputStream.readBytes(Native Method)
	at java.io.FileInputStream.read(FileInputStream.java:255)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at net.jpountz.lz4.LZ4BlockInputStream.readFully(LZ4BlockInputStream.java:269)
	at net.jpountz.lz4.LZ4BlockInputStream.refill(LZ4BlockInputStream.java:245)
	at net.jpountz.lz4.LZ4BlockInputStream.read(LZ4BlockInputStream.java:157)
	at java.io.ObjectInputStream$PeekInputStream.read(ObjectInputStream.java:2663)
	at java.io.ObjectInputStream$PeekInputStream.readFully(ObjectInputStream.java:2679)
	at java.io.ObjectInputStream$BlockDataInputStream.readDoubles(ObjectInputStream.java:3376)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1962)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:220)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:298)
	at org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1312)
	at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:612)
	at org.apache.spark.storage.BlockManager.get(BlockManager.scala:815)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:875)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


#### Compute accuracy on the test set

In [None]:
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 38128)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp

With layers = [23, 20, 15, 6, 11] accuracy = 0.36312009234584086

With layers = [23, 20, 15, 11] accuracy = 0.3631949095780308

With layers = [22, 30, 15, 11] accuracy = 0.3613826699538746

With layers = [68, 10, 5, 11] accuracy = 0.42275469889813827

In [158]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator.evaluate(predictionAndLabels)

0.40100858600575917

In [159]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator.evaluate(predictionAndLabels)

0.42275469889813827