#### Initialize Spark session

In [6]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session for this notebook. The "local[*]" master
# runs Spark inside this machine instead of on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("xor")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for RDD-level work.
sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [7]:
# Load the preprocessed pitch data; the first line holds the column names and
# the column types are inferred from the values.
df = spark.read.csv('pitches_preprocessed.csv', header=True, inferSchema=True)

In [8]:
# Inspect the (column name, type) pairs that were inferred while reading the CSV.
df.dtypes

[('_c0', 'int'),
 ('outs', 'double'),
 ('pfx_x', 'double'),
 ('pfx_z', 'double'),
 ('pitch_num', 'double'),
 ('px', 'double'),
 ('pz', 'double'),
 ('start_speed', 'double'),
 ('sz_bot', 'double'),
 ('sz_top', 'double'),
 ('x0', 'double'),
 ('y0', 'double'),
 ('z0', 'double'),
 ('batter_id', 'int'),
 ('inning', 'int'),
 ('p_throws', 'int'),
 ('pitcher_id', 'int'),
 ('stand', 'int'),
 ('top', 'int'),
 ('score_difference', 'double'),
 ('latent_pitch_type', 'double'),
 ('latent_next_pitch', 'double'),
 ('count_status', 'int'),
 ('base_status', 'int'),
 ('binned_score_difference', 'int')]

In [9]:
# List the distinct score-difference bins as plain Python values.
[row[0] for row in df.select('binned_score_difference').distinct().collect()]

[-1, 1, 3, -5, 5, 4, -4, -2, 2, -3, 0]

In [10]:
# Discard rows whose count_status is missing.
df = df.dropna(subset=["count_status"])

In [11]:
# Number of rows in each score-difference bin.
df.groupBy('binned_score_difference').count().show()

+-----------------------+------+
|binned_score_difference| count|
+-----------------------+------+
|                     -1|251268|
|                      1|255069|
|                      3|119318|
|                     -5|143507|
|                      5|150485|
|                      4| 83214|
|                     -4| 78692|
|                     -2|171465|
|                      2|177304|
|                     -3|113469|
|                      0|562198|
+-----------------------+------+



In [12]:
# Number of rows for each value of latent_pitch_type.
df.groupBy('latent_pitch_type').count().show()

+-----------------+------+
|latent_pitch_type| count|
+-----------------+------+
|              8.0| 30932|
|              0.0|763599|
|              7.0| 50315|
|              1.0|330376|
|              4.0|178452|
|              3.0|211755|
|              2.0|244465|
|             10.0|  1165|
|              6.0|109587|
|              5.0|177058|
|              9.0|  8285|
+-----------------+------+



In [13]:
# Number of rows for each value of latent_next_pitch (the prediction target).
# NOTE(review): the counts printed for this column are identical to those of
# latent_pitch_type; confirm that the next-pitch column was derived as intended.
df.groupBy('latent_next_pitch').count().show()

+-----------------+------+
|latent_next_pitch| count|
+-----------------+------+
|              8.0| 30932|
|              0.0|763599|
|              7.0| 50315|
|              1.0|330376|
|              4.0|178452|
|              3.0|211755|
|              2.0|244465|
|             10.0|  1165|
|              6.0|109587|
|              5.0|177058|
|              9.0|  8285|
+-----------------+------+



In [14]:
# List the column names currently present in df.
df.columns

['_c0',
 'outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'top',
 'score_difference',
 'latent_pitch_type',
 'latent_next_pitch',
 'count_status',
 'base_status',
 'binned_score_difference']

In [15]:
# Keep the model inputs and place latent_next_pitch last; '_c0' is not
# selected, so it is dropped from df.
df = df.select([
    'outs', 'pfx_x', 'pfx_z', 'pitch_num',
    'px', 'pz', 'start_speed',
    'sz_bot', 'sz_top',
    'x0', 'y0', 'z0',
    'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'top',
    'score_difference', 'latent_pitch_type',
    'count_status', 'base_status', 'binned_score_difference',
    'latent_next_pitch',
])

In [16]:
# Check the column order after the selection above.
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'top',
 'score_difference',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch']

In [17]:
def transData(data):
    """Reshape a DataFrame into two columns, ``label`` and ``features``.

    For every row the last value becomes ``label`` and all preceding values,
    in their original order, are packed into a dense vector stored in
    ``features``.
    """
    def to_label_and_features(row):
        target = row[-1]
        inputs = row[:-1]
        return [target, Vectors.dense(inputs)]

    labeled = data.rdd.map(to_label_and_features)
    return labeled.toDF(['label', 'features'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Convert df to the label/features layout and print the first rows as a check.
data= transData(df)
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[1.0,6.08,9.83,1....|
|  0.0|[1.0,4.54,12.83,2...|
|  0.0|[0.0,-3.71,9.05,1...|
|  7.0|[0.0,4.87,-6.37,2...|
|  7.0|[0.0,1.64,-4.12,3...|
|  0.0|[0.0,-2.47,9.54,4...|
|  6.0|[0.0,1.98,6.25,5....|
|  2.0|[2.0,-6.24,7.53,1...|
|  7.0|[2.0,2.25,-7.86,2...|
|  7.0|[2.0,-0.03,-2.33,...|
|  2.0|[2.0,-7.05,6.16,4...|
|  0.0|[2.0,-3.57,7.96,5...|
|  3.0|[1.0,-6.56,5.71,1...|
|  6.0|[1.0,-1.15,10.16,...|
|  6.0|[1.0,2.3,8.94,3.0...|
|  6.0|[1.0,-0.29,8.41,4...|
|  6.0|[1.0,1.36,8.65,5....|
|  7.0|[0.0,7.81,-6.61,1...|
|  3.0|[0.0,-4.02,7.13,2...|
|  0.0|[0.0,-3.36,12.2,3...|
+-----+--------------------+
only showing top 20 rows



#### Split data into training and test data

In [18]:
# 60/40 random split into training and test data, using a fixed seed.
splits = data.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

#### Specify layers for the neural network trained below: input layer of size 23 (features), two hidden layers of size 20 and 15, and output layer of size 11 (classes)

# Pipeline variant of the network: index the labels, train the classifier, and
# map the predicted indices back to the original label values.
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit the label indexer on the full dataset so that every label value receives
# an index, even one that happens to be absent from the training split.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# The first layer must equal the length of the feature vectors (23 columns
# precede the label) and the last layer must equal the number of classes.
layers = [23, 5, 4, 4, 3, len(labelIndexer.labels)]

# Create the trainer and set its parameters. It reads the "features" column
# produced by transData directly; no separate feature indexer is needed.
FNN = MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="features",
                                     maxIter=100, layers=layers, blockSize=128, seed=1234)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain the label indexer, the network, and the label converter in a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, FNN, labelConverter])

# Train the model on the training split. This also runs the label indexer.
model = pipeline.fit(train)


In [25]:
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Network shape, from input to output.
layers = [
    23,  # input layer: one unit per entry of the feature vector
    20,  # first hidden layer
    15,  # second hidden layer
    11,  # output layer: one unit per distinct label value
]

# Untrained classifier; the fixed seed is passed through for weight initialisation.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

#### Train the model

In [26]:
# Fit the multilayer perceptron on the training split.
model = trainer.fit(train)

#### Compute accuracy on the test set

In [27]:
# Score the held-out split, then compare the predictions with the true labels.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
print("Test set accuracy =", evaluator.evaluate(predictionAndLabels))

Test set accuracy = 0.3631949095780308


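With layers = [23, 20, 15, 6, 11], test set accuracy = 0.36312009234584086.

With layers = [23, 20, 15, 11], test set accuracy = 0.3631949095780308.

Both values are close to the share of the most frequent label in the full dataset (0.0: 763599 of 2105989 rows, about 0.3626), which suggests the network is doing little better than always predicting that label.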