#### Initialize Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [2]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [3]:
df.select('latent_next_pitch', 'latent_pitch_type').show()

+-----------------+-----------------+
|latent_next_pitch|latent_pitch_type|
+-----------------+-----------------+
|              0.0|              0.0|
|              0.0|              0.0|
|              7.0|              0.0|
|              7.0|              7.0|
|              0.0|              7.0|
|              6.0|              0.0|
|              6.0|              6.0|
|              7.0|              2.0|
|              7.0|              7.0|
|              3.0|              7.0|
|              0.0|              2.0|
|              2.0|              0.0|
|              6.0|              3.0|
|              6.0|              6.0|
|              6.0|              6.0|
|              6.0|              6.0|
|              0.0|              6.0|
|              2.0|              7.0|
|              0.0|              3.0|
|              0.0|              0.0|
+-----------------+-----------------+
only showing top 20 rows



In [4]:
df.dtypes

[('_c0', 'int'),
 ('outs', 'double'),
 ('pfx_x', 'double'),
 ('pfx_z', 'double'),
 ('pitch_num', 'double'),
 ('px', 'double'),
 ('pz', 'double'),
 ('start_speed', 'double'),
 ('sz_bot', 'double'),
 ('sz_top', 'double'),
 ('x0', 'double'),
 ('y0', 'double'),
 ('z0', 'double'),
 ('batter_id', 'int'),
 ('inning', 'int'),
 ('p_throws', 'int'),
 ('pitcher_id', 'int'),
 ('stand', 'int'),
 ('top', 'int'),
 ('score_difference', 'double'),
 ('latent_pitch_type', 'double'),
 ('latent_next_pitch', 'double'),
 ('count_status', 'int'),
 ('base_status', 'int'),
 ('binned_score_difference', 'int')]

In [134]:
df.select('binned_score_difference').distinct().rdd.map(lambda r: r[0]).collect()

[-1, 1, 3, -5, 5, 4, -4, -2, 2, -3, 0]

In [135]:
df = df.na.drop(subset=["count_status"])

In [136]:
df.groupby('binned_score_difference').count().show()

+-----------------------+------+
|binned_score_difference| count|
+-----------------------+------+
|                     -1|200888|
|                      1|204003|
|                      3| 95738|
|                     -5|115029|
|                      5|120903|
|                      4| 66887|
|                     -4| 63171|
|                     -2|137364|
|                      2|142395|
|                     -3| 90426|
|                      0|450747|
+-----------------------+------+



In [137]:
df.groupby('latent_pitch_type').count().show()

+-----------------+------+
|latent_pitch_type| count|
+-----------------+------+
|              8.0| 24866|
|              0.0|611638|
|              7.0| 40498|
|              1.0|265031|
|              4.0|143130|
|              3.0|169526|
|              2.0|195344|
|             10.0|   916|
|              6.0| 87888|
|              5.0|142323|
|              9.0|  6391|
+-----------------+------+



In [138]:
df.groupby('latent_next_pitch').count().show()

+-----------------+------+
|latent_next_pitch| count|
+-----------------+------+
|              8.0| 29491|
|              0.0|588341|
|              7.0| 39122|
|              1.0|279071|
|              4.0|136854|
|              3.0|190917|
|              2.0|191363|
|             10.0|   854|
|              6.0| 90903|
|              5.0|134291|
|              9.0|  6344|
+-----------------+------+



In [139]:
df.groupby('count_status').count().show()

+------------+------+
|count_status| count|
+------------+------+
|           1|191344|
|           6| 15064|
|           3|184912|
|           5| 93422|
|           9| 24083|
|           4| 63691|
|           8|127543|
|           7| 91912|
|          10|100675|
|          11| 31991|
|           2|240205|
|           0|522709|
+------------+------+



In [140]:
df.columns

['_c0',
 'outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'top',
 'score_difference',
 'latent_pitch_type',
 'latent_next_pitch',
 'count_status',
 'base_status',
 'binned_score_difference']

In [5]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [6]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch']

In [7]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [8]:
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols =["latent_pitch_type", "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =["latent_pitch_typeH", "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
model = encoder.fit(df)
df = model.transform(df)

In [9]:
df.select('latent_pitch_typeH').take(5)

[Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {7: 1.0})),
 Row(latent_pitch_typeH=SparseVector(9, {7: 1.0}))]

In [10]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch',
 'count_statusH',
 'base_statusH',
 'binned_score_differenceH',
 'latent_pitch_typeH',
 'pitch_numH']

In [11]:
df = df.select('outs','pfx_x','pfx_z','px','pz','start_speed','sz_bot','sz_top','x0',
               'y0','z0','batter_id','inning','p_throws','pitcher_id','stand','latent_pitch_typeH','pitch_numH',
               'base_statusH','binned_score_differenceH','count_statusH','latent_next_pitch')

In [146]:
df.select('latent_pitch_typeH','pitch_numH','base_statusH','binned_score_differenceH','count_statusH').take(4)

[Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {2: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {}), count_statusH=SparseVector(11, {0: 1.0})),
 Row(latent_pitch_typeH=SparseVector(10, {7: 1.0}), pitch_numH=SparseVector(14, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {}), count_statusH=SparseVector(11, {2: 1.0}))]

In [12]:
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-6]), r[-2], r[-3], r[-4], r[-5], r[-6]]).\
           toDF(['label','features', 'count_statusH','binned_score_differenceH','base_statusH', 'pitch_numH',
                 'latent_pitch_typeH'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|label|            features|  count_statusH|binned_score_differenceH| base_statusH|    pitch_numH|latent_pitch_typeH|
+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|  0.0|[1.0,6.08,9.83,-0...| (11,[0],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  0.0|[1.0,4.54,12.83,-...| (11,[2],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,-3.71,9.05,-...| (11,[0],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,4.87,-6.37,0...| (11,[2],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[7],[1.0])|
|  0.0|[0.0,1.64,-4.12,0...| (11,[3],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[3],[1.0])|     (9,[7],[1.0])|
|  6.0|[0.0,-2.47,9.54,-...| (11,[8],[1.0])|            

In [13]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
data = norm.transform(data)

In [151]:
data.select('features_norm').take(1)

[Row(features_norm=DenseVector([0.0, 0.0, 0.0, -0.0, 0.0, 0.0002, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.2178, 0.0, 0.0, 0.7818, 0.0]))]

In [14]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH','pitch_numH','base_statusH',
                                         'binned_score_differenceH','count_statusH'], outputCol = 'features_fin')

data = assembler.transform(data)

In [15]:
data = data.select('label', 'features_fin')
data.select('features_fin').take(1)

[Row(features_fin=SparseVector(67, {0: 0.0, 1: 0.0, 2: 0.0, 3: -0.0, 4: 0.0, 5: 0.0002, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0001, 10: 0.0, 11: 0.2178, 12: 0.0, 14: 0.7818, 16: 1.0, 26: 1.0, 39: 1.0, 50: 1.0, 56: 1.0}))]

In [232]:
from pyspark.ml.feature import PCA

pca = PCA(k=2, inputCol="features_norm", outputCol="pcaFeatures")
model = pca.fit(data)

In [233]:
data = model.transform(data)

#### Split data into training and test data

In [16]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [17]:
train = data.filter(data['label'] == 9.0).sample(1165/8285)\
.union(data.filter(data['label'] == 8.0).sample(1165/30932)).union(data.filter(data['label'] == 7.0).sample(1165/50315))\
.union(data.filter(data['label'] == 6.0).sample(1165/109587)).union(data.filter(data['label'] == 5.0).sample(1165/177058))\
.union(data.filter(data['label'] == 4.0).sample(1165/178452)).union(data.filter(data['label'] == 3.0).sample(1165/211755))\
.union(data.filter(data['label'] == 2.0).sample(1165/244465)).union(data.filter(data['label'] == 1.0).sample(1165/330376))\
.union(data.filter(data['label'] == 0.0).sample(1165/763599))

In [222]:
df.columns

['outs',
 'pfx_x',
 'pfx_z',
 'pitch_num',
 'px',
 'pz',
 'start_speed',
 'sz_bot',
 'sz_top',
 'x0',
 'y0',
 'z0',
 'batter_id',
 'inning',
 'p_throws',
 'pitcher_id',
 'stand',
 'latent_pitch_type',
 'count_status',
 'base_status',
 'binned_score_difference',
 'latent_next_pitch']

#### specify layers for the neural network: input layer of size 11 (features), two intermediate of size 5 and 4 and output of size 7 (classes)

# Currently not working so experimenting with other code
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

layers = [11, 5, 4, 4, 3 , 11]

# create the trainer and set its parameters
FNN = MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",\
                                         maxIter=100, layers=layers, blockSize=128, seed=1234)
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
# Chain indexers and forest in a Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, FNN, labelConverter])
# train the model
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)


In [18]:
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

layers = [67, 10, 5, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, featuresCol = 'features_fin',
                                         layers=layers, blockSize=128, seed=1234)

#### Train the model

In [19]:
model = trainer.fit(train)

#### Compute accuracy on the test set

In [None]:
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

With layers = [23, 20, 15, 6, 11] accuracy = 0.36312009234584086

With layers = [23, 20, 15, 11] accuracy = 0.3631949095780308

With layers = [22, 30, 15, 11] accuracy = 0.3613826699538746

With layers = [68, 10, 5, 11] accuracy = 0.42275469889813827

In [158]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator.evaluate(predictionAndLabels)

0.40100858600575917

In [159]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator.evaluate(predictionAndLabels)

0.42275469889813827