#### Initialize Spark session

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.linalg.distributed import RowMatrix

In [55]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [56]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [57]:
df = df.select('pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0','outs', 'inning', 'p_throws', 'stand', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [58]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [59]:
encoder = OneHotEncoderEstimator(inputCols =['outs', 'inning','p_throws', 'stand',"latent_pitch_type",
                                             "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =['outsH','inningH','p_throwsH', 'standH', "latent_pitch_typeH",
                                              "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
encoder = encoder.fit(df)

In [60]:
def transData(data):
    return data.rdd.map(lambda r: [r[-10], Vectors.dense(r[:10]), r[-1], r[-2], r[-3], r[-4], r[-5], r[-6], r[-7],
                                  r[-8], r[-9]]).\
           toDF(['label','features', 'standH', 'pitch_numH', 'inningH', 'latent_pitch_typeH',
                 'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH', 'p_throwsH'])


In [61]:
norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)

In [62]:


assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH', 'standH', 'pitch_numH',
                                         'inningH', 'latent_pitch_typeH',
                                         'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH',
                                         'p_throwsH'],
                            outputCol = 'features_fin')



#### Pipeline

In [63]:
data = assembler.transform(norm.transform(transData(encoder.transform(df))))

In [64]:
data = data.select('label', 'features_fin')
data.take(1)

[Row(label=0.0, features_fin=SparseVector(94, {0: 0.0352, 1: 0.0569, 2: 0.0116, 3: -0.0031, 4: 0.0156, 5: 0.5426, 6: 0.0108, 7: 0.0222, 8: 0.0125, 9: 0.2895, 10: 1.0, 19: 1.0, 22: 1.0, 36: 1.0, 54: 1.0, 67: 1.0, 74: 1.0, 75: 1.0, 84: 1.0, 93: 1.0}))]

#### Split data into training and test data

In [42]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

#### specify layers for the neural network: input layer of size 11 (features), two intermediate of size 5 and 4 and output of size 7 (classes)

In [46]:
layers = [94, 20, 15, 10]

trainer = MultilayerPerceptronClassifier(maxIter=100, featuresCol = 'features_fin',
                                         layers=layers, blockSize=128, seed=1234)

#### Train the model

In [47]:
model = trainer.fit(train)

#### Compute accuracy on the test set

In [48]:
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.4203125316541679


With layers = [68, 10, 6, 10] accuracy = 0.42275469889813827

With layers = [66, 10, 5, 10] accuracy = 0.4239524407592986

With layers = [66, 5, 5, 5, 5, 10] accuracy = 0.41438654646756

With layers = [66, 5, 2, 10] accuracy = 0.415957348573834985

With layers = [66, 10] accuracy = 0.412857958928289572

With layers = [94, 10, 5, 10] accuracy = 0.4254594248683485

With layers = [94, 20, 15, 10] accuracy = 0.4203125316541679

In [158]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator.evaluate(predictionAndLabels)

0.40100858600575917

In [159]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator.evaluate(predictionAndLabels)

0.42275469889813827

In [62]:
pred = ['outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed',
        'sz_bot', 'sz_top', 'x0', 'y0','z0', 'inning', 'p_throws', 'stand']

In [65]:
assembler = VectorAssembler(
                            inputCols=[c for c in df.columns if c in pred],
                            outputCol='features').setHandleInvalid('skip')
output = assembler.transform(df)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scaleroutput = scaler.fit(output)
scaledoutput = scaleroutput.transform(output)

In [66]:
pca = PCA(k=5, inputCol="scaledFeatures", outputCol="pcaFeatures")
PC = pca.fit(scaledoutput)

In [69]:
scaledoutput = PC.transform(scaledoutput)

In [70]:
scaledoutput.take(1)

[Row(outs=1.0, pfx_x=6.08, pfx_z=9.83, px=-0.532, pz=2.702, start_speed=93.7, sz_bot=1.86, sz_top=3.83, x0=2.161, y0=50.0, z0=6.151, inning=1, p_throws=0, stand=0, latent_pitch_typeH=SparseVector(9, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {2: 1.0}), latent_next_pitch=0.0, features=DenseVector([1.0, 6.08, 9.83, -0.532, 2.702, 93.7, 1.86, 3.83, 2.161, 50.0, 6.151, 1.0, 0.0, 0.0]), scaledFeatures=DenseVector([0.0226, 1.1516, 0.9073, -0.5945, 0.4672, 0.8873, 1.8885, 1.7978, 1.6659, 0.0, 0.7366, -1.4967, -1.652, -1.176]), pcaFeatures=DenseVector([-1.7682, -2.8534, -1.4608, -1.8148, -0.4139]))]

In [76]:
layers = [5, 10, 5, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, featuresCol = 'pcaFeatures', labelCol = 'latent_next_pitch',
                                         layers =layers, blockSize=128, seed=1234)

In [78]:
splits = scaledoutput.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [None]:
model = trainer.fit(train)

In [None]:
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

#### Undersampled Training set

In [65]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = data.filter(data.label == 9.0).union(data.filter(data.label == 8.0).sample(8029/36597))\
.union(data.filter(data.label == 7.0).sample(8029/48439)).union(data.filter(data.label == 6.0).sample(8029/113084))\
.union(data.filter(data.label == 5.0).sample(8029/166967)).union(data.filter(data.label == 4.0).sample(8029/169810))\
.union(data.filter(data.label == 3.0).sample(8029/238241)).union(data.filter(data.label == 2.0).sample(8029/238362))\
.union(data.filter(data.label == 1.0).sample(8029/346583)).union(data.filter(data.label == 6.0).sample(8029/732538))
test = splits[1]

#### specify layers for the neural network: input layer of size 11 (features), two intermediate of size 5 and 4 and output of size 7 (classes)

In [66]:
layers = [94, 10, 5, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, featuresCol = 'features_fin',
                                         layers=layers, blockSize=128, seed=1234)

#### Train the model

In [None]:
model = trainer.fit(train)

#### Compute accuracy on the test set

In [None]:
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [None]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator.evaluate(predictionAndLabels)

In [None]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator.evaluate(predictionAndLabels)