#### Initialize Spark session

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [9]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [10]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'score_difference', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [11]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [12]:
encoder = OneHotEncoderEstimator(inputCols =['outs', 'inning','p_throws', 'stand',"latent_pitch_type",
                                             "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =['outsH','inningH','p_throwsH', 'standH', "latent_pitch_typeH",
                                              "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
encoder = encoder.fit(df)

In [13]:
def weight(pitch):
    if  pitch == 0.0: return 1.0/0.36
    elif pitch == 1.0: return 1.0/0.17
    elif pitch == 2.0: return 1.0/0.12
    elif pitch == 3.0: return 1.0/0.1
    elif pitch == 4.0: return 1.0/0.08
    elif pitch == 5.0: return 1.0/0.08
    elif pitch == 6.0: return 1.0/0.05
    elif pitch == 7.0: return 1.0/0.02
    elif pitch == 8.0: return 1.0/0.01
    elif pitch == 9.0: return 1.0/0.01
    elif pitch == 10.0: return 1.0/0.01
    
udfweight = udf(weight, DoubleType())
df = df.withColumn("weights", udfweight('latent_next_pitch'))

In [14]:
df.select('pfx_x','pfx_z','pitch_num','px','pz','start_speed','sz_bot','sz_top','x0','y0','z0','outs',
          'batter_id','inning', 'p_throws','pitcher_id','stand', 'score_difference', 'latent_pitch_type',
          'count_status', 'base_status','binned_score_difference', 'latent_next_pitch', 'weights')

DataFrame[pfx_x: double, pfx_z: double, pitch_num: double, px: double, pz: double, start_speed: double, sz_bot: double, sz_top: double, x0: double, y0: double, z0: double, outs: double, batter_id: int, inning: int, p_throws: int, pitcher_id: int, stand: int, score_difference: double, latent_pitch_type: double, count_status: int, base_status: int, binned_score_difference: int, latent_next_pitch: double, weights: double]

In [15]:
def transData(data):
    return data.rdd.map(lambda r: [r[-11], Vectors.dense(r[:10]), r[-1], r[-2], r[-3], r[-4], r[-5], r[-6], r[-7],
                                  r[-8], r[-9], r[-10]]).\
           toDF(['label','features', 'standH', 'pitch_numH', 'inningH', 'latent_pitch_typeH',
                 'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH', 'p_throwsH', 'weights'])


In [16]:
norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)

In [17]:
assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH', 'standH', 'pitch_numH',
                                         'inningH', 'latent_pitch_typeH',
                                         'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH',
                                         'p_throwsH'],
                            outputCol = 'features_fin')



#### Pipeline

In [18]:
data = assembler.transform(norm.transform(transData(encoder.transform(df))))

In [19]:
data = data.select('label', 'features_fin', 'weights')
data.take(1)

[Row(label=0.0, features_fin=SparseVector(94, {0: 0.0081, 1: 0.0492, 2: 0.0795, 3: 0.0162, 4: -0.0043, 5: 0.0218, 6: 0.7575, 7: 0.015, 8: 0.031, 9: 0.0175, 10: 1.0, 19: 1.0, 22: 1.0, 36: 1.0, 54: 1.0, 67: 1.0, 74: 1.0, 75: 1.0, 84: 1.0, 93: 1.0}), weights=2.7777777777777777)]

#### Split data into training and test data

In [20]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]


In [21]:
lr = LogisticRegression(labelCol="label", featuresCol="features_fin", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0, family="multinomial", weightCol = 'weights')

#### Fit the model

In [22]:
lrModel = lr.fit(train)

#### Save model statistics

In [23]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

In [24]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 0.07207912963261243
label 1: 0.1228666865073894
label 2: 0.08200473941289314
label 3: 0.0799067781125613
label 4: 0.043635873663262686
label 5: 0.07053141180144477
label 6: 0.03687442400761443
label 7: 0.01746843426134221
label 8: 0.2751180590796402
label 9: 0.002164586692254908


In [25]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 0.14775679416691478
label 1: 0.33026759072095485
label 2: 0.23250638580776095
label 3: 0.39841133323544015
label 4: 0.5451212079507697
label 5: 0.23530291697830966
label 6: 0.3149396917742812
label 7: 0.3002651423849041
label 8: 0.6002366540754562
label 9: 0.696165191740413


In [26]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.5239323334327324
label 1: 0.3468558402369556
label 2: 0.2662392819937495
label 3: 0.38974772901389954
label 4: 0.5232747964104064
label 5: 0.2239549717149474
label 6: 0.32793330273825916
label 7: 0.2886079300986298
label 8: 0.0372937237736532
label 9: 0.5488372093023256
Recall by label:
label 0: 0.14775679416691478
label 1: 0.33026759072095485
label 2: 0.23250638580776095
label 3: 0.39841133323544015
label 4: 0.5451212079507697
label 5: 0.23530291697830966
label 6: 0.3149396917742812
label 7: 0.3002651423849041
label 8: 0.6002366540754562
label 9: 0.696165191740413
F-measure by label:
label 0: 0.23050711636515783
label 1: 0.33835852503432995
label 2: 0.24823206380633955
label 3: 0.3940319150775401
label 4: 0.5339746460021473
label 5: 0.22948874440137917
label 6: 0.3213051852184926
label 7: 0.2943211543329677
label 8: 0.0702242928452579
label 9: 0.6137841352405723


In [27]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.27751889853742273
FPR: 0.08016902170843816
TPR: 0.27751889853742273
F-measure: 0.2983342102476528
Precision: 0.401896255801729
Recall: 0.27751889853742273


In [28]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

predictions = lrModel.transform(test)
evaluator.evaluate(predictions)

0.27772607183246795

Test Data accuracy with weights 0.27522829730666976

#### Logistic Reggression using PCA

In [29]:
pred = ['outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed',
        'sz_bot', 'sz_top', 'x0', 'y0','z0', 'inning', 'p_throws', 'stand']

In [30]:
assembler = VectorAssembler(
                            inputCols=[c for c in df.columns if c in pred],
                            outputCol='features').setHandleInvalid('skip')
output = assembler.transform(df)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scaleroutput = scaler.fit(output)
scaledoutput = scaleroutput.transform(output)

In [31]:
pca = PCA(k=5, inputCol="scaledFeatures", outputCol="pcaFeatures")
PC = pca.fit(scaledoutput)

In [32]:
scaledoutput = PC.transform(scaledoutput)

In [33]:
scaledoutput.take(1)

[Row(outs=1.0, pfx_x=6.08, pfx_z=9.83, pitch_num=2.0, px=-0.532, pz=2.702, start_speed=93.7, sz_bot=1.86, sz_top=3.83, x0=2.161, y0=50.0, z0=6.151, batter_id=120074, inning=1, p_throws=0, pitcher_id=430935, stand=0, score_difference=-1.0, latent_pitch_type=0.0, count_status=2, base_status=0, binned_score_difference=4, latent_next_pitch=0.0, weights=2.7777777777777777, features=DenseVector([1.0, 6.08, 9.83, 2.0, -0.532, 2.702, 93.7, 1.86, 3.83, 2.161, 50.0, 6.151, 1.0, 0.0, 0.0]), scaledFeatures=DenseVector([0.0229, 1.1514, 0.9067, -1.0167, -0.5944, 0.4667, 0.8868, 1.8927, 1.7975, 1.6667, 0.0, 0.7376, -1.4967, -1.6522, -1.1755]), pcaFeatures=DenseVector([-1.7811, -2.8995, -1.4889, -1.8688, -0.383]))]

In [34]:
lr = LogisticRegression(labelCol="latent_next_pitch", featuresCol="pcaFeatures", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0, family="multinomial", weightCol = 'weights')

In [35]:
splits = scaledoutput.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [36]:
lrModel = lr.fit(train)

In [37]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

In [38]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 4.8752663114222614e-05
label 1: 0.0019330044085063614
label 2: 0.0
label 3: 0.04772848384973288
label 4: 0.08876871980780623
label 5: 0.0
label 6: 0.0
label 7: 0.057678256933150956
label 8: 0.774735981159234
label 9: 0.018773531671517792


In [39]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 6.377827028775389e-05
label 1: 0.0023228748090213754
label 2: 0.0
label 3: 0.053384542198343404
label 4: 0.18119833285710063
label 5: 0.0
label 6: 0.0
label 7: 0.0645216792842395
label 8: 0.8824643694750135
label 9: 0.050031269543464665


In [40]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.4117647058823529
label 1: 0.19276629570747217
label 2: 0.0
label 3: 0.1252585272971997
label 4: 0.1517527066279377
label 5: 0.0
label 6: 0.0
label 7: 0.025739937400472243
label 8: 0.02000286251737957
label 9: 0.010086152553057365
Recall by label:
label 0: 6.377827028775389e-05
label 1: 0.0023228748090213754
label 2: 0.0
label 3: 0.053384542198343404
label 4: 0.18119833285710063
label 5: 0.0
label 6: 0.0
label 7: 0.0645216792842395
label 8: 0.8824643694750135
label 9: 0.050031269543464665
F-measure by label:
label 0: 0.0001275367863918249
label 1: 0.004590433914315056
label 2: 0.0
label 3: 0.07486290014028822
label 4: 0.16517345907713035
label 5: 0.0
label 6: 0.0
label 7: 0.03679934055581724
label 8: 0.03911901248785403
label 9: 0.016787912702853947


In [41]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.03828139824960758
FPR: 0.027948128742670074
TPR: 0.03828139824960758
F-measure: 0.02421341585935951
Precision: 0.2029140264961137
Recall: 0.03828139824960758


In [43]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

predictions = lrModel.transform(test)
predictions = predictions.withColumnRenamed('latent_next_pitch', 'label')
evaluator.evaluate(predictions)

0.03841162513838457

## With PCA and Cross Validation 

In [44]:
lrpca = LogisticRegression(labelCol="label", featuresCol="pcaFeatures",
                        family="multinomial")

In [45]:
paramGrid = ParamGridBuilder().addGrid(lrpca.regParam, [0.1, 0.01])\
.addGrid(lrpca.elasticNetParam, [0, 1])\
.addGrid(lrpca.maxIter, [1, 5, 10]).build()

In [46]:
crossval = CrossValidator(
    estimator=lrpca,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5)

In [48]:
train = train.withColumnRenamed('latent_next_pitch', 'label')

In [49]:
lrModelpca = crossval.fit(train)
lrModelpca = lrModelpca.bestModel

In [50]:
test = test.withColumnRenamed('latent_next_pitch', 'label')

In [51]:
predictionspca = lrModelpca.transform(test)

In [53]:
accuracypca = evaluator.evaluate(predictionspca)
print("Accuracy = %g" % (accuracypca))

Accuracy = 0.350325


In [54]:
precisionpca = MulticlassClassificationEvaluator(metricName="weightedPrecision").evaluate(predictionspca)
print("Weighted Precision = %g" % precisionpca)

Weighted Precision = 0.190237


In [55]:
recallpca = MulticlassClassificationEvaluator(metricName="weightedRecall").evaluate(predictionspca)
print("Weighted Recall = %g" % recallpca)

Weighted Recall = 0.350325


In [56]:
f1pca = MulticlassClassificationEvaluator(metricName="f1").evaluate(predictionspca)
print("F1 = %g" % f1pca)

F1 = 0.183037
