#### Initialize Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [2]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [3]:
df.count()

2098636

In [4]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'score_difference', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [5]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [6]:
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols =["latent_pitch_type", "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =["latent_pitch_typeH", "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
model = encoder.fit(df)
df = model.transform(df)

In [7]:
df = df.select('outs','pfx_x','pfx_z','px','pz','start_speed','sz_bot','sz_top','x0',
               'y0','z0','batter_id','inning','p_throws','pitcher_id','stand','latent_pitch_typeH','pitch_numH',
               'base_statusH','binned_score_differenceH','count_statusH','latent_next_pitch')

In [8]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def weight(pitch):
    if  pitch == 0.0: return 1.0/0.36
    elif pitch == 1.0: return 1.0/0.17
    elif pitch == 2.0: return 1.0/0.12
    elif pitch == 3.0: return 1.0/0.1
    elif pitch == 4.0: return 1.0/0.08
    elif pitch == 5.0: return 1.0/0.08
    elif pitch == 6.0: return 1.0/0.05
    elif pitch == 7.0: return 1.0/0.02
    elif pitch == 8.0: return 1.0/0.01
    elif pitch == 9.0: return 1.0/0.01
    elif pitch == 10.0: return 1.0/0.01
    
udfweight = udf(weight, DoubleType())
df = df.withColumn("weights", udfweight('latent_next_pitch'))

In [19]:
df.show()

+----+-----+-----+--------------------+------------------+-----------+------+------+-------------------+----+------------------+---------+------+--------+----------+-----+------------------+--------------+-------------+------------------------+---------------+-----------------+------------------+
|outs|pfx_x|pfx_z|                  px|                pz|start_speed|sz_bot|sz_top|                 x0|  y0|                z0|batter_id|inning|p_throws|pitcher_id|stand|latent_pitch_typeH|    pitch_numH| base_statusH|binned_score_differenceH|  count_statusH|latent_next_pitch|           weights|
+----+-----+-----+--------------------+------------------+-----------+------+------+-------------------+----+------------------+---------+------+--------+----------+-----+------------------+--------------+-------------+------------------------+---------------+-----------------+------------------+
| 1.0| 6.08| 9.83|              -0.532|             2.702|       93.7|  1.86|  3.83|              2.161|50

In [9]:
def transData(data):
    return data.rdd.map(lambda r: [r[-2], Vectors.dense(r[:-7]), r[-3], r[-4], r[-5], r[-6], r[-7], r[-1]]).\
           toDF(['label','features', 'count_statusH','binned_score_differenceH','base_statusH', 'pitch_numH',
                 'latent_pitch_typeH', 'weights'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+------------------+
|label|            features|  count_statusH|binned_score_differenceH| base_statusH|    pitch_numH|latent_pitch_typeH|           weights|
+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+------------------+
|  0.0|[1.0,6.08,9.83,-0...| (11,[2],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(15,[2],[1.0])|     (9,[0],[1.0])|2.7777777777777777|
|  0.0|[1.0,4.54,12.83,-...| (11,[5],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(15,[3],[1.0])|     (9,[0],[1.0])|2.7777777777777777|
|  7.0|[0.0,-3.71,9.05,-...| (11,[2],[1.0])|              (10,[],[])|(7,[0],[1.0])|(15,[2],[1.0])|     (9,[0],[1.0])|              50.0|
|  7.0|[0.0,4.87,-6.37,0...| (11,[3],[1.0])|              (10,[],[])|(7,[0],[1.0])|(15,[3],[1.0])|     (9,[7],[1.0])|              50.0|
|  0.0|[0.0,1.64,-4.12,0...| (11,[8],[1.0

In [10]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
data = norm.transform(data)

In [11]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH','pitch_numH','base_statusH',
                                         'binned_score_differenceH','count_statusH'], outputCol = 'features_fin')

data = assembler.transform(data)

#### Split data into training and test data

In [186]:
maj = df.filter(df['latent_next_pitch']==0.0)
minor = df.filter(df['latent_next_pitch']!=0.0)

In [98]:
from pyspark.sql.functions import rand

maj_split = maj.randomSplit([0.3, 0.7], 1234)
minor_split = minor.randomSplit([0.8, 0.2], 1234)
maj_train = maj_split[0]
maj_test = maj_split[1]
minor_train = minor_split[0]
minor_test = minor_split[1]

train_trial = maj_train.union(minor_train)
test_trial = maj_test.union(minor_test)

train_trial = train_trial.orderBy(rand())
test_trial = test_trial.orderBy(rand())

In [6]:
train_trial = transData(train_trial)
test_trial = transData(test_trial)

NameError: name 'train_trial' is not defined

In [194]:
from pyspark.ml.feature import PCA

df = transData(df)
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

In [195]:
df = model.transform(df)

In [12]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]


In [13]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features_fin", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0, family="multinomial", weightCol = 'weights')

#### Fit the model

In [14]:
df.take(1)

[Row(outs=1.0, pfx_x=6.08, pfx_z=9.83, px=-0.532, pz=2.702, start_speed=93.7, sz_bot=1.86, sz_top=3.83, x0=2.161, y0=50.0, z0=6.151, batter_id=120074, inning=1, p_throws=0, pitcher_id=430935, stand=0, latent_pitch_typeH=SparseVector(9, {0: 1.0}), pitch_numH=SparseVector(15, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {2: 1.0}), latent_next_pitch=0.0, weights=2.7777777777777777)]

In [15]:
train.take(1)

[Row(label=0.0, features=DenseVector([0.0, -15.78, 5.94, -2.249, 2.089, 82.8, 1.66, 3.54, -1.975, 50.0, 6.783, 457454.0, 7.0, 1.0, 607352.0, 0.0]), count_statusH=SparseVector(11, {1: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), pitch_numH=SparseVector(15, {2: 1.0}), latent_pitch_typeH=SparseVector(9, {3: 1.0}), weights=2.7777777777777777, features_norm=DenseVector([0.0, -0.0, 0.0, -0.0, 0.0, 0.0001, 0.0, 0.0, -0.0, 0.0, 0.0, 0.4295, 0.0, 0.0, 0.5703, 0.0]), features_fin=SparseVector(68, {1: -0.0, 2: 0.0, 3: -0.0, 4: 0.0, 5: 0.0001, 6: 0.0, 7: 0.0, 8: -0.0, 9: 0.0, 10: 0.0, 11: 0.4295, 12: 0.0, 13: 0.0, 14: 0.5703, 19: 1.0, 27: 1.0, 40: 1.0, 51: 1.0, 58: 1.0}))]

In [16]:
lrModel = lr.fit(train)

#### Print the coefficients and intercept for multinomial logistic regression

print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

#### Save model statistics

In [17]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

In [18]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 0.07194978499504585
label 1: 0.12156024147363957
label 2: 0.08153455956238347
label 3: 0.07885883798666518
label 4: 0.04329297101392945
label 5: 0.07105590512191881
label 6: 0.03688869267144639
label 7: 0.017252252837990714
label 8: 0.27249991111226035
label 9: 0.009252093855076299


In [19]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 0.14901141660965558
label 1: 0.32804408508999905
label 2: 0.2306238846705623
label 3: 0.38692985952326014
label 4: 0.5388199673633093
label 5: 0.2380154574918973
label 6: 0.31501314880926146
label 7: 0.29389483833201335
label 8: 0.6040140172029309
label 9: 0.6639275179098187


In [20]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.5264886568962056
label 1: 0.3477477845178999
label 2: 0.26577470240975515
label 3: 0.3859394615706514
label 4: 0.5223424024396055
label 5: 0.22466019051922137
label 6: 0.32789943724002935
label 7: 0.2867654470315492
label 8: 0.03786644602633418
label 9: 0.21348238482384824
Recall by label:
label 0: 0.14901141660965558
label 1: 0.32804408508999905
label 2: 0.2306238846705623
label 3: 0.38692985952326014
label 4: 0.5388199673633093
label 5: 0.2380154574918973
label 6: 0.31501314880926146
label 7: 0.29389483833201335
label 8: 0.6040140172029309
label 9: 0.6639275179098187
F-measure by label:
label 0: 0.23228071667215827
label 1: 0.3376086897299814
label 2: 0.24695474931713693
label 3: 0.38643402597130516
label 4: 0.530453254363426
label 5: 0.2311450721753321
label 6: 0.3213271491619149
label 7: 0.29028637507652544
label 8: 0.07126518251232321
label 9: 0.3230800779247411


In [21]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.2755816049207258
FPR: 0.07972685555108189
TPR: 0.2755816049207258
F-measure: 0.2965007673905309
Precision: 0.4011340963039981
Recall: 0.2755816049207258


In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

predictions = lrModel.transform(test)
evaluator.evaluate(predictions)

0.27522829730666976

Test Data accuracy with weights 0.27522829730666976

In [None]:
testSummary = predictions.summary

#### Results for prediction

In [104]:
accuracy = testSummary.accuracy
falsePositiveRate = testSummary.weightedFalsePositiveRate
truePositiveRate = testSummary.weightedTruePositiveRate
fMeasure = testSummary.weightedFMeasure()
precision = testSummary.weightedPrecision
recall = testSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

AttributeError: 'function' object has no attribute 'accuracy'

In [105]:
test.groupby('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  8.0| 17414|
|  0.0|406407|
|  7.0| 26761|
|  1.0|179941|
|  4.0| 96871|
|  3.0|117153|
|  2.0|134783|
| 10.0|   593|
|  6.0| 59836|
|  5.0| 93880|
|  9.0|  4589|
+-----+------+



In [106]:
test.count()

1138228

In [23]:
pred = ['outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed',
        'sz_bot', 'sz_top', 'x0', 'y0','z0', 'inning', 'p_throws', 'stand']

In [24]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

assembler = VectorAssembler(
                            inputCols=[c for c in df.columns if c in pred],
                            outputCol='features').setHandleInvalid('skip')
output = assembler.transform(df)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scaleroutput = scaler.fit(output)
scaledoutput = scaleroutput.transform(output)

In [25]:
pca = PCA(k=5, inputCol="scaledFeatures", outputCol="pcaFeatures")
PC = pca.fit(scaledoutput)

In [26]:
scaledoutput = PC.transform(scaledoutput)

In [27]:
scaledoutput.take(1)

[Row(outs=1.0, pfx_x=6.08, pfx_z=9.83, px=-0.532, pz=2.702, start_speed=93.7, sz_bot=1.86, sz_top=3.83, x0=2.161, y0=50.0, z0=6.151, batter_id=120074, inning=1, p_throws=0, pitcher_id=430935, stand=0, latent_pitch_typeH=SparseVector(9, {0: 1.0}), pitch_numH=SparseVector(15, {2: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {2: 1.0}), latent_next_pitch=0.0, weights=2.7777777777777777, features=DenseVector([1.0, 6.08, 9.83, -0.532, 2.702, 93.7, 1.86, 3.83, 2.161, 50.0, 6.151, 1.0, 0.0, 0.0]), scaledFeatures=DenseVector([0.0229, 1.1514, 0.9067, -0.5944, 0.4667, 0.8868, 1.8927, 1.7975, 1.6667, 0.0, 0.7376, -1.4967, -1.6522, -1.1755]), pcaFeatures=DenseVector([-1.7737, -2.8571, -1.4434, -1.8198, -0.413]))]

In [28]:
lr = LogisticRegression(labelCol="latent_next_pitch", featuresCol="pcaFeatures", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0, family="multinomial", weightCol = 'weights')

In [29]:
splits = scaledoutput.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [30]:
lrModel = lr.fit(train)

In [31]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

In [32]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 6.216229722597702e-05
label 1: 0.0017388655972056324
label 2: 0.0
label 3: 0.04792352012352567
label 4: 0.09028928599494468
label 5: 0.0
label 6: 0.0
label 7: 0.055790028908898365
label 8: 0.7749497699033222
label 9: 0.01864751517895143


In [33]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 7.971648263547247e-05
label 1: 0.002375433420815693
label 2: 0.0
label 3: 0.05381225517627308
label 4: 0.1849064039408867
label 5: 0.0
label 6: 0.0
label 7: 0.06360071546505229
label 8: 0.882928368570527
label 9: 0.04800667919014819


In [34]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.4069767441860465
label 1: 0.21351700387430048
label 2: 0.0
label 3: 0.1257005604483587
label 4: 0.15218695772043916
label 5: 0.0
label 6: 0.0
label 7: 0.02622918262547167
label 8: 0.020017660838317478
label 9: 0.009734625640157446
Recall by label:
label 0: 7.971648263547247e-05
label 1: 0.002375433420815693
label 2: 0.0
label 3: 0.05381225517627308
label 4: 0.1849064039408867
label 5: 0.0
label 6: 0.0
label 7: 0.06360071546505229
label 8: 0.882928368570527
label 9: 0.04800667919014819
F-measure by label:
label 0: 0.00015940174248876216
label 1: 0.00469859373741871
label 2: 0.0
label 3: 0.07536209168254465
label 4: 0.16695874958856338
label 5: 0.0
label 6: 0.0
label 7: 0.037141192776650664
label 8: 0.039147767531897755
label 9: 0.01618692378070237


In [35]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.03863233422443547
FPR: 0.028033482228509442
TPR: 0.03863233422443547
F-measure: 0.024451888348187328
Precision: 0.20479665909090616
Recall: 0.03863233422443547


## With PCA and Cross Validation 

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")

In [None]:
lrpca = LogisticRegression(labelCol="label", featuresCol="pcaFeatures",
                        family="multinomial")

In [None]:
paramGrid = ParamGridBuilder().addGrid(lrpca.regParam, [0.1, 0.01]).addGrid(lrpca.elasticNetParam, [0, 1]).addGrid(lrpca.maxIter, [1, 5, 10]).build()

In [None]:
crossval = CrossValidator(
    estimator=lrpca,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5)

In [None]:
lrModelpca = crossval.fit(train)

In [None]:
predictionspca = lrModelpca.transform(test)

In [None]:
accuracypca = evaluatortwo.evaluate(predictionspca)
print("Accuracy = %g" % (accuracypca))

In [None]:
precisionpca = evaluatorthree.evaluate(predictionspca)
print("Weighted Precision = %g" % precisionpca)

In [None]:
recallpca = evaluatorfour.evaluate(predictionspca)
print("Weighted Recall = %g" % recallpca)

In [None]:
f1pca = evaluator.evaluate(predictionspca)
print("F1 = %g" % f1pca)