#### Initialize Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [5]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

In [6]:
df.count()

2098643

In [7]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'score_difference', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [8]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [9]:
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols =["latent_pitch_type", "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =["latent_pitch_typeH", "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
model = encoder.fit(df)
df = model.transform(df)

In [10]:
df = df.select('outs','pfx_x','pfx_z','px','pz','start_speed','sz_bot','sz_top','x0',
               'y0','z0','batter_id','inning','p_throws','pitcher_id','stand','latent_pitch_typeH','pitch_numH',
               'base_statusH','binned_score_differenceH','count_statusH','latent_next_pitch')

In [11]:
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-6]), r[-2], r[-3], r[-4], r[-5], r[-6]]).\
           toDF(['label','features', 'count_statusH','binned_score_differenceH','base_statusH', 'pitch_numH',
                 'latent_pitch_typeH'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|label|            features|  count_statusH|binned_score_differenceH| base_statusH|    pitch_numH|latent_pitch_typeH|
+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|  0.0|[1.0,6.08,9.83,-0...| (11,[0],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  0.0|[1.0,4.54,12.83,-...| (11,[2],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,-3.71,9.05,-...| (11,[0],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,4.87,-6.37,0...| (11,[2],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[7],[1.0])|
|  0.0|[0.0,1.64,-4.12,0...| (11,[3],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[3],[1.0])|     (9,[7],[1.0])|
|  6.0|[0.0,-2.47,9.54,-...| (11,[8],[1.0])|            

In [12]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
data = norm.transform(data)

In [13]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH','pitch_numH','base_statusH',
                                         'binned_score_differenceH','count_statusH'], outputCol = 'features_fin')

data = assembler.transform(data)

In [14]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def weight(pitch):
    if  pitch == 0.0: return 1.0/0.36
    elif pitch == 1.0: return 1.0/0.17
    elif pitch == 2.0: return 1.0/0.12
    elif pitch == 3.0: return 1.0/0.1
    elif pitch == 4.0: return 1.0/0.08
    elif pitch == 5.0: return 1.0/0.08
    elif pitch == 6.0: return 1.0/0.05
    elif pitch == 7.0: return 1.0/0.02
    elif pitch == 8.0: return 1.0/0.01
    elif pitch == 9.0: return 1.0/0.01
    elif pitch == 10.0: return 1.0/0.01
    
udfweight = udf(weight, DoubleType())
df = df.withColumn("weights", udfweight('latent_next_pitch'))

#### Split data into training and test data

In [186]:
maj = df.filter(df['latent_next_pitch']==0.0)
minor = df.filter(df['latent_next_pitch']!=0.0)

In [98]:
from pyspark.sql.functions import rand

maj_split = maj.randomSplit([0.3, 0.7], 1234)
minor_split = minor.randomSplit([0.8, 0.2], 1234)
maj_train = maj_split[0]
maj_test = maj_split[1]
minor_train = minor_split[0]
minor_test = minor_split[1]

train_trial = maj_train.union(minor_train)
test_trial = maj_test.union(minor_test)

train_trial = train_trial.orderBy(rand())
test_trial = test_trial.orderBy(rand())

In [6]:
train_trial = transData(train_trial)
test_trial = transData(test_trial)

NameError: name 'train_trial' is not defined

In [194]:
from pyspark.ml.feature import PCA

df = transData(df)
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

In [195]:
df = model.transform(df)

In [15]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]


In [16]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features_fin", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0, family="multinomial" )

#### Fit the model

In [17]:
df.take(1)

[Row(outs=1.0, pfx_x=6.08, pfx_z=9.83, px=-0.532, pz=2.702, start_speed=93.7, sz_bot=1.86, sz_top=3.83, x0=2.161, y0=50.0, z0=6.151, batter_id=120074, inning=1, p_throws=0, pitcher_id=430935, stand=0, latent_pitch_typeH=SparseVector(9, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), count_statusH=SparseVector(11, {0: 1.0}), latent_next_pitch=0.0, weights=2.7777777777777777)]

In [18]:
train.take(1)

[Row(label=0.0, features=DenseVector([0.0, -15.78, 5.94, -2.249, 2.089, 82.8, 1.66, 3.54, -1.975, 50.0, 6.783, 457454.0, 7.0, 1.0, 607352.0, 0.0]), count_statusH=SparseVector(11, {0: 1.0}), binned_score_differenceH=SparseVector(10, {4: 1.0}), base_statusH=SparseVector(7, {0: 1.0}), pitch_numH=SparseVector(14, {1: 1.0}), latent_pitch_typeH=SparseVector(9, {3: 1.0}), features_norm=DenseVector([0.0, -0.0, 0.0, -0.0, 0.0, 0.0001, 0.0, 0.0, -0.0, 0.0, 0.0, 0.4295, 0.0, 0.0, 0.5703, 0.0]), features_fin=SparseVector(67, {1: -0.0, 2: 0.0, 3: -0.0, 4: 0.0, 5: 0.0001, 6: 0.0, 7: 0.0, 8: -0.0, 9: 0.0, 10: 0.0, 11: 0.4295, 12: 0.0, 13: 0.0, 14: 0.5703, 19: 1.0, 26: 1.0, 39: 1.0, 50: 1.0, 56: 1.0}))]

In [19]:
lrModel = lr.fit(train)

#### Print the coefficients and intercept for multinomial logistic regression

print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

#### Save model statistics

In [20]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

In [21]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 0.5982999535086339
label 1: 0.07296303022638649
label 2: 0.0001889597363519129
label 3: 0.07995847522755324
label 4: 0.04350173224374583
label 5: 1.7252831620989795e-06
label 6: 0.016696912263148442
label 7: 0.00013734142753817522
label 8: 0.005840155737486333
label 9: 0.0


In [22]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 0.8061566574768341
label 1: 0.21704710066461652
label 2: 0.0006580513280035842
label 3: 0.3980243841055899
label 4: 0.543931039227828
label 5: 0.0
label 6: 0.1311061531235322
label 7: 0.004106847045831033
label 8: 0.18597574821744856
label 9: 0.0


In [23]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.4197523559087956
label 1: 0.3701727746542455
label 2: 0.3081967213114754
label 3: 0.38942163005116137
label 4: 0.5235329177033764
label 5: 0.0
label 6: 0.3098806550097141
label 7: 0.4131944444444444
label 8: 0.3616852146263911
label 9: 0.0
Recall by label:
label 0: 0.8061566574768341
label 1: 0.21704710066461652
label 2: 0.0006580513280035842
label 3: 0.3980243841055899
label 4: 0.543931039227828
label 5: 0.0
label 6: 0.1311061531235322
label 7: 0.004106847045831033
label 8: 0.18597574821744856
label 9: 0.0
F-measure by label:
label 0: 0.5520575385491782
label 1: 0.27364512292794213
label 2: 0.0013132985448931548
label 3: 0.39367601504591215
label 4: 0.533537085064741
label 5: 0.0
label 6: 0.18425612674313063
label 7: 0.008132859486057955
label 8: 0.24564350199454127
label 9: 0.0


In [24]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.4170796658319346
FPR: 0.234667951485941
TPR: 0.4170796658319346
F-measure: 0.3404037634786078
Precision: 0.3617674850822722
Recall: 0.4170796658319346


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator()

evaluator.evaluate(predictions)

NameError: name 'predictions' is not defined

In [None]:
predictions = lrModel.transform(test)

In [None]:
testSummary = predictions.summary

#### Results for prediction

In [104]:
accuracy = testSummary.accuracy
falsePositiveRate = testSummary.weightedFalsePositiveRate
truePositiveRate = testSummary.weightedTruePositiveRate
fMeasure = testSummary.weightedFMeasure()
precision = testSummary.weightedPrecision
recall = testSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

AttributeError: 'function' object has no attribute 'accuracy'

In [105]:
test.groupby('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  8.0| 17414|
|  0.0|406407|
|  7.0| 26761|
|  1.0|179941|
|  4.0| 96871|
|  3.0|117153|
|  2.0|134783|
| 10.0|   593|
|  6.0| 59836|
|  5.0| 93880|
|  9.0|  4589|
+-----+------+



In [106]:
test.count()

1138228