#### Initialize Spark session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [33]:
pitches = spark.read.option("inferSchema", "true").csv('Data/pitches.csv', header = True)
atbats = spark.read.option("inferSchema", "true").csv('Data/atbats.csv', header = True).select("ab_id", "batter_id", 
                                                                                               "inning", "p_score", 
                                                                                               "p_throws", "pitcher_id",
                                                                                               "stand", "top")

df = pitches.join(atbats, "ab_id")

#### Drop unnecessary variables

In [34]:
df = df.drop("ax", "ay", "az", "batter_id", "break_angle", "break_length", "break_y", "code", "event", "g_id", "o",
            
             "p_throws", 'nasty',"pfx_x", "pfx_z", "px", "pz", "spin_dir", "end_speed", "start_speed"
             
             "sz_bot", "sz_top", "vx0", "vy0", "vz0", "x", "x0", "y", "y0", "z", "z0", "zone", "spin_rate")

#### Create new variable score_difference

In [35]:
df=df.withColumn("score_difference", df.p_score-df.b_score)

#### Remove low frequency observations (look at pitch_type to decide which ones to remove)

In [36]:
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import functions as f
from pyspark.sql.functions import col

In [37]:
df = df.filter(
    (col('pitch_type') != 'UN') &
    (col('pitch_type') != 'EP') &
    (col('pitch_type') != 'AB') &
    (col('pitch_type') != 'FA') &
    (col('pitch_type') != 'IN') &
    (col('pitch_type') != 'SC'))

#### FO and PO are the same so consolidate

In [38]:
df=df.na.replace(['FO'], ['PO'], 'pitch_type')

#### Create new column that is a latent variable based on pitch_type

In [39]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator

indexer = StringIndexer(inputCol="pitch_type", outputCol="latent_pitch_type")
df = indexer.fit(df).transform(df)
df.head(2)

[Row(ab_id=2015000044, b_count=0, b_score=0, on_1b=False, on_2b=True, on_3b=False, outs=1, pitch_num=1, pitch_type='FC', s_count=0, start_speed=84.6, sz_bot=1.52, type='B', type_confidence=2.0, inning=5, p_score=3, pitcher_id=425794, stand='L', top=False, score_difference=3, latent_pitch_type=6.0),
 Row(ab_id=2015000044, b_count=1, b_score=0, on_1b=False, on_2b=True, on_3b=False, outs=1, pitch_num=2, pitch_type='FC', s_count=0, start_speed=88.4, sz_bot=1.52, type='B', type_confidence=2.0, inning=5, p_score=3, pitcher_id=425794, stand='L', top=False, score_difference=3, latent_pitch_type=6.0)]

In [40]:
df.groupBy("latent_pitch_type").count().show()
df.groupBy("pitch_type").count().show()

+-----------------+-------+
|latent_pitch_type|  count|
+-----------------+-------+
|              8.0|  43705|
|              0.0|1014880|
|              7.0|  66484|
|              1.0| 450581|
|              4.0| 242506|
|              3.0| 292789|
|              2.0| 337983|
|             10.0|   1438|
|              6.0| 149756|
|              5.0| 234391|
|              9.0|  11260|
+-----------------+-------+

+----------+-------+
|pitch_type|  count|
+----------+-------+
|        FT| 337983|
|        SL| 450581|
|        FC| 149756|
|        FF|1014880|
|        FS|  43705|
|        PO|   1438|
|        KC|  66484|
|        CH| 292789|
|        CU| 234391|
|        KN|  11260|
|        SI| 242506|
+----------+-------+



In [41]:
## udf_latent_base = udf(lambda z: if)

#### Create new column that is latent variable based on balls and strikes

In [42]:
def count_status(b_count, s_count):
    if   b_count==0 and s_count==0: return 0
    elif b_count==1 and s_count==0: return 1
    elif b_count==0 and s_count==1: return 2
    elif b_count==1 and s_count==1: return 3
    elif b_count==2 and s_count==0: return 4
    elif b_count==0 and s_count==2: return 5
    elif b_count==3 and s_count==0: return 6
    elif b_count==2 and s_count==1: return 7
    elif b_count==1 and s_count==2: return 8
    elif b_count==3 and s_count==1: return 9
    elif b_count==2 and s_count==2: return 10
    elif b_count==3 and s_count==2: return 11
    
udfcount_status = udf(count_status, IntegerType())
df = df.withColumn("count_status", udfcount_status("b_count", "s_count"))

#### Create new column that is latent variable based on on_1b, on_2b, and on_3b

In [43]:
def base_status(on_1b, on_2b, on_3b):
    if   on_1b==0 and on_2b==0 and on_3b==0: return 0
    elif on_1b==1 and on_2b==0 and on_3b==0: return 1
    elif on_1b==0 and on_2b==1 and on_3b==0: return 2
    elif on_1b==0 and on_2b==0 and on_3b==1: return 3
    elif on_1b==1 and on_2b==1 and on_3b==0: return 4
    elif on_1b==1 and on_2b==0 and on_3b==1: return 5
    elif on_1b==0 and on_2b==1 and on_3b==1: return 6
    elif on_1b==1 and on_2b==1 and on_3b==1: return 7
    
udfbase_status = udf(base_status, IntegerType())
df = df.withColumn("base_status", udfbase_status("on_1b", "on_2b", "on_3b"))

#### Create new column binning score_difference

In [44]:
def bin_score(score_difference):
    if score_difference<(-4): return (-5)
    elif score_difference>(4): return (5)
    elif score_difference==(-4): return (-4)
    elif score_difference==(-3): return (-3) 
    elif score_difference==(-2): return (-2)
    elif score_difference==(-1): return (-1) 
    elif score_difference==(0): return (0) 
    elif score_difference==(1): return (1)
    elif score_difference==(2): return (2)
    elif score_difference==(3): return (3)
    elif score_difference==(4): return (4)
udfbin_score = udf(bin_score, IntegerType())
df = df.withColumn("binned_score_difference", udfbin_score("score_difference"))

#### Create new column binning pitch_num

In [45]:
def bin_pitch_num(pitch_num):
    if pitch_num>(14): 
        return (14)
    else: 
        return(pitch_num)
udfpitch_num = udf(bin_pitch_num, IntegerType())
df = df.withColumn("pitch_num", udfpitch_num("pitch_num"))

In [46]:
df.groupby("pitch_num").count().show()

+---------+------+
|pitch_num| count|
+---------+------+
|       12|   409|
|        1|735198|
|       13|   150|
|        6|145005|
|        3|539590|
|        5|268666|
|        9|  8831|
|        4|406352|
|        8| 23528|
|        7| 60001|
|       10|  3210|
|       11|  1130|
|       14|    80|
|        2|653623|
+---------+------+



#### Convert data to dense vector

##### We need to dummy encode some of these because it offers useful info

In [47]:
df = df.drop('on_1b', 'on_2b', 'on_3b', 'pitch_type', 'type', 'stand', 'top', 'start_speed', 'sz_bot', 'type_confidence')


In [48]:
df.dtypes

[('ab_id', 'int'),
 ('b_count', 'int'),
 ('b_score', 'int'),
 ('outs', 'int'),
 ('pitch_num', 'int'),
 ('s_count', 'int'),
 ('inning', 'int'),
 ('p_score', 'int'),
 ('pitcher_id', 'int'),
 ('score_difference', 'int'),
 ('latent_pitch_type', 'double'),
 ('count_status', 'int'),
 ('base_status', 'int'),
 ('binned_score_difference', 'int')]

In [49]:
df = df.select("ab_id","b_count","b_score","outs", 'pitch_num', 's_count', 'inning', 'p_score', 'pitcher_id', 'score_difference',
              'count_status', 'base_status', 'binned_score_difference', 'latent_pitch_type')

In [50]:
df.dtypes

[('ab_id', 'int'),
 ('b_count', 'int'),
 ('b_score', 'int'),
 ('outs', 'int'),
 ('pitch_num', 'int'),
 ('s_count', 'int'),
 ('inning', 'int'),
 ('p_score', 'int'),
 ('pitcher_id', 'int'),
 ('score_difference', 'int'),
 ('count_status', 'int'),
 ('base_status', 'int'),
 ('binned_score_difference', 'int'),
 ('latent_pitch_type', 'double')]

In [51]:
df.select('binned_score_difference').distinct().rdd.map(lambda r: r[0]).collect()

[-1, 1, 3, -5, 5, 4, -4, -2, 2, -3, 0]

In [52]:
df = df.na.drop(subset=["count_status"])

In [53]:
df.groupby('binned_score_difference').count().show()

+-----------------------+------+
|binned_score_difference| count|
+-----------------------+------+
|                     -1|340572|
|                      1|344147|
|                      3|160611|
|                     -5|194611|
|                      5|203694|
|                      4|112138|
|                     -4|106880|
|                     -2|232629|
|                      2|238861|
|                     -3|154079|
|                      0|757537|
+-----------------------+------+



In [64]:
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-1])]).\
           toDF(['label','features'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  6.0|[2.015000044E9,0....|
|  6.0|[2.015000044E9,1....|
|  6.0|[2.015000044E9,2....|
|  6.0|[2.015000044E9,3....|
|  0.0|[2.015000044E9,3....|
|  0.0|[2.015000044E9,3....|
|  0.0|[2.015000059E9,0....|
|  0.0|[2.015000059E9,0....|
|  2.0|[2.015000059E9,1....|
|  3.0|[2.015000059E9,2....|
|  3.0|[2.015000059E9,2....|
|  3.0|[2.015000059E9,3....|
|  2.0|[2.015000183E9,0....|
|  2.0|[2.015000183E9,1....|
|  2.0|[2.015000183E9,2....|
|  0.0|[2.015000183E9,3....|
|  2.0|[2.015000183E9,3....|
|  2.0|[2.015000183E9,3....|
|  2.0|[2.015000294E9,0....|
|  2.0|[2.015000294E9,1....|
+-----+--------------------+
only showing top 20 rows



#### Split data into training and test data

In [65]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [75]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

#### Fit the model

In [76]:
lrModel = lr.fit(train)

#### Print the coefficients and intercept for multinomial logistic regression

In [77]:
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

Coefficients: 
11 X 13 CSRMatrix

Intercept: [2.2653598943433737,1.455193415867617,1.1685980203011852,1.0228151674030306,0.8354762225721429,0.7996999370747043,0.35332721747466383,-0.4636378393412704,-0.8763287240980145,-2.2476738514124417,-4.312829460184991]


#### Save model statistics

In [78]:
trainingSummary = lrModel.summary

#### Obtain the objective per iteration

In [79]:
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

objectiveHistory:
1.8945803507804027
1.8945803507760104


In [80]:
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))


False positive rate by label:
label 0: 1.0
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0


In [81]:
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

True positive rate by label:
label 0: 1.0
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0


In [82]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

Precision by label:
label 0: 0.3563455070508237
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0
Recall by label:
label 0: 1.0
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0
F-measure by label:
label 0: 0.5254494598882039
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0


In [83]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.3563455070508237
FPR: 0.3563455070508237
TPR: 0.3563455070508237
F-measure: 0.18724155421344346
Precision: 0.12698212039530862
Recall: 0.3563455070508237


In [103]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator()

evaluator.evaluate(predictions)

0.18788721056974578

In [99]:
predictions = lrModel.transform(test)

In [100]:
testSummary = predictions.summary
testSummary

<bound method DataFrame.summary of DataFrame[label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]>

#### Results for prediction

In [104]:
accuracy = testSummary.accuracy
falsePositiveRate = testSummary.weightedFalsePositiveRate
truePositiveRate = testSummary.weightedTruePositiveRate
fMeasure = testSummary.weightedFMeasure()
precision = testSummary.weightedPrecision
recall = testSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

AttributeError: 'function' object has no attribute 'accuracy'

In [105]:
test.groupby('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  8.0| 17414|
|  0.0|406407|
|  7.0| 26761|
|  1.0|179941|
|  4.0| 96871|
|  3.0|117153|
|  2.0|134783|
| 10.0|   593|
|  6.0| 59836|
|  5.0| 93880|
|  9.0|  4589|
+-----+------+



In [106]:
test.count()

1138228