#### Import necessary libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#### Initialize Spark Session 

In [2]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data

In [3]:
df = spark.read.option("inferSchema", "true").csv('pitches_preprocessed.csv', header = True)

#### Preprocessing

In [4]:
df = df.select('outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top', 'x0', 'y0',
 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand', 'latent_pitch_type',
 'count_status','base_status', 'binned_score_difference','latent_next_pitch')

In [5]:
df = df.withColumn('binned_score_difference', df.binned_score_difference +5)

In [6]:
encoder = OneHotEncoderEstimator(inputCols =['outs', 'inning','p_throws', 'stand',"latent_pitch_type",
                                             "pitch_num", "base_status","binned_score_difference",
                                            "count_status"],
                                 outputCols =['outsH','inningH','p_throwsH', 'standH', "latent_pitch_typeH",
                                              "pitch_numH", "base_statusH","binned_score_differenceH"
                                              ,"count_statusH"])
encoder = encoder.fit(df)

In [7]:
def transData(data):
    return data.rdd.map(lambda r: [r[-10], Vectors.dense(r[:10]), r[-1], r[-2], r[-3], r[-4], r[-5], r[-6], r[-7],
                                  r[-8], r[-9]]).\
           toDF(['label','features', 'standH', 'pitch_numH', 'inningH', 'latent_pitch_typeH',
                 'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH', 'p_throwsH'])


In [8]:
norm = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)

In [9]:
assembler = VectorAssembler(inputCols = ['features_norm', 'latent_pitch_typeH', 'standH', 'pitch_numH',
                                         'inningH', 'latent_pitch_typeH',
                                         'binned_score_differenceH', 'outsH', 'base_statusH', 'count_statusH',
                                         'p_throwsH'],
                            outputCol = 'features_fin')



#### Pipeline

In [10]:
data = assembler.transform(norm.transform(transData(encoder.transform(df))))

In [11]:
data = data.select('label', 'features_fin')
data.take(1)

[Row(label=0.0, features_fin=SparseVector(94, {0: 0.0081, 1: 0.0492, 2: 0.0795, 3: 0.0162, 4: -0.0043, 5: 0.0218, 6: 0.7575, 7: 0.015, 8: 0.031, 9: 0.0175, 10: 1.0, 19: 1.0, 22: 1.0, 36: 1.0, 54: 1.0, 67: 1.0, 74: 1.0, 75: 1.0, 84: 1.0, 93: 1.0}))]

#### Split Data

In [12]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

#### Random Forest with cross validation

In [13]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features_fin")
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

In [14]:
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [3, 6, 10]).build()

In [15]:
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3)

In [16]:
model = crossval.fit(trainingData)

#### Evaluate model via the test set

In [30]:
model = model.bestModel

In [31]:
model.getNumTrees

3

In [32]:
predictions = model.transform(testData)

In [33]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

Accuracy = 0.4118


In [34]:
evaluatortwo = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
f1 = evaluatortwo.evaluate(predictions)
print("F1 = %g" % f1)

F1 = 0.294976


In [35]:
evaluatorthree = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluatorthree.evaluate(predictions)
print("Weighted Precision = %g" % precision)

Weighted Precision = 0.245403


In [36]:
evaluatorfour = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluatorfour.evaluate(predictions)
print("Weighted Recall = %g" % recall)

Weighted Recall = 0.4118
