In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "xor" on a `local[*]` master.
# NOTE(review): the executor-level settings and `spark.cores.max` below may have
# no effect under a local master — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("xor")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext


In [2]:
# Load the pitch data from CSV: the first line is a header and column types are inferred.
df = spark.read.csv('pitches_preprocessed.csv', header=True, inferSchema=True)

In [3]:
# Restrict the frame to the columns used by the rest of the notebook.
df = df.select([
    # numeric pitch / game-state measurements
    'outs', 'pfx_x', 'pfx_z', 'pitch_num', 'px', 'pz', 'start_speed',
    'sz_bot', 'sz_top', 'x0', 'y0', 'z0',
    # player and situation descriptors
    'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand',
    # columns that are one-hot encoded later (together with pitch_num)
    'latent_pitch_type', 'count_status', 'base_status', 'binned_score_difference',
    # column later used as the label
    'latent_next_pitch',
])

In [4]:
# Shift every binned score difference up by 5.
# NOTE(review): presumably this makes the bins non-negative so they can be used as
# category indices by the one-hot encoder — confirm the raw value range.
df = df.withColumn('binned_score_difference', df['binned_score_difference'] + 5)

In [5]:
# Spark 3.0 removed OneHotEncoderEstimator (it was renamed to OneHotEncoder).
# Try the Spark 2.3/2.4 name first and fall back to the new one, so this cell runs
# on both. The order matters: on Spark 2.x `OneHotEncoder` is a different, legacy class.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# One-hot encode the categorical index columns; each encoded column keeps the
# source name with an "H" suffix.
encoder = OneHotEncoderEstimator(
    inputCols=["latent_pitch_type", "pitch_num", "base_status",
               "binned_score_difference", "count_status"],
    outputCols=["latent_pitch_typeH", "pitch_numH", "base_statusH",
                "binned_score_differenceH", "count_statusH"])

# Fit the encoder on the full frame, then append the encoded vector columns to it.
model = encoder.fit(df)
df = model.transform(df)

In [6]:
# Keep the plain columns, swap each categorical column for its one-hot version,
# and leave `latent_next_pitch` last. The column ORDER matters: `transData`
# addresses columns by position.
df = df.select([
    'outs', 'pfx_x', 'pfx_z', 'px', 'pz', 'start_speed', 'sz_bot', 'sz_top',
    'x0', 'y0', 'z0', 'batter_id', 'inning', 'p_throws', 'pitcher_id', 'stand',
    'latent_pitch_typeH', 'pitch_numH', 'base_statusH',
    'binned_score_differenceH', 'count_statusH',
    'latent_next_pitch',
])

In [7]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors


def transData(data):
    """Reshape ``data`` into label / dense feature vector / pass-through columns.

    Columns are addressed purely by position. The last column becomes
    ``label``, the five columns before it are passed through unchanged, and
    all remaining leading columns are packed into one dense vector called
    ``features``.

    The pass-through column names assume the trailing columns of ``data`` are,
    in order: latent_pitch_typeH, pitch_numH, base_statusH,
    binned_score_differenceH, count_statusH, and then the label column.

    Returns a new DataFrame built from the mapped RDD.
    """
    output_columns = [
        'label', 'features',
        'count_statusH', 'binned_score_differenceH', 'base_statusH',
        'pitch_numH', 'latent_pitch_typeH',
    ]

    def to_record(r):
        # Everything except the final six columns goes into the dense vector.
        dense_part = Vectors.dense(r[:-6])
        return [r[-1], dense_part, r[-2], r[-3], r[-4], r[-5], r[-6]]

    return data.rdd.map(to_record).toDF(output_columns)


data = transData(df)
data.show()

+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|label|            features|  count_statusH|binned_score_differenceH| base_statusH|    pitch_numH|latent_pitch_typeH|
+-----+--------------------+---------------+------------------------+-------------+--------------+------------------+
|  0.0|[1.0,6.08,9.83,-0...| (11,[0],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  0.0|[1.0,4.54,12.83,-...| (11,[2],[1.0])|          (10,[4],[1.0])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,-3.71,9.05,-...| (11,[0],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[1],[1.0])|     (9,[0],[1.0])|
|  7.0|[0.0,4.87,-6.37,0...| (11,[2],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[2],[1.0])|     (9,[7],[1.0])|
|  0.0|[0.0,1.64,-4.12,0...| (11,[3],[1.0])|              (10,[],[])|(7,[0],[1.0])|(14,[3],[1.0])|     (9,[7],[1.0])|
|  6.0|[0.0,-2.47,9.54,-...| (11,[8],[1.0])|            

In [8]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Rescale each row's dense `features` vector with the L1 norm (p=1.0) and store
# the result in `features_norm`.
# NOTE(review): Normalizer scales each ROW vector, not each column, and the vector
# still contains raw id columns (batter_id, pitcher_id) — confirm this is the
# intended scaling rather than a per-feature scaler.
norm = Normalizer(p=1.0, inputCol='features', outputCol='features_norm')
data = norm.transform(data)

In [9]:
from pyspark.ml.feature import VectorAssembler

# Concatenate the normalised dense features with the one-hot columns into a
# single model input vector, `features_fin`.
assembler = VectorAssembler(
    inputCols=[
        'features_norm',
        'latent_pitch_typeH',
        'pitch_numH',
        'base_statusH',
        'binned_score_differenceH',
        'count_statusH',
    ],
    outputCol='features_fin',
)

data = assembler.transform(data)

In [10]:
# Only the label and the assembled feature vector are needed for training.
data = data.select(['label', 'features_fin'])

In [11]:
# 70/30 train/test split. A fixed seed is passed so the split, and therefore the
# reported error, can be reproduced when the notebook is re-run.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with 10 trees reading the assembled feature vector. The seed is
# set explicitly so the trained forest does not vary from run to run.
rf = RandomForestClassifier(labelCol="label", featuresCol="features_fin", numTrees=10,
                            seed=42)


In [14]:
# Train the random forest on the training split.
# NOTE: this rebinds `model`, which earlier held the fitted one-hot encoder.
model = rf.fit(trainingData)

In [15]:
# Apply the fitted forest to the held-out split; the evaluator later reads the
# `prediction` column of this frame.
predictions = model.transform(testData)

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the predictions with multiclass accuracy and report the complementary
# error rate (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.616408
