In [30]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


In [31]:
pitches = spark.read.option("inferSchema", "true").csv('Data/pitches.csv', header = True)
atbats = spark.read.option("inferSchema", "true").csv('Data/atbats.csv', header = True).select("ab_id", "batter_id", "inning", "p_score", "p_throws", "pitcher_id", "stand", "top")

df = pitches.join(atbats, "ab_id")

In [32]:
df = df.drop("ax", "ay", "az", "batter_id", "break_angle", "break_length", "break_y", "code", "event", "g_id", "o", "p_throws",
             
            "pfx_x", "pfx_z", "px", "pz", "spin_dir", "end_speed", "start_speed"
             
             "sz_bot", "sz_top", "vx0", "vy0", "vz0", "x", "x0", "y", "y0", "z", "z0", "zone", "spin_rate")

In [33]:
df=df.withColumn("score_difference", df.p_score-df.b_score)

In [34]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql import functions as f

In [35]:
df=df.filter(df["pitch_type"]!="UN")
df=df.filter(df["pitch_type"]!="EP")

In [36]:
df=df.na.replace(['FO'], ['PO'], 'pitch_type')

In [37]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="pitch_type", outputCol="latent_pitch_type")
df = indexer.fit(df).transform(df)
df.show()

+----------+-------+-------+-----+-----+-----+-----+----+---------+----------+-------+-----------+------+----+---------------+------+-------+----------+-----+-----+----------------+-----------------+
|     ab_id|b_count|b_score|nasty|on_1b|on_2b|on_3b|outs|pitch_num|pitch_type|s_count|start_speed|sz_bot|type|type_confidence|inning|p_score|pitcher_id|stand|  top|score_difference|latent_pitch_type|
+----------+-------+-------+-----+-----+-----+-----+----+---------+----------+-------+-----------+------+----+---------------+------+-------+----------+-----+-----+----------------+-----------------+
|2015000044|      0|      0|   63|false| true|false|   1|        1|        FC|      0|       84.6|  1.52|   B|            2.0|     5|      3|    425794|    L|false|               3|              6.0|
|2015000044|      1|      0|   29|false| true|false|   1|        2|        FC|      0|       88.4|  1.52|   B|            2.0|     5|      3|    425794|    L|false|               3|              6.0|
