#### Initialize Spark session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext


#### Read in data and merge dataset on ab_id

In [3]:
pitches = spark.read.option("inferSchema", "true").csv('Data/pitches.csv', header = True)
atbats = spark.read.option("inferSchema", "true").csv('Data/atbats.csv', header = True).select("ab_id", "batter_id", "inning", "p_score", "p_throws", "pitcher_id", "stand", "top")

df = pitches.join(atbats, "ab_id")

#### Drop unnecessary variables

In [4]:
df = df.drop("ax", "ay", "az", "batter_id", "break_angle", "break_length", "break_y", "code", "event", "g_id", "o", "p_throws",
             
            "pfx_x", "pfx_z", "px", "pz", "spin_dir", "end_speed", "start_speed"
             
             "sz_bot", "sz_top", "vx0", "vy0", "vz0", "x", "x0", "y", "y0", "z", "z0", "zone", "spin_rate")

#### Create new variable score_difference

In [5]:
df=df.withColumn("score_difference", df.p_score-df.b_score)

#### Remove low frequency observations (look at pitch_type to decide which ones to remove)

In [6]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql import functions as f
from pyspark.sql.functions import col

In [7]:
df = df.filter(
    (col('pitch_type') != 'UN') &
    (col('pitch_type') != 'EP') &
    (col('pitch_type') != 'AB') &
    (col('pitch_type') != 'FA') &
    (col('pitch_type') != 'SC'))

#### FO and PO are the same so consolidate

In [8]:
df=df.na.replace(['FO'], ['PO'], 'pitch_type')

#### Create new column that is a latent variable based on pitch_type

In [9]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="pitch_type", outputCol="latent_pitch_type")
df = indexer.fit(df).transform(df)
df.show()

+----------+-------+-------+-----+-----+-----+-----+----+---------+----------+-------+-----------+------+----+---------------+------+-------+----------+-----+-----+----------------+-----------------+
|     ab_id|b_count|b_score|nasty|on_1b|on_2b|on_3b|outs|pitch_num|pitch_type|s_count|start_speed|sz_bot|type|type_confidence|inning|p_score|pitcher_id|stand|  top|score_difference|latent_pitch_type|
+----------+-------+-------+-----+-----+-----+-----+----+---------+----------+-------+-----------+------+----+---------------+------+-------+----------+-----+-----+----------------+-----------------+
|2015000044|      0|      0|   63|false| true|false|   1|        1|        FC|      0|       84.6|  1.52|   B|            2.0|     5|      3|    425794|    L|false|               3|              6.0|
|2015000044|      1|      0|   29|false| true|false|   1|        2|        FC|      0|       88.4|  1.52|   B|            2.0|     5|      3|    425794|    L|false|               3|              6.0|


In [10]:
df.groupBy("latent_pitch_type").count().show()
df.groupBy("pitch_type").count().show()

+-----------------+-------+
|latent_pitch_type|  count|
+-----------------+-------+
|              8.0|  43705|
|              0.0|1014880|
|              7.0|  66484|
|              1.0| 450581|
|              4.0| 242506|
|             11.0|   1438|
|              3.0| 292789|
|              2.0| 337983|
|             10.0|   6197|
|              6.0| 149756|
|              5.0| 234391|
|              9.0|  11260|
+-----------------+-------+

+----------+-------+
|pitch_type|  count|
+----------+-------+
|        FT| 337983|
|        SL| 450581|
|        FC| 149756|
|        FF|1014880|
|        FS|  43705|
|        PO|   1438|
|        KC|  66484|
|        IN|   6197|
|        CH| 292789|
|        CU| 234391|
|        KN|  11260|
|        SI| 242506|
+----------+-------+



In [None]:
## udf_latent_base = udf(lambda z: if)