In [1]:
import polars as pl

from src.processing import Stringifier, TimeTokenizer, make_vocabulary

In [2]:
data_path = "./data/raw_data.parquet"


# Config info ----------------------------
sequence_keys = ["game_pk", "player_name", "at_bat_number"]
order_columns = ["pitch_number"]
time_column = "inning"

feature_column_metadata = {
    "pitch_name": {
        "data_type": "categorical",
        "group": "pitch",
        "explicit_missing": True,
    },
    "release_speed": {
        "data_type": "numeric",
        "group": "velo",
        "explicit_missing": True,
    },
    "plate_x": {
        "data_type": "numeric",
        "group": "plate_x",
        "explicit_missing": True,
    },
    "plate_z": {
        "data_type": "numeric",
        "group": "plate_z",
        "explicit_missing": True,
    },
    "description": {
        "data_type": "categorical",
        "group": "description",
        "explicit_missing": True,
    },
    "events": {
        "data_type": "categorical",
        "group": "events",
        "explicit_missing": False,
    },
}

keyword_args = {
    "n_buckets": 32,
}
# -----------------------------------------

df = (
    pl.read_parquet(data_path)
    .filter(pl.col("game_type").is_in(["R", "F", "D", "L", "W"]))
    .select(
        # Keys and order
        *sequence_keys,
        *order_columns,
        # I need to transform the time column, so...
        (
            pl.col("inning")
            + pl.when(pl.col("inning_topbot") == "Top").then(0).otherwise(0.5)
        ).alias("inning"),
        # features
        *[column for column in feature_column_metadata],
    )
)

# A little more setup ----------------------------------

# This is nasty I'd rather not do it.
df = df.with_columns(
    (pl.col(time_column) - pl.col(time_column).shift(1))
    .over(partition_by=sequence_keys, order_by=order_columns)
    .alias("time_diffs")
)

time_tokenizer = TimeTokenizer.from_data(df["time_diffs"])

stringifiers = {
    column: Stringifier.from_data(df[column], **col_args, kwargs=keyword_args)
    for column, col_args in feature_column_metadata.items()
}

complete_vocab = make_vocabulary(
    stringifiers.values(), time_tokenizer, special_tokens=["<SOS>", "<EOS>"]
)
print(f"Vocab size: {len(complete_vocab)}")

df = (
    df.select(
        *sequence_keys,
        *order_columns,
        time_tokenizer.transform(pl.col("time_diffs")).alias("time_diffs"),
        *[s.transform(pl.col(n)).alias(n) for n, s in stringifiers.items()],
    )
    .with_columns(
        pl.concat_list("time_diffs", *[pl.col(n) for n in stringifiers])
        .list.drop_nulls()
        .alias("feature_list")
    )
    .select(*sequence_keys, *order_columns, "feature_list")
    .explode("feature_list")
    .with_columns(
        pl.col("feature_list")
        .replace(complete_vocab)
        .cast(pl.Int64)
        .alias("processed_list"),
    )
)

df = (
    df.sort(*order_columns)
    .group_by(*sequence_keys)  # Within-group order is always kept
    .agg("processed_list", "feature_list")
)

# Add SOS and EOS I guess?
df = df.with_columns(
    pl.concat_list(
        pl.lit(complete_vocab["<SOS>"]),
        pl.col("processed_list"),
        pl.lit(complete_vocab["<EOS>"]),
    ).alias("processed_list"),
    pl.col("processed_list").list.len().alias("sequence_length"),
)

print(f"Max length: {df["sequence_length"].max()}")
print(df)

Vocab size: 163
Max length: 81
shape: (187_634, 6)
┌─────────┬──────────────────┬───────────────┬─────────────────┬─────────────────┬─────────────────┐
│ game_pk ┆ player_name      ┆ at_bat_number ┆ processed_list  ┆ feature_list    ┆ sequence_length │
│ ---     ┆ ---              ┆ ---           ┆ ---             ┆ ---             ┆ ---             │
│ i64     ┆ str              ┆ i64           ┆ list[i64]       ┆ list[str]       ┆ u32             │
╞═════════╪══════════════════╪═══════════════╪═════════════════╪═════════════════╪═════════════════╡
│ 717517  ┆ Willingham, Amos ┆ 70            ┆ [130, 117, …    ┆ ["pitch =       ┆ 21              │
│         ┆                  ┆               ┆ 17]             ┆ Slider", "velo  ┆                 │
│         ┆                  ┆               ┆                 ┆ = (84…          ┆                 │
│ 717778  ┆ Brown, Hunter    ┆ 51            ┆ [130, 125, …    ┆ ["pitch =       ┆ 6               │
│         ┆                  ┆          

In [3]:
df[0]

game_pk,player_name,at_bat_number,processed_list,feature_list,sequence_length
i64,str,i64,list[i64],list[str],u32
717517,"""Willingham, Amos""",70,"[130, 117, … 17]","[""pitch = Slider"", ""velo = (84.4. 85.1]"", … ""events = field_out""]",21


In [4]:
df[0]["feature_list"].item().to_list()

['pitch = Slider',
 'velo = (84.4. 85.1]',
 'plate_x = (0.39. 0.46]',
 'plate_z = (-inf, 0.43]',
 'description = ball',
 'pitch = 4-Seam Fastball',
 'velo = (95.2. 95.7]',
 'plate_x = (0.07. 0.13]',
 'plate_z = (3.04. 3.15]',
 'description = called_strike',
 'pitch = 4-Seam Fastball',
 'velo = (95.7. 96.2]',
 'plate_x = (-0.93. -0.82]',
 'plate_z = (2.77. 2.86]',
 'description = foul',
 'pitch = 4-Seam Fastball',
 'velo = (95.7. 96.2]',
 'plate_x = (-0.39. -0.32]',
 'plate_z = (2.86. 2.95]',
 'description = hit_into_play',
 'events = field_out']

In [5]:
df[0]["processed_list"].item().to_list()

[130,
 117,
 57,
 33,
 50,
 118,
 2,
 158,
 116,
 77,
 138,
 2,
 156,
 142,
 53,
 151,
 2,
 156,
 139,
 102,
 39,
 134,
 17]

In [6]:
print(complete_vocab["<SOS>"])
print(complete_vocab["<EOS>"])

130
17
