# Association Rule Mining for 2019 Season

## Preprocess

In [31]:
from pathlib import Path

import pandas as pd

# Load the batter and pitcher tables that carry the cluster/archetype columns.
batters, pitchers = (
    pd.read_csv(csv_name)
    for csv_name in ("Batters_With_Clusters.csv", "Pitchers_With_Clusters.csv")
)

In [32]:
# Show the column names of both archetype tables.
print(list(batters.columns))
print(list(pitchers.columns))


['year', 'player_id', 'last_name, first_name', 'HitterType2_Group', 'HitterType2', 'ClusteringStats_Group', 'ClusteringStats']
['player_id', 'year', 'last_name, first_name', 'kmeans_quality_cluster', 'quality_archetype', 'impact_cluster', 'impact_archetype']


### Read play data

In [33]:
import pandas as pd

# Read the season file in 200k-row chunks, then stitch the chunks back into
# one frame with a fresh 0..n-1 index.
chunksize = 200_000
reader = pd.read_csv(
    "./pitch_level_playbyplay_data/statcast_pitch_by_pitch_2019.csv",
    chunksize=chunksize,
)

dfs = list(reader)
df = pd.concat(dfs, ignore_index=True)


In [34]:
# Column names of the pitch-by-pitch frame.
print(list(df.columns))


['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'launch_speed_a

In [35]:
# Distinct labels in the three categorical columns inspected here.
print(pd.unique(df['events']))
print(pd.unique(df['description'])[:50])  # at most the first 50 labels
print(pd.unique(df['bb_type']))


['strikeout' nan 'field_out' 'single' 'walk' 'force_out' 'home_run'
 'grounded_into_double_play' 'sac_bunt' 'double' 'hit_by_pitch' 'sac_fly'
 'intent_walk' 'field_error' 'fielders_choice_out' 'triple' 'double_play'
 'fielders_choice' 'catcher_interf' 'sac_fly_double_play'
 'strikeout_double_play' 'truncated_pa' 'sac_bunt_double_play'
 'triple_play']
['swinging_strike' 'foul' 'ball' 'called_strike' 'hit_into_play'
 'swinging_strike_blocked' 'blocked_ball' 'hit_by_pitch' 'foul_bunt'
 'foul_tip' 'automatic_ball' 'missed_bunt' 'pitchout' 'bunt_foul_tip']
[nan 'popup' 'fly_ball' 'ground_ball' 'line_drive']


In [36]:
import pandas as pd

def make_matchup(stand, p_throws):
    """
    Build the handedness matchup label "<batter>vs<pitcher>".

    stand: batter handedness ('L' or 'R')
    p_throws: pitcher handedness ('L' or 'R')

    Returns a string such as 'LvsR'.
    """
    return "{}vs{}".format(stand, p_throws)

def bucket_count(balls, strikes):
    """
    Collapse a ball/strike count into one of five labels.

    Returns
      - "full"          for 3-2
      - "two_strikes"   for any other two-strike count
      - "hitter_ahead"  when balls exceed strikes by two or more
      - "pitcher_ahead" when strikes exceed balls
      - "even"          otherwise (0-0, 1-0, 1-1, 2-1, ...)
    """
    # All two-strike counts are decided here; 3-2 gets its own label.
    if strikes == 2:
        return "full" if balls == 3 else "two_strikes"

    if balls - strikes >= 2:
        return "hitter_ahead"

    return "pitcher_ahead" if strikes > balls else "even"

def clean_outcome(events):
    """
    Turn a Statcast 'events' value into an outcome label.

    Missing values (as judged by pd.isna) become pd.NA so the row can be
    dropped later; anything else is returned as its string form.
    """
    return pd.NA if pd.isna(events) else str(events)


In [37]:
def build_plate_appearances(df_raw):
    """
    Reduce pitch-by-pitch rows to one row per plate appearance.

    Rows are ordered by (game_pk, at_bat_number, pitch_number) and the last
    row of every (game_pk, at_bat_number) group is kept. The result is a copy,
    so df_raw is left untouched. A 'year' column is filled from 'game_year'
    when it is not already present, and is cast to int.
    """
    pa_keys = ['game_pk', 'at_bat_number']

    ordered = df_raw.sort_values(pa_keys + ['pitch_number'])
    final_pitches = ordered.groupby(pa_keys, as_index=False).tail(1)
    df_pa = final_pitches.copy()

    # Fall back to Statcast's 'game_year' only when 'year' is absent.
    year_missing = 'year' not in df_pa.columns
    if year_missing and 'game_year' in df_pa.columns:
        df_pa['year'] = df_pa['game_year']
    df_pa['year'] = df_pa['year'].astype(int)

    return df_pa


In [38]:
def enrich_context(df_pa):
    """
    Add matchup (LvsR, etc.), count_bucket, and outcome from 'events',
    then drop rows that lack an archetype or an outcome.

    Parameters
    ----------
    df_pa : DataFrame
        One row per plate appearance. Must contain 'stand', 'p_throws',
        'balls', 'strikes', 'events', 'batter_arch' and 'pitcher_arch'
        (so archetypes have to be merged in before this is called).

    Returns
    -------
    DataFrame
        A new frame with the three extra columns; rows where 'batter_arch',
        'pitcher_arch' or 'outcome' is missing are removed. The input frame
        is not modified.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    required = ['stand', 'p_throws', 'balls', 'strikes', 'events',
                'batter_arch', 'pitcher_arch']
    missing = [col for col in required if col not in df_pa.columns]
    if missing:
        raise KeyError(f"enrich_context: missing required columns {missing}")

    # Iterate over the two source columns directly instead of a row-wise
    # DataFrame.apply: it avoids building a Series per row and also behaves
    # on an empty frame.
    matchup = [
        make_matchup(stand, p_throws)
        for stand, p_throws in zip(df_pa['stand'], df_pa['p_throws'])
    ]
    count_bucket = [
        bucket_count(balls, strikes)
        for balls, strikes in zip(df_pa['balls'], df_pa['strikes'])
    ]
    outcome = df_pa['events'].apply(clean_outcome)

    # assign() returns a new frame, so the caller's df_pa keeps its columns.
    enriched = df_pa.assign(
        matchup=matchup,
        count_bucket=count_bucket,
        outcome=outcome,
    )

    # Drop rows with missing critical info
    return enriched.dropna(subset=['batter_arch', 'pitcher_arch', 'outcome'])


In [39]:
def add_archetypes(
    df_pa,
    hitters_df,
    pitchers_df,
    batter_cluster_col="ClusteringStats",    # alternative: "HitterType2"
    pitcher_cluster_col="quality_archetype"  # alternative: "impact_archetype"
):
    """
    Merge batter & pitcher archetypes into the PA-level DataFrame.

    Parameters
    ----------
    df_pa : DataFrame
        Plate-appearance rows with 'batter', 'pitcher' and 'year' columns.
    hitters_df, pitchers_df : DataFrame
        Lookup tables with 'player_id', 'year' and the chosen cluster column.
    batter_cluster_col, pitcher_cluster_col : str
        Name of the cluster column to take from each lookup table.

    Returns
    -------
    DataFrame
        df_pa (left-joined, so every PA is kept exactly once) with two added
        columns, 'batter_arch' and 'pitcher_arch'. Unmatched players get a
        missing value.

    Notes
    -----
    * The cluster column is renamed *before* merging, so the two tables may
      use the same column name without the merge suffixing it away.
    * If a lookup table has several rows for one (player, year), only the
      first is used and a warning is emitted; otherwise the left merge would
      duplicate plate appearances.
    * Existing 'batter_arch' / 'pitcher_arch' columns in df_pa are replaced,
      which makes the function safe to re-run on its own output.
    """
    import warnings

    def _lookup(players_df, cluster_col, id_name, arch_name):
        # Build a (id_name, year) -> arch_name table with unique keys.
        table = players_df[['player_id', 'year', cluster_col]].rename(
            columns={'player_id': id_name, cluster_col: arch_name}
        )
        repeated = table.duplicated(subset=[id_name, 'year'])
        if repeated.any():
            warnings.warn(
                f"add_archetypes: {int(repeated.sum())} duplicate "
                f"({id_name}, year) rows in the {arch_name} lookup; "
                "keeping the first of each.",
                stacklevel=3,
            )
            table = table.loc[~repeated]
        return table

    # Remove stale archetype columns so the merges cannot collide with them.
    stale = [c for c in ('batter_arch', 'pitcher_arch') if c in df_pa.columns]
    if stale:
        df_pa = df_pa.drop(columns=stale)

    # --- Merge batters ---
    df_pa = df_pa.merge(
        _lookup(hitters_df, batter_cluster_col, 'batter', 'batter_arch'),
        on=['batter', 'year'],
        how='left'
    )

    # --- Merge pitchers ---
    df_pa = df_pa.merge(
        _lookup(pitchers_df, pitcher_cluster_col, 'pitcher', 'pitcher_arch'),
        on=['pitcher', 'year'],
        how='left'
    )

    return df_pa


In [40]:
def build_transactions_df(df_pa):
    """
    Return a copy of df_pa restricted to the five columns that act as
    "items" in each transaction.
    """
    item_fields = ['batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome']
    return df_pa.loc[:, item_fields].copy()


In [41]:
def encode_transactions(df_tx):
    """
    One-hot encode the five transaction fields into a boolean DataFrame.

    Each field contributes one column per distinct value, named
    "<field>_<value>", e.g.:
      - batter_arch_<value>
      - pitcher_arch_<value>
      - matchup_LvsR
      - count_bucket_two_strikes
      - outcome_strikeout
    """
    fields = ('batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome')

    indicator_blocks = [
        pd.get_dummies(df_tx[field], prefix=field) for field in fields
    ]

    # Cast once at the end so every indicator column is bool.
    return pd.concat(indicator_blocks, axis=1).astype(bool)


In [42]:
def preprocess_statcast(
    df_raw,
    hitters_df,
    pitchers_df,
    batter_cluster_col="HitterType2",
    pitcher_cluster_col="impact_archetype"
):
    """
    Run the whole preprocessing chain on raw pitch-by-pitch data.

    Steps: reduce pitches to plate appearances, merge batter/pitcher
    archetypes, add matchup / count bucket / outcome, select the transaction
    columns, and one-hot encode them.

    The two *_cluster_col arguments are always forwarded explicitly to
    add_archetypes, so the defaults declared here (not add_archetypes' own
    defaults) apply when a caller omits them.

    Returns a tuple (df_pa, df_tx, df_encoded).
    """
    plate_appearances = build_plate_appearances(df_raw)

    with_archetypes = add_archetypes(
        plate_appearances,
        hitters_df=hitters_df,
        pitchers_df=pitchers_df,
        batter_cluster_col=batter_cluster_col,
        pitcher_cluster_col=pitcher_cluster_col
    )

    df_pa = enrich_context(with_archetypes)
    df_tx = build_transactions_df(df_pa)
    df_encoded = encode_transactions(df_tx)

    return df_pa, df_tx, df_encoded


In [43]:
# Run the 2019 pipeline with the "ClusteringStats" batter archetypes and the
# "quality_archetype" pitcher archetypes.
# Alternatives: "HitterType2" for batters, "impact_archetype" for pitchers.
df_pa_2019, df_tx_2019, df_encoded_2019 = preprocess_statcast(
    df,        # 2019 pitch-by-pitch frame
    batters,
    pitchers,
    batter_cluster_col="ClusteringStats",
    pitcher_cluster_col="quality_archetype",
)


In [44]:
# Persist the three pipeline outputs under ./2019/.
# to_csv does not create missing directories, so make sure the folder exists
# before writing (no-op when it is already there).
Path("./2019").mkdir(parents=True, exist_ok=True)

df_pa_2019.to_csv("./2019/pa_2019.csv", index=False)
df_tx_2019.to_csv("./2019/tx_2019.csv", index=False)
df_encoded_2019.to_csv("./2019/encoded_2019.csv", index=False)

In [45]:
# Distinct outcome labels in the transactions, missing values excluded.
unique_outcomes = pd.unique(df_tx_2019['outcome'].dropna())

In [46]:
# Share of plate appearances ending in each outcome, printed as a percentage.
# The value is computed outside the f-string: reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
total_pa = len(df_tx_2019)
for attr in unique_outcomes:
    outcome_share = (df_tx_2019['outcome'] == attr).sum() / total_pa * 100
    print(f"{attr}: {outcome_share}%")

strikeout: 20.435644781947026%
single: 14.526616758642952%
walk: 8.136442592962661%
field_out: 39.89977403040598%
home_run: 4.138164266059982%
double: 4.856040459317787%
hit_by_pitch: 1.0360782746376032%
grounded_into_double_play: 1.8907659908074956%
field_error: 0.7809017262847217%
double_play: 0.22750680214594252%
intent_walk: 0.45040197992406195%
force_out: 1.8062195440640707%
strikeout_double_play: 0.09684411172428635%
fielders_choice: 0.2090603046746499%
sac_fly: 0.6210320815335189%
triple: 0.4565508124144928%
catcher_interf: 0.038430203065193%
truncated_pa: 0.13066269042165618%
fielders_choice_out: 0.13834873103469478%
sac_bunt: 0.115290609195579%
sac_fly_double_play: 0.00922324873564632%


In [47]:
# NOTE: despite its name, home_run_percentage holds a fraction in [0, 1];
# it is only scaled to a percentage for printing.
is_home_run = df_tx_2019['outcome'].eq('home_run')
home_run_percentage = is_home_run.sum() / len(df_tx_2019)
print(f"Home run percentage: {home_run_percentage * 100}%")

Home run percentage: 4.138164266059982%


In [49]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Keep itemsets seen in at least 40 plate appearances. fpgrowth takes the
# threshold as a fraction of all transactions, hence the division.
minsup_count = 40
minsup = minsup_count / len(df_encoded_2019)

frequent_itemsets = fpgrowth(
    df_encoded_2019,
    min_support=minsup,
    use_colnames=True,
)


In [50]:
# The confidence floor is the overall home-run rate computed earlier. The same
# floor is applied to every rule, whatever its consequent.
minconf = home_run_percentage
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=minconf,
)


In [51]:
# Encoded outcome items treated as "good" results for the batter. Rules are
# later kept only if their consequent contains at least one of these.
# Commented-out entries are currently excluded from the filter.
# NOTE(review): 'outcome_intent_walk' is included while 'outcome_walk' is
# not — confirm this asymmetry is intended.
good_outcomes = [
    'outcome_single',
    'outcome_double',
    'outcome_triple',
    'outcome_home_run',
    # 'outcome_walk',
    'outcome_intent_walk',
    # 'outcome_hit_by_pitch',
    # 'outcome_field_error'
]


In [52]:
# Keep rules whose consequent contains at least one "good" outcome item.
# .copy() detaches the result from the unfiltered frame so that later cells
# can add columns to `rules` without pandas' SettingWithCopyWarning.
rules = rules[
    rules['consequents'].apply(
        lambda s: any(item in good_outcomes for item in s)
    )
].copy()


In [53]:
# Display the filtered rule table (rendered as the cell's output).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
466,(pitcher_arch_Sinkers Get Hit (Bad SI)),(outcome_single),0.061242,0.145266,0.009254,0.151104,1.040190,1.0,0.000358,1.006877,0.041158,0.046914,0.006830,0.107404
945,"(batter_arch_Aggressive Fastball, pitcher_arch...",(outcome_single),0.046193,0.145266,0.007010,0.151747,1.044614,1.0,0.000299,1.007640,0.044777,0.038003,0.007582,0.100001
948,(pitcher_arch_Sinkers Get Hit (Bad SI)),"(batter_arch_Aggressive Fastball, outcome_single)",0.061242,0.112478,0.007010,0.114458,1.017606,1.0,0.000121,1.002236,0.018430,0.042047,0.002231,0.088389
950,"(count_bucket_even, pitcher_arch_Sinkers Get H...",(outcome_single),0.020645,0.145266,0.004366,0.211467,1.455720,1.0,0.001367,1.083954,0.319654,0.027024,0.077452,0.120760
952,(pitcher_arch_Sinkers Get Hit (Bad SI)),"(count_bucket_even, outcome_single)",0.061242,0.063748,0.004366,0.071285,1.118233,1.0,0.000462,1.008116,0.112630,0.036192,0.008050,0.069884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19460,"(matchup_RvsL, pitcher_arch_Poor Changeup Grou...","(batter_arch_Aggressive Fastball, outcome_single)",0.008532,0.112478,0.000968,0.113514,1.009211,1.0,0.000009,1.001169,0.009205,0.008068,0.001167,0.061062
19545,"(batter_arch_Aggressive Fastball, pitcher_arch...",(outcome_home_run),0.018692,0.041382,0.000984,0.052632,1.271858,1.0,0.000210,1.011875,0.217820,0.016649,0.011736,0.038203
19547,"(count_bucket_even, pitcher_arch_Poor Changeup...",(outcome_home_run),0.008332,0.041382,0.000722,0.086716,2.095515,1.0,0.000378,1.049639,0.527183,0.014747,0.047291,0.052088
19575,"(batter_arch_Aggressive Fastball, pitcher_arch...",(outcome_double),0.018692,0.048560,0.001015,0.054276,1.117707,1.0,0.000107,1.006044,0.107317,0.015317,0.006008,0.037585


In [54]:
# Number of items on the left-hand side of each rule.
# assign() builds a new frame instead of writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
rules = rules.assign(ante_len=rules['antecedents'].map(len))


In [55]:
# Largest antecedent among the remaining rules.
max_len = rules.loc[:, 'ante_len'].max()
print("Max antecedent size:", max_len)


Max antecedent size: 4


In [56]:
# Number of items on the right-hand side of each rule.
# assign() builds a new frame instead of writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
rules = rules.assign(cons_len=rules['consequents'].map(len))

In [57]:
# Strongest associations first: lift descending, ties broken by confidence
# descending.
rules_sorted = rules.sort_values(by=['lift', 'confidence'], ascending=False)


In [58]:
# rules_sorted = rules_sorted[rules_sorted['ante_len'] == 4]

In [59]:
# Keep rules whose consequent is exactly one item, and that item is an outcome.
has_single_consequent = rules_sorted['cons_len'].eq(1)
first_item_is_outcome = rules_sorted['consequents'].apply(
    lambda items: next(iter(items)).startswith('outcome_')
)
rules_sorted = rules_sorted[has_single_consequent & first_item_is_outcome]


In [60]:
# Keep rules whose antecedent has exactly two items and includes both a
# batter archetype and a pitcher archetype.
has_two_items = rules_sorted['ante_len'].eq(2)
pairs_batter_with_pitcher = rules_sorted['antecedents'].apply(
    lambda items: (
        any(label.startswith('batter_arch_') for label in items)
        and any(label.startswith('pitcher_arch_') for label in items)
    )
)
rules_sorted = rules_sorted[has_two_items & pairs_batter_with_pitcher]


In [61]:
# Write the filtered, sorted rules to CSV (index omitted).
# NOTE(review): the filename says "by_conf_support", but rules_sorted is
# ordered by lift and then confidence — consider renaming once any downstream
# readers of this path are known.
rules_sorted.to_csv("./2019/rules_sorted_by_conf_support.csv", index=False)
