In [303]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB

# Display option: passing None lifts the cap on how many columns are shown
# when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

In [262]:
# Read both pitch-level CSV exports in one pass: the first becomes the main
# frame, the second holds the post-season rows.
df, playoff_df = (
    pd.read_csv(csv_name)
    for csv_name in ('reg_season_statcast_bangs.csv', 'post_season_statcast.csv')
)

In [263]:
# Tag every post-season row with the sentinel label 2; later cells use
# `bangs != 2` to separate training rows from the rows to be scored.
playoff_df['bangs'] = 2

In [264]:
# Stack the post-season rows under the main frame with a fresh 0..n-1 index.
# NOTE: this rebinds `df`, so re-running the cell appends the playoff rows again.
df = pd.concat([df, playoff_df], ignore_index=True)

In [265]:
# Replace the two raw score columns with a single home-minus-away differential.
df['score_diff'] = df['home_score'] - df['away_score']
df = df.drop(columns=['home_score', 'away_score'])

In [266]:
# Print the column listing with non-null counts and dtypes for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9567 entries, 0 to 9566
Data columns (total 92 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   pitch_type                       9531 non-null   object 
 1   game_date                        9567 non-null   object 
 2   release_speed                    9531 non-null   float64
 3   release_pos_x                    9531 non-null   float64
 4   release_pos_z                    9531 non-null   float64
 5   player_name                      9567 non-null   object 
 6   batter                           9567 non-null   int64  
 7   pitcher                          9567 non-null   int64  
 8   events                           2475 non-null   object 
 9   description                      9567 non-null   object 
 10  spin_dir                         0 non-null      float64
 11  spin_rate_deprecated             0 non-null      float64
 12  break_angle_deprecat

In [361]:
# Columns that travel with each row but are never fed to the classifier:
# the two outcome deltas and the label column.
non_feature_cols = ['delta_home_win_exp', 'delta_run_exp', 'bangs']

# Working column set: model inputs first, followed by the non-feature columns.
# (Replaces earlier drafts that selected columns by a null-count threshold or
# from a wider hand-picked list.)
cols = [
    'pitch_type', 'batter', 'stand', 'p_throws', 'balls', 'strikes',
    'outs_when_up', 'inning', 'pitch_number', 'score_diff',
    *non_feature_cols,
]

In [351]:
# Independent copy restricted to the working columns, so later edits do not
# touch `df`.
small_df = df.loc[:, cols].copy()

In [358]:
# Normalise the label column to three values:
#   0 -> `bangs` was missing for the row
#   2 -> sentinel already present on the post-season rows
#   1 -> every other value
#
# The filled column is assigned back rather than calling
# `fillna(..., inplace=True)` on `small_df['bangs']`: that chained in-place
# call acts on a temporary under pandas Copy-on-Write and leaves `small_df`
# unchanged, after which the next line would relabel every NaN row as 1
# (NaN compares unequal to both 0 and 2).
small_df['bangs'] = small_df['bangs'].fillna(0)
small_df.loc[~small_df['bangs'].isin([0, 2]), 'bangs'] = 1

In [359]:
# Keep only rows with no missing value in any column, then renumber from zero.
small_df = small_df.dropna().reset_index(drop=True)

In [360]:
# Preview the first rows of the cleaned working frame.
small_df.head()

Unnamed: 0,pitch_type,batter,stand,p_throws,balls,strikes,outs_when_up,inning,pitch_number,score_diff,delta_home_win_exp,delta_run_exp,bangs
0,SI,543807,R,R,0,0,0,1,1,0,0.0,0.036,0
1,SI,543807,R,R,1,0,0,1,2,0,0.0,-0.046,0
2,SL,543807,R,R,1,1,0,1,3,0,0.0,0.05,0
3,SI,543807,R,R,2,1,0,1,4,0,0.102,0.96,0
4,FF,608324,R,R,0,0,0,1,1,1,0.03,0.387,0


In [362]:
# Integer-encode every feature column via its categorical codes; the columns
# named in `non_feature_cols` are carried over untouched. The codes are derived
# from the full frame, so training and held-out rows share one encoding.
cat_df = small_df.assign(**{
    name: small_df[name].astype('category').cat.codes
    for name in small_df.columns
    if name not in non_feature_cols
})

In [363]:
# True for rows used in training; rows labelled 2 are held out for scoring.
train_mask = cat_df['bangs'].ne(2)

In [365]:
# Feature matrices: select the rows first, then strip the non-feature columns.
x_train = cat_df.loc[train_mask].drop(columns=non_feature_cols)
x_test = cat_df.loc[~train_mask].drop(columns=non_feature_cols)

# Training labels, re-encoded as consecutive integer codes.
y_train = cat_df.loc[train_mask, 'bangs'].astype('category').cat.codes

In [366]:
# Categorical naive Bayes classifier with default settings.
# NOTE(review): with defaults the number of categories per feature is inferred
# from the rows passed to fit; if x_test contains a code above the training
# maximum for some feature, prediction may fail. Consider `min_categories`.
# TODO confirm against the data.
clf = CategoricalNB()

In [367]:
# Fit the classifier on the training features and their encoded labels.
clf.fit(x_train, y_train)

CategoricalNB()

In [368]:
# Spot-check on the training row labelled 0 (kept as a one-row frame).
# Only the final expression is rendered as the cell output.
sample_row = x_train.loc[[0]]
clf.predict(sample_row)
clf.predict_proba(sample_row)

array([[0.98652432, 0.01347568]])

In [390]:
# Score the held-out rows: hard class predictions plus class probabilities.
bang_preds = clf.predict(x_test)
bang_probability = clf.predict_proba(x_test)

# Keep only the second probability column (index 1) as a named Series.
bang_probability = pd.Series(bang_probability[:, 1], name='bang_probability')

In [401]:
# Outcome deltas for the held-out rows, renumbered from zero.
test_deltas = (
    small_df[~train_mask][['delta_home_win_exp', 'delta_run_exp']]
    .reset_index(drop=True)
)

In [413]:
# Result table for the held-out rows: the observed deltas, the predicted
# probability, and each delta weighted by that probability.
res = (
    small_df.loc[~train_mask, ['delta_home_win_exp', 'delta_run_exp']]
    .reset_index(drop=True)
    # Aligned on the fresh 0..n-1 index shared with `bang_probability`.
    .assign(bang_probability=bang_probability)
    .assign(
        exp_delta_win_exp=lambda frame: frame['bang_probability'] * frame['delta_home_win_exp'],
        exp_delta_runs=lambda frame: frame['bang_probability'] * frame['delta_run_exp'],
    )
)

In [416]:
# Total probability-weighted run-expectancy change over the held-out rows.
res['exp_delta_runs'].sum()

1.7475203186777697

In [417]:
# Total probability-weighted home win-expectancy change over the held-out rows.
res['exp_delta_win_exp'].sum()

0.2635107842797165