In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [13]:
# Load the combined game-event log that the following cells analyse.
df = pd.read_csv('../../data/combined/game_events.csv')

# Map every column name of `df` to its integer position, for use with `.iloc`.
col_dict = {name: position for position, name in enumerate(df.columns)}

# Fixed positions for the bounce-time columns.
# NOTE(review): 7-9 presumably assume `df` has exactly seven columns and that
# the bounce columns are appended right after them -- confirm before use.
col_dict.update({'bounce_1': 7, 'bounce_2': 8, 'bounce_3': 9})

In [15]:
# Keep ball-in-play events (code 4) whose following row is a bounce (code 16)
# or has no event code at all (shift(-1) yields NaN for the final row of `df`).
next_event_code = df['event_code'].shift(-1)
is_ball_in_play = df['event_code'] == 4
next_is_bounce_or_missing = next_event_code.eq(16) | next_event_code.isna()
gb_in_play_mask = is_ball_in_play & next_is_bounce_or_missing

# Carry each row's original `df` index label along as `game_event_index`.
gb_df = (
    df[gb_in_play_mask]
    .reset_index()
    .rename(columns={'index': 'game_event_index'})
)

# Walk the event log after each ground ball and classify the play.
#
# Starting on the row after the ball-in-play event, the scan
#   1. skips consecutive bounce events, remembering the timestamps of the
#      first three,
#   2. requires the next event to be a ball acquisition,
#   3. records the play as "fielded by the first baseman" when the acquiring
#      player is position 3, or
#   4. for positions 4-6, requires an immediate throw whose next non-bounce
#      event is an acquisition by the first baseman.
# Plays matching neither pattern are dropped. Reaching the end of the event
# log at any step is treated as "no match" instead of raising IndexError.
#
# The codes below mirror the literals and inline comments used in this
# notebook. NOTE(review): confirm the meanings against the data dictionary.
EVENT_BALL_ACQUIRED = 2
EVENT_THROW = 3
EVENT_BOUNCE = 16
POSITION_FIRST_BASE = 3
POSITION_INFIELD_MIN, POSITION_INFIELD_MAX = 4, 6  # all other infielders
MAX_RECORDED_BOUNCES = 3

# Read the three scanned columns once as arrays; per-cell `df.iloc` lookups
# inside a Python loop are far slower and give the same scalars.
# NOTE(review): like the positional lookups this replaces, the scan assumes
# `game_event_index` labels equal row positions (default RangeIndex on `df`).
event_codes = df['event_code'].to_numpy()
player_positions = df['player_position'].to_numpy()
timestamps = df['timestamp'].to_numpy()
n_events = len(df)

# Row positions (in `df`) of the acquisition event for each kept play.
fb_gb_ind = []
inf_gb_with_throw_1b_ind = []

# Timing columns for the infielder-throws-to-first plays.
acquire_time_list = []
throw_time_list = []
receieve_time_list = []  # (sic) name kept so later cells keep working

# Bounce timestamps (NaN when the ball bounced fewer times).
bounce_1, bounce_2, bounce_3 = [], [], []
fb_bounce_1, fb_bounce_2, fb_bounce_3 = [], [], []

for ball_in_play_index in gb_df['game_event_index']:
    cursor = ball_in_play_index + 1

    # skip the bounces, keeping the timestamps of the first three
    bounce_times = [np.nan] * MAX_RECORDED_BOUNCES
    n_bounces = 0
    while cursor < n_events and event_codes[cursor] == EVENT_BOUNCE:
        if n_bounces < MAX_RECORDED_BOUNCES:
            bounce_times[n_bounces] = timestamps[cursor]
        n_bounces += 1
        cursor += 1

    # the next event must be the ball being acquired by a fielder
    if cursor >= n_events or event_codes[cursor] != EVENT_BALL_ACQUIRED:
        continue

    fielder = player_positions[cursor]

    if fielder == POSITION_FIRST_BASE:
        fb_gb_ind.append(cursor)
        fb_bounce_1.append(bounce_times[0])
        fb_bounce_2.append(bounce_times[1])
        fb_bounce_3.append(bounce_times[2])
        continue

    if not (POSITION_INFIELD_MIN <= fielder <= POSITION_INFIELD_MAX):
        continue

    # another infielder fielded it: the very next event must be a throw
    acquire_index = cursor
    throw_index = acquire_index + 1
    if throw_index >= n_events or event_codes[throw_index] != EVENT_THROW:
        continue

    # skip bounces on the throw
    receive_index = throw_index + 1
    while receive_index < n_events and event_codes[receive_index] == EVENT_BOUNCE:
        receive_index += 1

    # keep the play only if the throw is received by the first baseman
    received_at_first = (
        receive_index < n_events
        and event_codes[receive_index] == EVENT_BALL_ACQUIRED
        and player_positions[receive_index] == POSITION_FIRST_BASE
    )
    if received_at_first:
        acquire_time_list.append(timestamps[acquire_index])
        throw_time_list.append(timestamps[throw_index])
        receieve_time_list.append(timestamps[receive_index])
        inf_gb_with_throw_1b_ind.append(acquire_index)
        bounce_1.append(bounce_times[0])
        bounce_2.append(bounce_times[1])
        bounce_3.append(bounce_times[2])

# Ground balls fielded directly by the first baseman: take the acquisition
# rows, keep their `df` index as `game_event_index`, then attach bounce times.
first_base_gb_df = (
    df.iloc[fb_gb_ind]
    .reset_index()
    .rename(columns={'index': 'game_event_index'})
)
fb_extra_columns = {
    'bounce_1': fb_bounce_1,
    'bounce_2': fb_bounce_2,
    'bounce_3': fb_bounce_3,
}
for column_name, column_values in fb_extra_columns.items():
    first_base_gb_df[column_name] = column_values

# Ground balls fielded by another infielder and thrown to first: same layout,
# plus the acquire / throw / receive timestamps collected during the scan.
inf_gb_with_throw_1b_df = (
    df.iloc[inf_gb_with_throw_1b_ind]
    .reset_index()
    .rename(columns={'index': 'game_event_index'})
)
inf_extra_columns = {
    'acquire_time': acquire_time_list,
    'throw_time': throw_time_list,
    '1b_receive_time': receieve_time_list,
    'bounce_1': bounce_1,
    'bounce_2': bounce_2,
    'bounce_3': bounce_3,
}
for column_name, column_values in inf_extra_columns.items():
    inf_gb_with_throw_1b_df[column_name] = column_values

# Report how many plays ended up in each of the three frames
print(
    f'Number of ground balls in play: {len(gb_df)}',
    f'Number of ground balls in play hit to first baseman: {len(first_base_gb_df)}',
    f'Number of ground balls in play fielded by infielder and thrown to first: {len(inf_gb_with_throw_1b_df)}',
    sep='\n',
)

Number of ground balls in play: 2786
Number of ground balls in play hit to first baseman: 198
Number of ground balls in play fielded by infielder and thrown to first: 837


In [7]:
# Preview the first five ball-in-play rows kept by the ground-ball filter
gb_df.head(n=5)

Unnamed: 0,game_event_index,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,28,1903_01_TeamNE_TeamA2,10,2.0,10,177335,10,4
1,60,1903_01_TeamNE_TeamA2,18,4.0,18,458331,10,4
2,87,1903_01_TeamNE_TeamA2,25,6.0,25,612606,10,4
3,105,1903_01_TeamNE_TeamA2,28,7.0,28,711639,10,4
4,134,1903_01_TeamNE_TeamA2,36,8.0,36,1009761,10,4


In [8]:
# Preview the first five ground balls fielded by the first baseman
first_base_gb_df.head(n=5)

Unnamed: 0,game_event_index,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,bounce_1,bounce_2,bounce_3
0,1737,1903_16_TeamNI_TeamA3,121,35.0,122,3763018,3,2,3761718,3762518.0,
1,1750,1903_16_TeamNI_TeamA3,124,36.0,125,3840368,3,2,3839168,3839718.0,3840068.0
2,2526,1902_02_TeamMG_TeamA3,43,10.0,43,1395969,3,2,1394319,1395869.0,
3,4374,1903_08_TeamNJ_TeamB,171,52.0,172,6068717,3,2,6067067,6068317.0,
4,6101,1903_23_TeamNA_TeamA1,124,34.0,128,3935444,3,2,3934794,3935194.0,


In [9]:
# Preview the first five ground balls fielded by an infielder and thrown to first
inf_gb_with_throw_1b_df.head(n=5)

Unnamed: 0,game_event_index,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,acquire_time,throw_time,1b_receive_time,bounce_1,bounce_2,bounce_3
0,64,1903_01_TeamNE_TeamA2,18,4.0,18,460212,6,2,460212,461301,462225,458529,459255.0,459816.0
1,148,1903_01_TeamNE_TeamA2,38,9.0,38,1092459,5,2,1092459,1094175,1095198,1091832,1092294.0,
2,214,1903_01_TeamNE_TeamA2,58,13.0,58,1616896,6,2,1616896,1618117,1619239,1615081,1616104.0,
3,441,1903_01_TeamNE_TeamA2,117,29.0,117,3638082,6,2,3638082,3639039,3640161,3636300,3637587.0,
4,453,1903_01_TeamNE_TeamA2,119,30.0,119,3794238,6,2,3794238,3795327,3796416,3792093,3793149.0,3793776.0


In [10]:
# Save dataframes to csv files.
# The output directory is created first so the cell also works on a fresh
# checkout where `data/derived/plays` does not exist yet (to_csv does not
# create missing parent directories).
PLAYS_OUTPUT_DIR = Path('../../data/derived/plays')
PLAYS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The row index is still written as the unnamed first CSV column (pandas
# default), so the file layout is unchanged for existing readers.
gb_df.to_csv(PLAYS_OUTPUT_DIR / 'gb_df.csv')
first_base_gb_df.to_csv(PLAYS_OUTPUT_DIR / 'first_base_gb_df.csv')
inf_gb_with_throw_1b_df.to_csv(PLAYS_OUTPUT_DIR / 'inf_gb_with_throw_1b_df.csv')