In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def read_csv_files_from_folder(folder_path):
    """
    Read every CSV file under a folder (searched recursively) into one dataframe.

    Files are visited in a deterministic order: within each directory the
    ``.csv`` files are read in sorted name order, and sub-directories are
    descended in sorted name order. The row order of the result therefore does
    not depend on the order in which the filesystem happens to list entries.

    Parameters
    ----------
    folder_path : str
        Root folder to search. Only file names ending in ``.csv`` are read.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``.csv`` file is found under ``folder_path`` (including when the
        folder does not exist, since ``os.walk`` then yields nothing).
    """
    csv_paths = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting ``dirs`` in place controls the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.csv'):
                csv_paths.append(os.path.join(root, file))

    if not csv_paths:
        # pd.concat([]) would raise a ValueError that does not say which folder was empty.
        raise ValueError(f"No CSV files found under {folder_path!r}")

    return pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

# Load the four per-game tables. Each table is cached as one CSV under ../data/combined.
# The cache is checked per file (not per folder), so a combined folder that is only
# partially populated -- e.g. after an interrupted first run -- is repaired instead of
# making every later run fail with FileNotFoundError.
RAW_DATA_DIR = '../data/raw'
COMBINED_DATA_DIR = '../data/combined'


def _load_table(table_name):
    """
    Return the table `table_name`, using the combined-folder cache when present.

    If ``../data/combined/<table_name>.csv`` exists it is read directly. Otherwise all
    CSV files under ``../data/raw/<table_name>`` are concatenated and the result is
    written to the combined folder (created if needed) for the next run.

    In both cases the 'Unnamed: 0' column is dropped from what was read. The cache
    file is written with the dataframe index, so reading it back yields an
    'Unnamed: 0' column again; this keeps the on-disk format unchanged.
    """
    combined_path = f'{COMBINED_DATA_DIR}/{table_name}.csv'
    if os.path.exists(combined_path):
        return pd.read_csv(combined_path).drop(columns=['Unnamed: 0'])

    table = read_csv_files_from_folder(f'{RAW_DATA_DIR}/{table_name}').drop(columns=['Unnamed: 0'])
    os.makedirs(COMBINED_DATA_DIR, exist_ok=True)
    table.to_csv(combined_path)
    return table


ball_pos_df = _load_table('ball_pos')
game_events_df = _load_table('game_events')
game_info_df = _load_table('game_info')
player_pos_df = _load_table('player_pos')

# team_info is a single raw file, so it is always read from the raw folder; a copy is
# placed in the combined folder if one is not there yet.
team_info = pd.read_csv(f'{RAW_DATA_DIR}/team_info.csv')
team_info_combined_path = f'{COMBINED_DATA_DIR}/team_info.csv'
if not os.path.exists(team_info_combined_path):
    os.makedirs(COMBINED_DATA_DIR, exist_ok=True)
    team_info.to_csv(team_info_combined_path)

In [3]:
# Map each game_events_df column name to its positional index.
col_dict = {col: i for i, col in enumerate(game_events_df.columns)}
# NOTE(review): hard-coded positions for the bounce columns. They are not used in this
# cell -- presumably they index columns of the derived frames built below; confirm
# before changing or removing them.
col_dict['bounce_1'] = 7
col_dict['bounce_2'] = 8
col_dict['bounce_3'] = 9

# Event and position codes this cell relies on.
EVENT_BALL_ACQUIRED = 2  # ball received by a fielder
EVENT_THROW = 3
EVENT_BALL_IN_PLAY = 4   # NOTE(review): presumably "ball hit into play" -- confirm against the data dictionary
EVENT_BOUNCE = 16
FIRST_BASEMAN = 3
# player_position values 4..6 are treated as "all other infielders"
OTHER_INFIELDER_MIN, OTHER_INFIELDER_MAX = 4, 6

# Mask for all ground ball events: a code-4 event immediately followed by a bounce.
# NOTE(review): shift(-1) looks at the next row of the concatenated log, so this assumes
# rows are in chronological order and does not check game boundaries -- confirm.
gb_in_play_mask = (
    (game_events_df['event_code'] == EVENT_BALL_IN_PLAY) &
    (game_events_df['event_code'].shift(-1).eq(EVENT_BOUNCE))
)

# reset_index() keeps the original row label of each hit as 'game_event_index'.
# Those labels are used as row *positions* below, which is valid because
# game_events_df carries a default 0..n-1 index when loaded by the previous cell.
gb_df = game_events_df[gb_in_play_mask].reset_index().rename(columns={'index': 'game_event_index'})

any_inf_gb_ind = [] # ground balls to any infielder
fb_gb_ind = [] # ground balls to first base
inf_gb_with_throw_1b_ind = [] # ground balls to non first base infielders with a throw to first
acquire_time_list = [] # time when the infielder acquires the ball
throw_time_list = [] # time when the infielder throws the ball
receieve_time_list = [] # time when the first baseman receives the ball (name spelling kept for downstream cells)

# timestamps of first 3 bounces for ground ball to first baseman
fb_bounce_1, fb_bounce_2, fb_bounce_3 = [], [], []
# timestamps of first 3 bounces for ground ball
bounce_1, bounce_2, bounce_3 = [], [], []

# Pull the three columns the scan needs into arrays once, instead of doing a
# DataFrame.iloc scalar lookup for every event that is inspected.
event_codes = game_events_df['event_code'].to_numpy()
player_positions = game_events_df['player_position'].to_numpy()
timestamps = game_events_df['timestamp'].to_numpy()
n_events = len(event_codes)


def _is_event(idx, code):
    """Return True when `idx` is inside the event log and the event there has code `code`."""
    return idx < n_events and bool(event_codes[idx] == code)


def _consume_bounces(idx):
    """
    Skip the run of consecutive bounce events starting at `idx`.

    Returns (next_idx, bounce_times): the index of the first non-bounce event (or
    n_events if the log ends first) and the timestamps of the bounces skipped.
    """
    bounce_times = []
    while _is_event(idx, EVENT_BOUNCE):
        bounce_times.append(timestamps[idx])
        idx += 1
    return idx, bounce_times


for hit_index in gb_df['game_event_index']:
    # Bounces that directly follow the hit; pad with NaN in case the ball bounced
    # less than 3 times, and ignore any bounce after the third.
    cursor, hit_bounce_times = _consume_bounces(hit_index + 1)
    bounce_1_t, bounce_2_t, bounce_3_t = (hit_bounce_times[:3] + [np.nan] * 3)[:3]

    # The play only counts if the event right after the bounces is a fielder
    # receiving the ball. (The bounds check inside _is_event also covers a log
    # that ends in the middle of a play, which previously raised IndexError.)
    if not _is_event(cursor, EVENT_BALL_ACQUIRED):
        continue

    fielder_position = player_positions[cursor]
    if fielder_position == FIRST_BASEMAN:
        fb_gb_ind.append(cursor)
        # An infielder fielded this, so add the index
        any_inf_gb_ind.append(cursor)

        fb_bounce_1.append(bounce_1_t)
        fb_bounce_2.append(bounce_2_t)
        fb_bounce_3.append(bounce_3_t)
    elif OTHER_INFIELDER_MIN <= fielder_position <= OTHER_INFIELDER_MAX:
        # An infielder fielded this, so add the index
        any_inf_gb_ind.append(cursor)

        acquire_index = cursor
        acquire_time = timestamps[cursor]

        # The very next event must be the throw.
        cursor += 1
        if not _is_event(cursor, EVENT_THROW):
            continue
        throw_time = timestamps[cursor]

        # skip bounces on the throw
        cursor, _ = _consume_bounces(cursor + 1)

        # if the throw is received by the first baseman, record the play
        if _is_event(cursor, EVENT_BALL_ACQUIRED) and player_positions[cursor] == FIRST_BASEMAN:
            receive_time = timestamps[cursor]
            acquire_time_list.append(acquire_time)
            throw_time_list.append(throw_time)
            receieve_time_list.append(receive_time)
            inf_gb_with_throw_1b_ind.append(acquire_index)
            bounce_1.append(bounce_1_t)
            bounce_2.append(bounce_2_t)
            bounce_3.append(bounce_3_t)

# Rows of the acquisition events for every ground ball fielded by an infielder
# (original index kept, i.e. not reset).
any_inf_gb_df = game_events_df.iloc[any_inf_gb_ind].copy()

# Acquisition events by the first baseman, plus the first three bounce timestamps.
first_base_gb_df = (
    game_events_df.iloc[fb_gb_ind]
    .reset_index()
    .rename(columns={'index': 'game_event_index'})
)
first_base_gb_df['bounce_1'] = fb_bounce_1
first_base_gb_df['bounce_2'] = fb_bounce_2
first_base_gb_df['bounce_3'] = fb_bounce_3

# Acquisition events by the other infielders that were followed by a throw
# received by the first baseman, plus timing columns.
inf_gb_with_throw_1b_df = game_events_df.iloc[inf_gb_with_throw_1b_ind].copy()
inf_gb_with_throw_1b_df['acquire_time'] = acquire_time_list
inf_gb_with_throw_1b_df['throw_time'] = throw_time_list
inf_gb_with_throw_1b_df['1b_receive_time'] = receieve_time_list
inf_gb_with_throw_1b_df['bounce_1'] = bounce_1
inf_gb_with_throw_1b_df['bounce_2'] = bounce_2
inf_gb_with_throw_1b_df['bounce_3'] = bounce_3
inf_gb_with_throw_1b_df = (
    inf_gb_with_throw_1b_df
    .reset_index()
    .rename(columns={'index': 'game_event_index'})
)

# print lengths of dataframes
print(f'Number of ground balls in play: {len(gb_df)}')
print(f'Number of ground balls in play fielded by infielder: {len(any_inf_gb_df)}')
print(f'Number of ground balls in play hit to first baseman: {len(first_base_gb_df)}')
print(f'Number of ground balls in play fielded by infielder and thrown to first: {len(inf_gb_with_throw_1b_df)}')

Number of ground balls in play: 2786
Number of ground balls in play fielded by infielder: 1355
Number of ground balls in play hit to first baseman: 198
Number of ground balls in play fielded by infielder and thrown to first: 837


In [4]:
# Save dataframes to csv files.
# to_csv does not create missing parent folders, so make sure the output folder
# exists first; exist_ok=True makes re-running this cell safe.
PLAYS_DIR = '../data/derived/plays'
os.makedirs(PLAYS_DIR, exist_ok=True)

# The dataframe index is written as the first column (to_csv default), as before.
gb_df.to_csv(f'{PLAYS_DIR}/all_gb_df.csv')
any_inf_gb_df.to_csv(f'{PLAYS_DIR}/any_inf_gb_df.csv')
first_base_gb_df.to_csv(f'{PLAYS_DIR}/first_base_gb_df.csv')
inf_gb_with_throw_1b_df.to_csv(f'{PLAYS_DIR}/inf_gb_with_throw_1b_df.csv')