In [26]:
import pandas as pd

In [27]:
# Load the pitching data from a CSV in the current working directory
# (presumably one row per pitching substitution -- confirm against the file).
# Later cells read GAME_ID, PIT_ID, SAVE_PIT_ID, HOME_TEAM, INN_CT, the two
# score columns and the onFirst/onSecond/onThird columns from this frame.
df_pitch_subs = pd.read_csv('pitching_subs.csv')

In [28]:
df_pitch_subs['made_save'] = None
df_next_pitcher = df_pitch_subs.shift(-1).reset_index()


def _blew_save(entry_row, exit_row, team_score_col, opp_score_col):
    """Return True if the pitcher left without a lead after a save-type appearance.

    Parameters
    ----------
    entry_row : row of df_pitch_subs for the pitcher's own entry into the game.
    exit_row : the following row (same game, same team), used as the state of
        the game when the pitcher stopped pitching.
    team_score_col, opp_score_col : names of the score columns for the
        pitcher's team and for the opponent.

    Only the exit state decides whether the lead is gone (team tied or behind).
    Whether the pitcher actually entered with a lead is NOT checked here.
    """
    lost_lead = exit_row[team_score_col] <= exit_row[opp_score_col]
    if not lost_lead:
        return False

    # Difference of the inning counters, used as a stand-in for innings pitched
    inn_diff = exit_row['INN_CT'] - entry_row['INN_CT']
    init_score_diff = entry_row[team_score_col] - entry_row[opp_score_col]
    runners_on = sum([entry_row['onFirst'], entry_row['onSecond'], entry_row['onThird']])

    # Definition of Save
    #1 He enters the game with a lead of no more than three runs and pitches for at least one inning
    #2 He enters the game, regardless of the count, with the potential tying run either on base, at bat or on deck
    #3 He pitches for at least three innings.
    return bool(
        (init_score_diff <= 3 and inn_diff >= 1)                                        # 1
        or (inn_diff >= 3)                                                              # 3
        or (entry_row[team_score_col] <= entry_row[opp_score_col] + 2 + runners_on)     # 2
    )


# Our csv already tells us which pitcher gets the save, so we will leave that calculation to the scorer
# Need to check if any relief pitchers either blow, or neither get a S/BS
# For now, we flag a blown save from the state of the game when they come in and when they get pulled
# made_save will be True (completed save), False (blew save), or None (neither completed nor blew)
made_save_flags = []
for (_, cur_row), (_, next_row) in zip(df_pitch_subs.iterrows(), df_next_pitcher.iterrows()):
    # default value for a made_save is None
    made_save = None

    # cur_row is the state of the game when the pitcher came in,
    # next_row is the state of the game when the pitcher stopped pitching.
    # Make sure next event is the same game, and the same team
    followed_by_teammate = (
        cur_row['GAME_ID'] == next_row['GAME_ID']
        and cur_row['HOME_TEAM'] == next_row['HOME_TEAM']
    )

    if followed_by_teammate:
        # home
        if cur_row['HOME_TEAM'] == 1:
            if _blew_save(cur_row, next_row, 'HOME_SCORE_CT', 'AWAY_SCORE_CT'):
                made_save = False
        # away
        elif cur_row['HOME_TEAM'] == 0:
            if _blew_save(cur_row, next_row, 'AWAY_SCORE_CT', 'HOME_SCORE_CT'):
                made_save = False

    # Use Retrosheet to determine if this pitcher was the one who got a save
    elif cur_row['PIT_ID'] == cur_row['SAVE_PIT_ID']:
        made_save = True

    made_save_flags.append(made_save)

# Assign every flag in one step, in row order. This does not rely on the index
# labels being 0..n-1 (the previous per-row .iloc[label, ...] write did).
# dtype=object keeps None / True / False as distinct values.
df_pitch_subs['made_save'] = pd.Series(made_save_flags, index=df_pitch_subs.index, dtype=object)
    


In [29]:
# We already know whether a pitcher lost a lead, but we don't know if they were even entering into a save opportunity
# A Save Opportunity (SvO) happens when a relief pitcher enters the game with a lead
# Above, we didn't check to make sure they were in the lead when they entered the game
def save_sit(row):
    """Tell whether the pitcher's team is strictly ahead in the given row.

    `row` must support lookup by 'HOME_TEAM', 'HOME_SCORE_CT' and
    'AWAY_SCORE_CT'. HOME_TEAM == 1 is treated as the home side and
    HOME_TEAM == 0 as the away side; any other value yields False.
    A tied score is never a save situation.
    """
    side = row['HOME_TEAM']
    if side == 1:
        ahead = row['HOME_SCORE_CT'] > row['AWAY_SCORE_CT']
    elif side == 0:
        ahead = row['HOME_SCORE_CT'] < row['AWAY_SCORE_CT']
    else:
        ahead = False
    # Always hand back a plain Python bool
    return bool(ahead)
# Flag each row with whether the pitcher's team was strictly ahead at entry (see save_sit)
df_pitch_subs['SAVE_SIT'] = df_pitch_subs.apply(save_sit, axis=1)

# Disregard every pitcher who did not enter into a SvO. reset_index() keeps the
# previous row labels in a new 'index' column. SAVE_PIT_ID and the SAVE_SIT flag
# are dropped because they are not needed after this point.
df_pitch_subs = (
    df_pitch_subs[df_pitch_subs['SAVE_SIT'] == True]
    .reset_index()
    .drop(['SAVE_PIT_ID', 'SAVE_SIT'], axis=1)
)

In [35]:
# Final data frame: a copy of df_pitch_subs as it stands at this point, bound to
# its own name so the result is kept separately from the working frame
df_saves = df_pitch_subs.copy()

In [37]:
# Print a summary of df_saves: index range, columns, non-null counts, dtypes and memory usage
df_saves.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23180 entries, 0 to 23179
Data columns (total 13 columns):
index            23180 non-null int64
GAME_ID          23180 non-null object
PIT_ID           23180 non-null object
INN_CT           23180 non-null int64
HOME_TEAM        23180 non-null int64
OUTS_CT          23180 non-null int64
AWAY_SCORE_CT    23180 non-null int64
HOME_SCORE_CT    23180 non-null int64
onFirst          23180 non-null int64
onSecond         23180 non-null int64
onThird          23180 non-null int64
FULL_NAME        23180 non-null object
made_save        1584 non-null object
dtypes: int64(9), object(4)
memory usage: 2.3+ MB
