In [1]:
import pandas as pd

In [2]:
df_pitch_subs = pd.read_csv('pitching_subs.csv')

In [3]:
df_pitch_subs['made_save'] = None
df_pitch_subs['IP'] = 0.0
df_next_pitcher = df_pitch_subs.shift(-1).reset_index()

# our csv already tells us which pitcher gets the save, so we will leave that calculation to the scorer
# need to check if any relief pitchers either blow,or neither get a S/BS
# for now, let's just see if they blow it based on the state of the game when they come in, and get pulled/end of game
# we know that if they come in with a lead, and leave with the game tied/losing, they blew it
for (cur_index, cur_row), (_, next_row) in zip(df_pitch_subs.iterrows(), df_next_pitcher.iterrows()):
    # made_save will be True(completed save), False(blew save), or None(neither completed or blew)\
    made_save = None
    
    # We will check the state of the game when the pitcher came in (cur_row)
    # as well as the state of the game with the pitcher stopped pitching (next_row)
    
    # make sure next event is the same game, and the same team
    if cur_row['GAME_ID'] == next_row['GAME_ID'] and cur_row['HOME_TEAM'] == next_row['HOME_TEAM']:
        
        # IP will be number if innings pitched
        IP = float(next_row['INN_CT'] - cur_row['INN_CT']) + float((1.0/3.0)*next_row['OUTS_CT']) - float((1.0/3.0)*cur_row['OUTS_CT'])
        
        # home
        if cur_row['HOME_TEAM'] == 1:
            # blew lead if winning in initial state, and losing in their last state
            blew_lead = next_row['HOME_SCORE_CT'] <= next_row['AWAY_SCORE_CT']
            if blew_lead:
                # Definition of Save Opportunity
                # He is credited with at least ⅓ of an inning pitched; and
                #He satisfies one of the following conditions:
                    #1 He enters the game with a lead of no more than three runs and pitches for at least one inning
                    #2 He enters the game, regardless of the count, with the potential tying run either on base, at bat or on deck
                    #3 He pitches for at least three innings.
                    
                if IP > 0:
                    init_score_diff = cur_row['HOME_SCORE_CT'] - cur_row['AWAY_SCORE_CT']
                
                    # save #1 or #2 or #3
                    if (init_score_diff <= 3 and inn_diff >= 1) or (IP >= 3) or (cur_row['HOME_SCORE_CT'] <= cur_row['AWAY_SCORE_CT'] + 2 + sum([cur_row['RUNNER_FIRST'], cur_row['RUNNER_SECOND'], cur_row['RUNNER_THIRD']])):
                        made_save = False     
        #away
        elif cur_row['HOME_TEAM'] == 0:
            # blew lead if winning in initial state, and losing in their last state
            blew_lead = next_row['HOME_SCORE_CT'] >= next_row['AWAY_SCORE_CT']
            if blew_lead:
                inn_diff = next_row['INN_CT'] - cur_row['INN_CT']
                init_score_diff = cur_row['AWAY_SCORE_CT'] - cur_row['HOME_SCORE_CT']
                
                # Save #1 or #2 or #3
                if (init_score_diff <= 3 and IP >= 1) or (IP >= 3) or (cur_row['AWAY_SCORE_CT'] <= cur_row['HOME_SCORE_CT'] + 2 + sum([cur_row['RUNNER_FIRST'], cur_row['RUNNER_SECOND'], cur_row['RUNNER_THIRD']])):
                    made_save = False
    else:
        # calculate IP if last pitcher of game
        IP = cur_row['TOTAL_INN'] - cur_row['INN_CT'] + 1 - float((1.0/3.0)*cur_row['OUTS_CT'])
    # Use Retrosheet to determine if this pitcher was the one who got a save
    if cur_row['PIT_ID'] == cur_row['SAVE_PIT_ID']:
        made_save = True
    
    
    
    # place made_save, IP into the dataframe at the correct index
    df_pitch_subs.iloc[cur_index, df_pitch_subs.columns.get_loc('made_save')] = made_save
    df_pitch_subs.iloc[cur_index, df_pitch_subs.columns.get_loc('IP')] = float(round(IP, 1))

df_pitch_subs.head()

Unnamed: 0,GAME_ID,EVENT_ID,INN_CT,TOTAL_INN,HOME_TEAM,OUTS_CT,HOME_SCORE_CT,AWAY_SCORE_CT,RUNNER_FIRST,RUNNER_SECOND,RUNNER_THIRD,PIT_ID,SAVE_PIT_ID,FULL_NAME,made_save,IP
0,ANA201004050,54,7,9,1,0,4,3,0,0,0,jepsk001,fuenb001,Kevin Jepsen,,1.0
1,ANA201004050,64,8,9,1,0,4,3,0,0,0,rodnf001,fuenb001,Fernando Rodney,,1.0
2,ANA201004050,74,9,9,1,0,6,3,0,0,0,fuenb001,fuenb001,Brian Fuentes,True,1.0
3,ANA201004050,44,5,9,0,2,4,3,1,0,1,craij001,fuenb001,Jesse Crain,False,1.7
4,ANA201004050,61,7,9,0,1,4,3,0,0,0,mijaj001,fuenb001,Jose Mijares,False,0.7


In [4]:
# We already know if a pitcher lost a lead when, but we dont know if they were even entering into a save opportunity
# A Save Opportunity (SvO) happens when a relief pitcher enters the game with a lead
# Above, we didnt check to make sure they were in the lead when they entered the game
def save_sit(row):
    if row['HOME_TEAM'] == 1 and row['HOME_SCORE_CT'] > row['AWAY_SCORE_CT']:
        if IP > 0:
            init_score_diff = cur_row['HOME_SCORE_CT'] - cur_row['AWAY_SCORE_CT']
            # save #1 or #2 or #3
            if (init_score_diff <= 3 and inn_diff >= 1) or (IP >= 3) or (cur_row['HOME_SCORE_CT'] <= cur_row['AWAY_SCORE_CT'] + 2 + sum([cur_row['RUNNER_FIRST'], cur_row['RUNNER_SECOND'], cur_row['RUNNER_THIRD']])):
                return True 
        
    elif row['HOME_TEAM'] == 0 and row['HOME_SCORE_CT'] < row['AWAY_SCORE_CT']:
        if IP > 0:
            init_score_diff = cur_row['HOME_SCORE_CT'] - cur_row['AWAY_SCORE_CT']
            # save #1 or #2 or #3
            if (init_score_diff <= 3 and inn_diff >= 1) or (IP >= 3) or (cur_row['HOME_SCORE_CT'] <= cur_row['AWAY_SCORE_CT'] + 2 + sum([cur_row['RUNNER_FIRST'], cur_row['RUNNER_SECOND'], cur_row['RUNNER_THIRD']])):
                return True 
    else:
        return False
df_pitch_subs['SAVE_SIT'] = df_pitch_subs.apply(save_sit, axis=1)

# we know if a pitcher entered into a SvO, so let's disregard all the pitchers who didn't
# we know that all these pitchers are in SvOs and who got the save, we can drop some data that we don't need
df_pitch_subs = df_pitch_subs[df_pitch_subs['SAVE_SIT'] == True].drop(['SAVE_PIT_ID', 'SAVE_SIT'], axis=1).reset_index(drop=True)

df_pitch_subs.head()

Unnamed: 0,GAME_ID,EVENT_ID,INN_CT,TOTAL_INN,HOME_TEAM,OUTS_CT,HOME_SCORE_CT,AWAY_SCORE_CT,RUNNER_FIRST,RUNNER_SECOND,RUNNER_THIRD,PIT_ID,FULL_NAME,made_save,IP
0,ANA201004050,54,7,9,1,0,4,3,0,0,0,jepsk001,Kevin Jepsen,,1.0
1,ANA201004050,64,8,9,1,0,4,3,0,0,0,rodnf001,Fernando Rodney,,1.0
2,ANA201004050,74,9,9,1,0,6,3,0,0,0,fuenb001,Brian Fuentes,True,1.0
3,ANA201004060,64,7,9,0,2,3,5,0,1,0,duenb001,Brian Duensing,,0.3
4,ANA201004060,68,8,9,0,0,3,5,0,0,0,guerm001,Matt Guerrier,,1.0


In [5]:
def sit_id(row):
    # '<start_inn><start_out><score_diff><runners_on><IP>'
    score_diff = row['HOME_SCORE_CT'] - row['AWAY_SCORE_CT'] if row['HOME_TEAM'] else row['AWAY_SCORE_CT'] - row['HOME_SCORE_CT']
    runners_on = sum([row['RUNNER_FIRST'], row['RUNNER_SECOND'], row['RUNNER_THIRD']])
    return str(row['INN_CT']) + str(row['OUTS_CT']) + str(score_diff) + str(runners_on) + str(row['IP'])

df_pitch_subs['sit_id'] = df_pitch_subs.apply(sit_id, axis=1)

In [6]:
df_pitch_subs.head()

Unnamed: 0,GAME_ID,EVENT_ID,INN_CT,TOTAL_INN,HOME_TEAM,OUTS_CT,HOME_SCORE_CT,AWAY_SCORE_CT,RUNNER_FIRST,RUNNER_SECOND,RUNNER_THIRD,PIT_ID,FULL_NAME,made_save,IP,sit_id
0,ANA201004050,54,7,9,1,0,4,3,0,0,0,jepsk001,Kevin Jepsen,,1.0,70101.0
1,ANA201004050,64,8,9,1,0,4,3,0,0,0,rodnf001,Fernando Rodney,,1.0,80101.0
2,ANA201004050,74,9,9,1,0,6,3,0,0,0,fuenb001,Brian Fuentes,True,1.0,90301.0
3,ANA201004060,64,7,9,0,2,3,5,0,1,0,duenb001,Brian Duensing,,0.3,72210.3
4,ANA201004060,68,8,9,0,0,3,5,0,0,0,guerm001,Matt Guerrier,,1.0,80201.0


In [7]:
df_pitch_subs.groupby('sit_id').size()

sit_id
100100.3     1
100100.7     2
100101.0    68
100101.3     1
100102.0     2
100200.3     1
100201.0    26
100203.0     1
100221.0     1
100301.0    10
100400.7     1
100401.0     4
100420.3     1
100501.0     2
100731.0     1
100801.0     1
101110.0     1
101110.3     1
101120.3     1
101120.7     2
101221.7     1
101320.3     1
101320.7     1
102100.3     1
102120.3     2
102210.3     1
102420.3     1
102430.3     1
10315.0      1
110100.0     1
            ..
92320.3      9
92330.3      1
92400.3      4
92410.0      2
92410.3     21
92420.0      2
92420.3     54
92430.3      4
92500.3      3
92510.0      2
92510.3      9
92520.0      2
92520.3     13
92530.0      1
92530.3      4
92600.3      2
92610.3      3
92620.0      1
92620.3      5
92630.3      5
92700.3      3
92710.3      1
92720.3      3
92730.3      2
92810.3      1
92820.3      3
92900.3      1
92910.3      1
92920.3      1
92930.3      1
Length: 2119, dtype: int64