In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
# Function to prepare called-pitches dataframe for modelling
# Output: X, y
# X is a dataframe of observations of the input variables
# y is a series of observations of the response variable
# Function may take the better part of a minute to run (~37 secs on my CPU)
# This is almost entirely due to the line which creates the UPM column
def prepare_df(df):
    # Drop counts where number of balls is > 3
    df = df[df['count'].apply(lambda x: x[0]!='4')]
    
    # Convert strike-given-called to int
    df['strike_given_called'] = df['strike_given_called'].apply(int)
    
    # Make column to check when umpire's race matches pitcher's race
    # (1 if match, 0 if mismatch)
    df['upm'] = df.apply(lambda x: x.pitcher_race==x.umpire_race, axis=1).apply(int)
    
    # Drop the race-value columns
    df = df.drop(labels=['pitcher_race', 'umpire_race'], axis=1)
    
    # Convert innings greater than the ninth to 9
    df.inning = df.inning.apply(lambda x: min(x,9))
    
    # Turn counts and innings into dummy variables
    df = pd.get_dummies(df, columns=['count', 'inning'], drop_first=True)
    
    # Rearrange and rename columns
    new_cols = ['strike_given_called', 'upm', 'home_pitcher', 'run_diff',
            'count_0-1', 'count_0-2', 'count_1-0', 'count_1-1', 'count_1-2', 'count_2-0', 'count_2-1', 'count_2-2',
           'count_3-0', 'count_3-1', 'count_3-2',
           'inning_2', 'inning_3', 'inning_4', 'inning_5', 'inning_6', 'inning_7', 'inning_8', 'inning_9']
    df = df[new_cols]
    df = df.rename(columns={'inning_9': 'inning_9+'})
    
    # Add intercept column
    df['intercept'] = 1
    
    # Return X dataframe, y series
    return df.drop(labels=['strike_given_called'], axis=1), df['strike_given_called'] 

In [3]:
cp_df = pd.read_csv("cp_merged.csv")

In [4]:
cp_df.umpire_race.value_counts()

white       1011465
black         70230
hispanic      54806
Name: umpire_race, dtype: int64

## Effects of UPM when Umpire is white

In [5]:
cp_df.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [6]:
white_umpire = cp_df[cp_df.umpire_race == 'white']
white_umpire.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [7]:
%time X, y = prepare_df(white_umpire)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


CPU times: user 34 s, sys: 312 ms, total: 34.3 s
Wall time: 33.9 s


In [8]:
result = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.581129
         Iterations 6


In [9]:
print(result.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:              1011460
Model:                           Logit   Df Residuals:                  1011437
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08224
Time:                         10:52:27   Log-Likelihood:            -5.8779e+05
converged:                        True   LL-Null:                   -6.4046e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm              0.0053      0.005      1.091      0.275      -0.004       0.015
home_pitcher     0.0456      0.004     10.181      0.000       0.037       0.054
run_diff         0.0158      0.001  

## Fixed effects when umpire is black

In [10]:
black_umpire = cp_df[cp_df.umpire_race == 'black']
black_umpire.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
5274,0-0,1,1,0,False,white,black
5275,1-1,1,1,0,True,white,black
5276,1-2,1,1,0,False,white,black
5277,0-1,1,1,0,False,white,black
5278,1-2,1,1,0,False,white,black


In [11]:
X, y = prepare_df(black_umpire)

In [12]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.578704
         Iterations 6


In [13]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:                70230
Model:                           Logit   Df Residuals:                    70207
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08512
Time:                         10:52:41   Log-Likelihood:                -40642.
converged:                        True   LL-Null:                       -44424.
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0675      0.034     -1.968      0.049      -0.135      -0.000
home_pitcher    -0.0046      0.017     -0.268      0.789      -0.038       0.029
run_diff         0.0075      0.003  

## Fixed effects when the umpire is hispanic

In [14]:
hispanic_umpire = cp_df[cp_df.umpire_race == 'hispanic']
hispanic_umpire.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
2642,0-0,1,1,0,False,white,hispanic
2643,1-0,1,1,0,True,white,hispanic
2644,1-1,1,1,0,False,white,hispanic
2645,0-1,1,1,0,False,white,hispanic
2646,0-0,1,1,0,False,white,hispanic


In [15]:
%time X, y = prepare_df(hispanic_umpire)

CPU times: user 1.85 s, sys: 21.1 ms, total: 1.87 s
Wall time: 1.89 s


In [16]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.576493
         Iterations 6


In [17]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:                54806
Model:                           Logit   Df Residuals:                    54783
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08525
Time:                         10:52:48   Log-Likelihood:                -31595.
converged:                        True   LL-Null:                       -34540.
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0042      0.024     -0.177      0.860      -0.051       0.042
home_pitcher     0.0860      0.019      4.454      0.000       0.048       0.124
run_diff         0.0138      0.003  

In [18]:
cp_df.pitcher_race.value_counts()

white       792601
hispanic    242264
black        72007
asian        29629
Name: pitcher_race, dtype: int64

## Fixed effects when pitcher is white

In [19]:
white_pitcher = cp_df[cp_df.pitcher_race == 'white']

In [20]:
white_pitcher.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [21]:
%time X, y = prepare_df(white_pitcher)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


CPU times: user 27.2 s, sys: 284 ms, total: 27.5 s
Wall time: 27.1 s


In [22]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.580647
         Iterations 6


In [23]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:               792597
Model:                           Logit   Df Residuals:                   792574
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08288
Time:                         10:53:33   Log-Likelihood:            -4.6022e+05
converged:                        True   LL-Null:                   -5.0181e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm              0.0038      0.008      0.476      0.634      -0.012       0.020
home_pitcher     0.0425      0.005      8.412      0.000       0.033       0.052
run_diff         0.0145      0.001  

## Fixed effects when pitcher is hispanic

In [24]:
hispanic_pitcher = cp_df[cp_df.pitcher_race == 'hispanic']
hispanic_pitcher.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
123,0-0,0,8,-5,False,hispanic,white
124,1-0,0,8,-5,True,hispanic,white
125,1-2,0,8,-5,False,hispanic,white
126,2-2,0,8,-5,True,hispanic,white
127,0-0,0,8,-5,False,hispanic,white


In [25]:
%time X, y = prepare_df(hispanic_pitcher)

CPU times: user 8.4 s, sys: 75.3 ms, total: 8.48 s
Wall time: 8.23 s


In [26]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.579984
         Iterations 6


In [27]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:               242264
Model:                           Logit   Df Residuals:                   242241
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08271
Time:                         10:53:51   Log-Likelihood:            -1.4051e+05
converged:                        True   LL-Null:                   -1.5318e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0115      0.022     -0.536      0.592      -0.054       0.031
home_pitcher     0.0447      0.009      4.888      0.000       0.027       0.063
run_diff         0.0147      0.002  

## Fixed effects when pitcher is black

In [28]:
black_pitcher = cp_df[cp_df.pitcher_race == 'black']
black_pitcher.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
674,0-0,1,9,0,True,black,white
675,0-1,1,9,0,True,black,white
676,0-2,1,9,0,True,black,white
677,0-0,1,9,0,False,black,white
678,1-1,1,9,0,False,black,white


In [29]:
%time X, y = prepare_df(black_pitcher)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


CPU times: user 2.59 s, sys: 33.1 ms, total: 2.62 s
Wall time: 2.57 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [30]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.581704
         Iterations 6


In [31]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:                72006
Model:                           Logit   Df Residuals:                    71983
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.07681
Time:                         10:53:59   Log-Likelihood:                -41886.
converged:                        True   LL-Null:                       -45371.
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0397      0.034     -1.170      0.242      -0.106       0.027
home_pitcher     0.0547      0.017      3.248      0.001       0.022       0.088
run_diff         0.0191      0.003  

## Fixed effects when pitcher is Asian:
* We cannot examine the effects of whether or not an Asian pitcher matches the umpire on race, because none of the umpires in our data set were Asian.

# Matching on whiteness/non-whiteness

In [32]:
cp_df.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [33]:
def white_or_not(s):
    if s == 'white':
        return 'white'
    else:
        return 'nonwhite'

In [34]:
wcp_df = cp_df

In [35]:
wcp_df.pitcher_race = wcp_df.pitcher_race.apply(white_or_not)

In [36]:
wcp_df.umpire_race = wcp_df.umpire_race.apply(white_or_not)

In [37]:
%time X, y = prepare_df(wcp_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


CPU times: user 38.5 s, sys: 421 ms, total: 38.9 s
Wall time: 38.5 s


In [38]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.580796
         Iterations 6


In [39]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:              1136496
Model:                           Logit   Df Residuals:                  1136473
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08250
Time:                         10:56:37   Log-Likelihood:            -6.6007e+05
converged:                        True   LL-Null:                   -7.1942e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm              0.0036      0.004      0.805      0.421      -0.005       0.012
home_pitcher     0.0445      0.004     10.535      0.000       0.036       0.053
run_diff         0.0151      0.001  

## Fixed effects if umpire is nonwhite

In [40]:
nonwhite_umpire = wcp_df[wcp_df.umpire_race == 'nonwhite']
nonwhite_umpire.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
2642,0-0,1,1,0,False,white,nonwhite
2643,1-0,1,1,0,True,white,nonwhite
2644,1-1,1,1,0,False,white,nonwhite
2645,0-1,1,1,0,False,white,nonwhite
2646,0-0,1,1,0,False,white,nonwhite


In [41]:
%time X, y = prepare_df(nonwhite_umpire)

CPU times: user 4.42 s, sys: 40.1 ms, total: 4.46 s
Wall time: 4.23 s


In [42]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.577946
         Iterations 6


In [43]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:               125036
Model:                           Logit   Df Residuals:                   125013
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08485
Time:                         10:56:50   Log-Likelihood:                -72264.
converged:                        True   LL-Null:                       -78964.
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0237      0.014     -1.708      0.088      -0.051       0.004
home_pitcher     0.0361      0.013      2.827      0.005       0.011       0.061
run_diff         0.0106      0.002  

## Fixed effects when pitcher is nonwhite

In [44]:
nonwhite_pitcher = wcp_df[wcp_df.pitcher_race == 'nonwhite']
nonwhite_pitcher.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
123,0-0,0,8,-5,False,nonwhite,white
124,1-0,0,8,-5,True,nonwhite,white
125,1-2,0,8,-5,False,nonwhite,white
126,2-2,0,8,-5,True,nonwhite,white
127,0-0,0,8,-5,False,nonwhite,white


In [45]:
%time X, y = prepare_df(nonwhite_pitcher)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


CPU times: user 12 s, sys: 134 ms, total: 12.1 s
Wall time: 11.7 s


In [46]:
results = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.581064
         Iterations 6


In [47]:
print(results.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:               343899
Model:                           Logit   Df Residuals:                   343876
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08173
Time:                         10:57:07   Log-Likelihood:            -1.9983e+05
converged:                        True   LL-Null:                   -2.1761e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm             -0.0229      0.012     -1.862      0.063      -0.047       0.001
home_pitcher     0.0487      0.008      6.337      0.000       0.034       0.064
run_diff         0.0166      0.001  