In [1]:
import pandas as pd
import numpy as np

# Tools for recursive feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Tools for fitting logistic regression and getting p-values
import statsmodels.api as sm

# For plotting
import matplotlib.pyplot as plt

# Our local, useful functions
from baseball_utils import *

In [2]:
cp_df = pd.read_csv("cp_merged.csv")
cp_df.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [3]:
%time X, y = prepare_df(cp_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['strike_given_called'] = df['strike_given_called'].apply(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['upm'] = df.apply(lambda x: x.pitcher_race==x.umpire_race, axis=1).apply(int)


CPU times: user 37.6 s, sys: 339 ms, total: 37.9 s
Wall time: 37.3 s


In [4]:
%time fit = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.580795
         Iterations 6
CPU times: user 13.7 s, sys: 3.23 s, total: 16.9 s
Wall time: 9.01 s


In [19]:
fit.predict([0] + X.mean()[1:].tolist())[0]

0.30296134225978794

In [7]:
fit.summary()

0,1,2,3
Dep. Variable:,strike_given_called,No. Observations:,1136496.0
Model:,Logit,Df Residuals:,1136473.0
Method:,MLE,Df Model:,22.0
Date:,"Tue, 09 Jul 2019",Pseudo R-squ.:,0.0825
Time:,17:40:40,Log-Likelihood:,-660070.0
converged:,True,LL-Null:,-719420.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
upm,0.0051,0.004,1.177,0.239,-0.003,0.014
home_pitcher,0.0445,0.004,10.536,0.000,0.036,0.053
run_diff,0.0151,0.001,21.209,0.000,0.014,0.017
count_0-1,-1.1167,0.007,-156.696,0.000,-1.131,-1.103
count_0-2,-2.2604,0.014,-156.204,0.000,-2.289,-2.232
count_1-0,-0.1644,0.007,-25.165,0.000,-0.177,-0.152
count_1-1,-0.9228,0.008,-116.058,0.000,-0.938,-0.907
count_1-2,-2.0133,0.012,-169.233,0.000,-2.037,-1.990
count_2-0,0.0382,0.010,3.709,0.000,0.018,0.058


In [24]:
def upm_effects(fit):
    # Find predicted value
    pred = fit.predict([0] + X.mean()[1:].tolist())[0]
    
    return (fit.params[0]*(pred - pred**2))

In [25]:
upm_effects(fit)

0.0010851791079812992

## Effects for white umpire

In [26]:
white_umpires = cp_df[cp_df.umpire_race == 'white']

In [36]:
%time X_white, y_white = prepare_df(white_umpires)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['strike_given_called'] = df['strike_given_called'].apply(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['upm'] = df.apply(lambda x: x.pitcher_race==x.umpire_race, axis=1).apply(int)


CPU times: user 34.3 s, sys: 393 ms, total: 34.7 s
Wall time: 34.2 s


In [37]:
fit_white = sm.Logit(y_white, X_white).fit()

Optimization terminated successfully.
         Current function value: 0.581129
         Iterations 6


In [38]:
fit_white.summary()

0,1,2,3
Dep. Variable:,strike_given_called,No. Observations:,1011460.0
Model:,Logit,Df Residuals:,1011437.0
Method:,MLE,Df Model:,22.0
Date:,"Tue, 09 Jul 2019",Pseudo R-squ.:,0.08224
Time:,18:04:50,Log-Likelihood:,-587790.0
converged:,True,LL-Null:,-640460.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
upm,0.0053,0.005,1.091,0.275,-0.004,0.015
home_pitcher,0.0456,0.004,10.181,0.000,0.037,0.054
run_diff,0.0158,0.001,20.676,0.000,0.014,0.017
count_0-1,-1.1127,0.008,-147.460,0.000,-1.128,-1.098
count_0-2,-2.2577,0.015,-147.383,0.000,-2.288,-2.228
count_1-0,-0.1649,0.007,-23.815,0.000,-0.178,-0.151
count_1-1,-0.9210,0.008,-109.326,0.000,-0.938,-0.904
count_1-2,-2.0140,0.013,-159.614,0.000,-2.039,-1.989
count_2-0,0.0349,0.011,3.195,0.001,0.013,0.056


In [39]:
upm_effects(fit_white)

0.0011165217354785766

## Effects for black umpire

In [31]:
black_umpires = cp_df[cp_df.umpire_race == 'black']

In [32]:
%time X_black, y_black = prepare_df(black_umpires)

CPU times: user 2.35 s, sys: 27 ms, total: 2.38 s
Wall time: 2.32 s


In [40]:
fit_black = sm.Logit(y_black, X_black).fit()

Optimization terminated successfully.
         Current function value: 0.578704
         Iterations 6


In [41]:
fit_black.summary()

0,1,2,3
Dep. Variable:,strike_given_called,No. Observations:,70230.0
Model:,Logit,Df Residuals:,70207.0
Method:,MLE,Df Model:,22.0
Date:,"Tue, 09 Jul 2019",Pseudo R-squ.:,0.08512
Time:,18:05:02,Log-Likelihood:,-40642.0
converged:,True,LL-Null:,-44424.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
upm,-0.0675,0.034,-1.968,0.049,-0.135,-0.000
home_pitcher,-0.0046,0.017,-0.268,0.789,-0.038,0.029
run_diff,0.0075,0.003,2.749,0.006,0.002,0.013
count_0-1,-1.1575,0.029,-39.992,0.000,-1.214,-1.101
count_0-2,-2.2820,0.059,-38.712,0.000,-2.397,-2.166
count_1-0,-0.1475,0.026,-5.613,0.000,-0.199,-0.096
count_1-1,-0.9792,0.032,-30.320,0.000,-1.042,-0.916
count_1-2,-2.0222,0.048,-42.087,0.000,-2.116,-1.928
count_2-0,0.0759,0.042,1.820,0.069,-0.006,0.158


In [42]:
upm_effects(fit_black)

-0.014262807483721099

## Effects for hispanic umpire

In [43]:
hispanic_umpire = cp_df[cp_df.umpire_race == 'hispanic']

In [44]:
%time X_hispanic, y_hispanic = prepare_df(hispanic_umpire)

CPU times: user 1.8 s, sys: 25.6 ms, total: 1.83 s
Wall time: 1.84 s


In [46]:
fit_hispanic = sm.Logit(y_hispanic, X_hispanic).fit()

Optimization terminated successfully.
         Current function value: 0.576493
         Iterations 6


In [47]:
upm_effects(fit_hispanic)

-0.0008806338471457959

## Effects for nonwhite umpire

In [48]:
def white_or_not(r):
    if r == 'white':
        return 'white'
    else:
        return 'nonwhite'

In [49]:
w_nw_df = cp_df

In [50]:
w_nw_df.umpire_race = w_nw_df.umpire_race.apply(white_or_not)

In [51]:
w_nw_df.pitcher_race = w_nw_df.pitcher_race.apply(white_or_not)

In [52]:
nonwhite_umpire = w_nw_df[w_nw_df.umpire_race == 'nonwhite']
nonwhite_umpire.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
2642,0-0,1,1,0,False,white,nonwhite
2643,1-0,1,1,0,True,white,nonwhite
2644,1-1,1,1,0,False,white,nonwhite
2645,0-1,1,1,0,False,white,nonwhite
2646,0-0,1,1,0,False,white,nonwhite


In [53]:
%time X_nonwhite, y_nonwhite = prepare_df(nonwhite_umpire)

CPU times: user 4.38 s, sys: 44.1 ms, total: 4.43 s
Wall time: 4.19 s


In [54]:
fit_nonwhite = sm.Logit(y_nonwhite, X_nonwhite).fit()

Optimization terminated successfully.
         Current function value: 0.577946
         Iterations 6


In [56]:
fit_nonwhite.summary()

0,1,2,3
Dep. Variable:,strike_given_called,No. Observations:,125036.0
Model:,Logit,Df Residuals:,125013.0
Method:,MLE,Df Model:,22.0
Date:,"Tue, 09 Jul 2019",Pseudo R-squ.:,0.08485
Time:,18:10:21,Log-Likelihood:,-72264.0
converged:,True,LL-Null:,-78964.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
upm,-0.0237,0.014,-1.708,0.088,-0.051,0.004
home_pitcher,0.0361,0.013,2.827,0.005,0.011,0.061
run_diff,0.0106,0.002,5.250,0.000,0.007,0.015
count_0-1,-1.1499,0.022,-53.039,0.000,-1.192,-1.107
count_0-2,-2.2840,0.044,-51.758,0.000,-2.370,-2.197
count_1-0,-0.1603,0.020,-8.129,0.000,-0.199,-0.122
count_1-1,-0.9375,0.024,-38.962,0.000,-0.985,-0.890
count_1-2,-2.0080,0.036,-56.258,0.000,-2.078,-1.938
count_2-0,0.0661,0.031,2.119,0.034,0.005,0.127


In [55]:
upm_effects(fit_nonwhite)

-0.005011480822723513