In [1]:
import pandas as pd
import statsmodels.api as sm

In [2]:
cp_df = pd.read_csv("cp_merged.csv")

In [3]:
cp_df.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,1,1,0,True,white,white
1,0-0,1,1,0,False,white,white
2,1-1,1,1,0,True,white,white
3,1-2,1,1,0,False,white,white
4,0-0,1,1,0,False,white,white


In [4]:
# Function to prepare called-pitches dataframe for modelling
# Output: X, y
# X is a dataframe of observations of the input variables
# y is a series of observations of the response variable
# Function may take the better part of a minute to run (~37 secs on my CPU)
# This is almost entirely due to the line which creates the UPM column
def prepare_df(df):
    # Drop counts where number of balls is > 3
    df = df[df['count'].apply(lambda x: x[0]!='4')]
    
    # Convert strike-given-called to int
    df['strike_given_called'] = df['strike_given_called'].apply(int)
    
    # Make column to check when umpire's race matches pitcher's race
    # (1 if match, 0 if mismatch)
    df['upm'] = df.apply(lambda x: x.pitcher_race==x.umpire_race, axis=1).apply(int)
    
    # Drop the race-value columns
    df = df.drop(labels=['pitcher_race', 'umpire_race'], axis=1)
    
    # Convert innings greater than the ninth to 9
    df.inning = df.inning.apply(lambda x: min(x,9))
    
    # Turn counts and innings into dummy variables
    df = pd.get_dummies(df, columns=['count', 'inning'], drop_first=True)
    
    # Rearrange and rename columns
    new_cols = ['strike_given_called', 'upm', 'home_pitcher', 'run_diff',
            'count_0-1', 'count_0-2', 'count_1-0', 'count_1-1', 'count_1-2', 'count_2-0', 'count_2-1', 'count_2-2',
           'count_3-0', 'count_3-1', 'count_3-2',
           'inning_2', 'inning_3', 'inning_4', 'inning_5', 'inning_6', 'inning_7', 'inning_8', 'inning_9']
    df = df[new_cols]
    df = df.rename(columns={'inning_9': 'inning_9+'})
    
    # Add intercept column
    df['intercept'] = 1
    
    # Return X dataframe, y series
    return df.drop(labels=['strike_given_called'], axis=1), df['strike_given_called'] 

In [5]:
%time X, y = prepare_df(cp_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


CPU times: user 38.5 s, sys: 469 ms, total: 38.9 s
Wall time: 38.5 s


In [6]:
result = sm.Logit(y,X).fit()

Optimization terminated successfully.
         Current function value: 0.580795
         Iterations 6


In [7]:
print(result.summary())

                            Logit Regression Results                           
Dep. Variable:     strike_given_called   No. Observations:              1136496
Model:                           Logit   Df Residuals:                  1136473
Method:                            MLE   Df Model:                           22
Date:                 Sat, 06 Jul 2019   Pseudo R-squ.:                 0.08250
Time:                         11:02:19   Log-Likelihood:            -6.6007e+05
converged:                        True   LL-Null:                   -7.1942e+05
                                         LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
upm              0.0051      0.004      1.177      0.239      -0.003       0.014
home_pitcher     0.0445      0.004     10.536      0.000       0.036       0.053
run_diff         0.0151      0.001  