In [1]:
import pandas as pd
import numpy as np

### Concatenate and transform pitches dataframes

In [2]:
games_2013 = pd.read_csv("data/games_2013.csv")
games_2013.head()

Unnamed: 0.1,Unnamed: 0,pitch,balls,strikes,count,batter,pitcher,umpire,home_pitcher,inning,run_diff
0,0,C,0,0,0-0,Ian Kinsler,Bud Norris,Sam Holbrook,1,1,0
1,1,X,0,1,0-1,Ian Kinsler,Bud Norris,Sam Holbrook,1,1,0
2,2,B,0,0,0-0,Elvis Andrus,Bud Norris,Sam Holbrook,1,1,0
3,3,S,1,0,1-0,Elvis Andrus,Bud Norris,Sam Holbrook,1,1,0
4,4,C,1,1,1-1,Elvis Andrus,Bud Norris,Sam Holbrook,1,1,0


In [3]:
games_2014 = pd.read_csv("data/games_2014.csv")
games_2014.head()

Unnamed: 0.1,Unnamed: 0,pitch,balls,strikes,count,batter,pitcher,umpire,home_pitcher,inning,run_diff
0,0,C,0,0,0-0,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
1,1,F,0,1,0-1,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
2,2,S,0,2,0-2,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
3,3,B,0,0,0-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
4,4,B,1,0,1-0,Justin Turner,Wade Miley,Tim Welke,1,1,0


In [4]:
games_2015 = pd.read_csv("data/games_2015.csv")

games_2015.head()

Unnamed: 0.1,Unnamed: 0,pitch,balls,strikes,count,batter,pitcher,umpire,home_pitcher,inning,run_diff
0,0,C,0,0,0-0,Matt Carpenter,Jon Lester,Mike Winters,1,1,0
1,1,S,0,1,0-1,Matt Carpenter,Jon Lester,Mike Winters,1,1,0
2,2,F,0,2,0-2,Matt Carpenter,Jon Lester,Mike Winters,1,1,0
3,3,B,0,2,0-2,Matt Carpenter,Jon Lester,Mike Winters,1,1,0
4,4,B,1,2,1-2,Matt Carpenter,Jon Lester,Mike Winters,1,1,0


In [5]:
# Combine games 2013 - 2015
# Drop unnecessary columns
pitches_df = pd.concat([games_2013, games_2014, games_2015])\
                .drop(columns=['Unnamed: 0', 'balls', 'strikes', 'batter'])

pitches_df.head()

Unnamed: 0,pitch,count,pitcher,umpire,home_pitcher,inning,run_diff
0,C,0-0,Bud Norris,Sam Holbrook,1,1,0
1,X,0-1,Bud Norris,Sam Holbrook,1,1,0
2,B,0-0,Bud Norris,Sam Holbrook,1,1,0
3,S,1-0,Bud Norris,Sam Holbrook,1,1,0
4,C,1-1,Bud Norris,Sam Holbrook,1,1,0


In [6]:
# Drop all rows where pitch is not a called pitch
pitches_df = pitches_df[(pitches_df.pitch == 'C') | (pitches_df.pitch == 'B')]
pitches_df.head()

Unnamed: 0,pitch,count,pitcher,umpire,home_pitcher,inning,run_diff
0,C,0-0,Bud Norris,Sam Holbrook,1,1,0
2,B,0-0,Bud Norris,Sam Holbrook,1,1,0
4,C,1-1,Bud Norris,Sam Holbrook,1,1,0
5,B,1-2,Bud Norris,Sam Holbrook,1,1,0
7,B,0-0,Bud Norris,Sam Holbrook,1,1,0


In [7]:
# Convert pitch column to True/False - whether pitch was a strike
# C = 'C'alled strike
# B = 'B'all (not strike)
pitches_df['strike_given_called'] = (pitches_df.pitch == 'C')
pitches_df.head()

Unnamed: 0,pitch,count,pitcher,umpire,home_pitcher,inning,run_diff,strike_given_called
0,C,0-0,Bud Norris,Sam Holbrook,1,1,0,True
2,B,0-0,Bud Norris,Sam Holbrook,1,1,0,False
4,C,1-1,Bud Norris,Sam Holbrook,1,1,0,True
5,B,1-2,Bud Norris,Sam Holbrook,1,1,0,False
7,B,0-0,Bud Norris,Sam Holbrook,1,1,0,False


In [8]:
# Drop pitch column
pitches_df = pitches_df.drop(columns=['pitch'])
pitches_df.head()

Unnamed: 0,count,pitcher,umpire,home_pitcher,inning,run_diff,strike_given_called
0,0-0,Bud Norris,Sam Holbrook,1,1,0,True
2,0-0,Bud Norris,Sam Holbrook,1,1,0,False
4,1-1,Bud Norris,Sam Holbrook,1,1,0,True
5,1-2,Bud Norris,Sam Holbrook,1,1,0,False
7,0-0,Bud Norris,Sam Holbrook,1,1,0,False


### Merge with pitcher data to get race of pitcher

In [9]:
# Get data from MLB census 2014
mlb_census_df = pd.read_csv("data/mlb_census.csv")
mlb_census_df.head()

Unnamed: 0,#,NAME,POS,POS (BROAD),BAT,THW,AGE,Height,WT,BIRTH CITY,BIRTH STATE,BIRTH COUNTRY,RACE,Experience,SCHOOL,SALARY,TEAM
0,13,Alex Avila,C,Catcher,L,R,27,71,210,"Hialeah, FL",FL,United States,Hispanic,5.0,Alabama,"$4,150,000",Detroit Tigers
1,30,David Robertson,RP,Pitcher,R,R,29,71,195,"Birmingham, AL",AL,United States,White,6.0,Alabama,"$5,215,000",New York Yankees
2,29,Tommy Hunter,RP,Pitcher,R,R,27,75,248,"Indianapolis, IN",IN,United States,White,6.0,Alabama,"$3,000,000",Orioles
3,20,Anthony Recker,C,Catcher,R,R,30,74,240,"Allentown, PA",PA,United States,White,3.0,Alvernia College,"$505,340",New York Mets
4,4,Nick Hundley,C,Catcher,R,R,30,73,196,"Corvallis, OR",OR,United States,White,6.0,Arizona,"$4,000,000",San Diego Padres


In [10]:
mlb_census_df = mlb_census_df[['NAME', 'RACE']].rename(columns={'NAME': 'pitcher_name', 'RACE': 'pitcher_race'})
mlb_census_df.head()

Unnamed: 0,pitcher_name,pitcher_race
0,Alex Avila,Hispanic
1,David Robertson,White
2,Tommy Hunter,White
3,Anthony Recker,White
4,Nick Hundley,White


In [11]:
mlb_census_df.pitcher_race = mlb_census_df.pitcher_race.apply(lambda x : x.lower())

In [12]:
mlb_census_df.head()

Unnamed: 0,pitcher_name,pitcher_race
0,Alex Avila,hispanic
1,David Robertson,white
2,Tommy Hunter,white
3,Anthony Recker,white
4,Nick Hundley,white


In [13]:
# Get researched data on other pitchers' races
missing_pitcher_df = pd.read_csv("data/pitcher_race.csv")
missing_pitcher_df.head()

Unnamed: 0,pitcher_name,pitcher_race
0,Aaron Sanchez,hispanic
1,Alex Burnett,hispanic
2,Alex Claudio,hispanic
3,Alex Sanabia,hispanic
4,Andy Pettitte,white


In [14]:
missing_pitcher_df.pitcher_race.value_counts()

white        455
black         91
hispanic      87
asian         14
hispanic       1
Name: pitcher_race, dtype: int64

In [15]:
# Fix an extra space in an entry
missing_pitcher_df.pitcher_race = missing_pitcher_df.pitcher_race.apply(lambda x : x.replace("hispanic ", "hispanic"))

In [16]:
all_pitchers = pd.concat([missing_pitcher_df, mlb_census_df])
all_pitchers.head()

Unnamed: 0,pitcher_name,pitcher_race
0,Aaron Sanchez,hispanic
1,Alex Burnett,hispanic
2,Alex Claudio,hispanic
3,Alex Sanabia,hispanic
4,Andy Pettitte,white


In [17]:
all_pitchers.pitcher_race.value_counts()

white       905
hispanic    302
black       162
asian        29
Name: pitcher_race, dtype: int64

In [18]:
# Perform left join on pitcher_name
# First, we must prepare
pitches_df = pitches_df.rename(columns={'pitcher': 'pitcher_name', 'umpire': 'umpire_name'})
pitches_df.head()

Unnamed: 0,count,pitcher_name,umpire_name,home_pitcher,inning,run_diff,strike_given_called
0,0-0,Bud Norris,Sam Holbrook,1,1,0,True
2,0-0,Bud Norris,Sam Holbrook,1,1,0,False
4,1-1,Bud Norris,Sam Holbrook,1,1,0,True
5,1-2,Bud Norris,Sam Holbrook,1,1,0,False
7,0-0,Bud Norris,Sam Holbrook,1,1,0,False


In [19]:
# We have to fix a character encoding problem
set(pitches_df.pitcher_name)

{'Felix\xa0Doubront',
 'Adam\xa0Loewen',
 'Steve\xa0Delabar',
 'Brandon\xa0McCarthy',
 'Shawn\xa0Tolleson',
 'Yohan\xa0Pino',
 'Jairo\xa0Diaz',
 'Jorge\xa0De\xa0La\xa0Rosa',
 'Trevor\xa0Cahill',
 'Cory\xa0Mazzoni',
 'Kenley\xa0Jansen',
 'Chris\xa0Rearick',
 'Anibal\xa0Sanchez',
 'Luis\xa0Severino',
 'Kyle\xa0Barraclough',
 'Zach\xa0Miner',
 'Archie\xa0Bradley',
 'Chaz\xa0Roe',
 'Victor\xa0Marte',
 'David\xa0Hernandez',
 'Shelby\xa0Miller',
 'Ethan\xa0Martin',
 'Jordan\xa0Lyles',
 'Lisalverto\xa0Bonilla',
 'Ryan\xa0Cook',
 'Tanner\xa0Roark',
 'Chris\xa0Resop',
 'Eric\xa0Surkamp',
 'Joel\xa0Hanrahan',
 'Hector\xa0Rondon',
 'Blaine\xa0Boyer',
 'Jarrett\xa0Grube',
 'Raisel\xa0Iglesias',
 'Kenny\xa0Roberts',
 'Travis\xa0Wood',
 'Ryan\xa0Weber',
 'Antonio\xa0Bastardo',
 'Pat\xa0McCoy',
 'Colton\xa0Murray',
 'Alex\xa0Wood',
 'Edinson\xa0Volquez',
 'Mike\xa0Adams',
 'David\xa0Phelps',
 'Andrew\xa0Romine',
 'Phil\xa0Hughes',
 'Luis\xa0Mendoza',
 'Zack\xa0Godley',
 'Matt\xa0West',
 'Heath\xa0Bel

In [20]:
pitches_df.pitcher_name = pitches_df.pitcher_name.apply(lambda x: x.replace('\xa0', ' '))

In [21]:
set(pitches_df.pitcher_name) - set(all_pitchers.pitcher_name)

{"Eric O'Flaherty", "Ryan O'Rourke", "Sean O'Sullivan"}

In [22]:
set(pitches_df.pitcher_name) - set(all_pitchers.pitcher_name.apply(lambda x: x.replace("OFlaherty", "O'Flaherty"))\
                                  .apply(lambda x: x.replace("ORourke", "O'Rourke"))\
                                  .apply(lambda x: x.replace("OSullivan", "O'Sullivan")))

set()

In [23]:
all_pitchers.pitcher_name = all_pitchers.pitcher_name.apply(lambda x: x.replace("OFlaherty", "O'Flaherty"))\
                                  .apply(lambda x: x.replace("ORourke", "O'Rourke"))\
                                  .apply(lambda x: x.replace("OSullivan", "O'Sullivan"))

In [24]:
merge1_df = pd.merge(pitches_df, all_pitchers, how='left', on='pitcher_name')
merge1_df.head()

Unnamed: 0,count,pitcher_name,umpire_name,home_pitcher,inning,run_diff,strike_given_called,pitcher_race
0,0-0,Bud Norris,Sam Holbrook,1,1,0,True,white
1,0-0,Bud Norris,Sam Holbrook,1,1,0,False,white
2,1-1,Bud Norris,Sam Holbrook,1,1,0,True,white
3,1-2,Bud Norris,Sam Holbrook,1,1,0,False,white
4,0-0,Bud Norris,Sam Holbrook,1,1,0,False,white


In [25]:
any(merge1_df.pitcher_race.isna())

False

### Merge with umpire data to get umpire race

In [26]:
# Load umpire data
umpire_df = pd.read_csv("data/umpire_race.csv")
umpire_df.head()

Unnamed: 0.1,Unnamed: 0,umpire_name,umpire_race
0,0,Jordan Baker,white
1,1,Lance Barksdale,white
2,2,Lance Barrett,white
3,3,Ted Barrett,white
4,4,Scott Barry,white


In [27]:
umpire_df.umpire_race.value_counts()

white       88
black        6
hispanic     6
Name: umpire_race, dtype: int64

In [28]:
umpire_df = umpire_df[['umpire_name', 'umpire_race']]
umpire_df.head()

Unnamed: 0,umpire_name,umpire_race
0,Jordan Baker,white
1,Lance Barksdale,white
2,Lance Barrett,white
3,Ted Barrett,white
4,Scott Barry,white


In [29]:
set(merge1_df.umpire_name) - set(umpire_df.umpire_name)

{"Brian O'Nora"}

In [30]:
umpire_df[umpire_df.umpire_name == 'Brian ONora']

Unnamed: 0,umpire_name,umpire_race
59,Brian ONora,white


In [31]:
umpire_df.umpire_name = umpire_df.umpire_name.apply(lambda x: x.replace("Brian ONora", "Brian O'Nora"))

In [32]:
merge2_df = pd.merge(merge1_df, umpire_df, how='left', on='umpire_name')
merge2_df.head()

Unnamed: 0,count,pitcher_name,umpire_name,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-0,Bud Norris,Sam Holbrook,1,1,0,True,white,white
1,0-0,Bud Norris,Sam Holbrook,1,1,0,False,white,white
2,1-1,Bud Norris,Sam Holbrook,1,1,0,True,white,white
3,1-2,Bud Norris,Sam Holbrook,1,1,0,False,white,white
4,0-0,Bud Norris,Sam Holbrook,1,1,0,False,white,white


In [33]:
merge2_df[merge2_df.umpire_race.isna()]

Unnamed: 0,count,pitcher_name,umpire_name,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race


## Create output dataframes
* `cp_merged.csv` is an older dataframe
* `cp_merged_ids.csv` contains randomized numbers for the umpires' names, which allows us to conduct tests for significance at a finer level. PLEASE NOTE that the purpose of this project is not to point fingers at anyone, or to suggest absolute causation in finding correlations between umpire/pitcher race and how pitches are called.

In [34]:
# Obfuscate umpire names for future use
# Drop pitcher names
umpire_names = merge2_df.umpire_name.unique()
perm = np.random.permutation(len(umpire_names))
obf = {x: perm[i] for i,x in enumerate(umpire_names)}
merge2_df.umpire_name = merge2_df.umpire_name.apply(lambda x: obf[x])

# Order the data so that randomized umpire ids are in ascending order
merge2_df = merge2_df.sort_values(by=['umpire_name'])

# Remove the pitcher names
merge2_df = merge2_df.drop(columns=['pitcher_name'])

# Reset index
merge2_df = merge2_df.reset_index(drop=True)

# Convert strike_given_called to integer to save storage space
merge2_df.strike_given_called = merge2_df.strike_given_called.apply(int)

merge2_df.head()

Unnamed: 0,count,umpire_name,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-1,0,0,4,0,0,white,white
1,0-1,0,0,1,0,0,white,white
2,0-0,0,1,2,0,1,white,white
3,0-1,0,1,2,0,1,white,white
4,0-2,0,1,2,0,0,white,white


In [35]:
merge2_df.to_csv("data/cp_merged_ids.csv", index=False)

In [36]:
# Drop extra column, prepare to write to file for original analysis notebook
merge3_df = merge2_df.drop(columns=['umpire_name'])

In [37]:
merge3_df.head()

Unnamed: 0,count,home_pitcher,inning,run_diff,strike_given_called,pitcher_race,umpire_race
0,0-1,0,4,0,0,white,white
1,0-1,0,1,0,0,white,white
2,0-0,1,2,0,1,white,white
3,0-1,1,2,0,1,white,white
4,0-2,1,2,0,0,white,white


In [38]:
# Write out csv
merge3_df.to_csv("data/cp_merged.csv", index=False)