In [307]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('pitches16.csv', index_col = 0)

In [None]:
# Don't know why the index col won't get read in as index
df.drop(['index'], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
# No pitch type data so drop, usually also no speed data or position data
df.dropna(subset=['pitch_type'], inplace = True)

# No position data for pitches missing speed either
df.dropna(subset=['release_speed'], inplace = True)

# Field shifts not important for scope of my project
df.drop(['if_fielding_alignment', 'of_fielding_alignment'], axis = 1, inplace = True)

# No release position means no zone info, therefore can't be used to predict pitch location
df.dropna(subset=['release_pos_x'], inplace = True)

In [None]:
# Start dropping unneeded columns
df.drop(['spin_dir', 'release_pos_x', 'release_pos_z', 'spin_rate_deprecated', 'break_angle_deprecated', 'home_team', 
         'away_team', 'hit_location', 'game_year', 'pfx_x', 'pfx_z', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 
         'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'hit_distance_sc', 
         'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk'], 
        axis =1 , inplace = True)

# Start iloc dropping bulk clustered unneeded columns at the end
df = df.iloc[:,:-4]

# Keep dropping unneeded columns
df.drop(['pitcher.1', 'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9','estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
       'launch_speed_angle', 'home_score', 'away_score'], 
        axis =1 , inplace = True)

In [None]:
df.fillna({'on_3b':0, 'on_2b': 0, 'on_1b':0}, inplace = True)

# Batted ball in play not relevant to scope of capstone
df.drop(['bb_type'], axis = 1, inplace = True)

# Description is too specific for purpose of capstone
df.drop(['des'], axis = 1, inplace = True)

df.drop(['break_length_deprecated'], axis = 1, inplace = True)

df['description'].value_counts()

# Description is good enough for my purposes
df.drop(['events'], axis = 1, inplace = True)

In [None]:
# All from the same game (weird), just drop

df[df['pitch_name'].isna()]

In [None]:
df.dropna(subset=['pitch_name'], inplace = True)

In [None]:
# All NaN's filled in
df.isna().sum()

In [None]:
df.reset_index(drop = True, inplace = True)

# Turn into binary
df['on_3b'] = np.where(df['on_3b'] == 0, 0, 1)
df['on_2b'] = np.where(df['on_2b'] == 0, 0, 1)
df['on_1b'] = np.where(df['on_1b'] == 0, 0, 1)

# Binarize regular season/playoff games
df['reg_season'] = np.where(df['game_type'] == 'R', 1, 0)

df['post_season'] = np.where(np.isin(df['game_type'], ['F','D','L','W']), 1, 0)

# Drop the original game types columns now
df.drop(['game_type'], axis = 1, inplace = True)

# Binarize pitcher handedness and batter handedness
df['p_left'] = np.where(df['p_throws']=='L', 1, 0)
df['p_right'] = np.where(df['p_throws']=='R', 1, 0)

df['bat_left'] = np.where(df['stand']=='L', 1, 0)
df['bat_right'] = np.where(df['stand']=='R', 1, 0)

print(df['p_right'].value_counts())
print(df['p_left'].value_counts())
print(df['bat_left'].value_counts())
print(df['bat_right'].value_counts())

In [None]:
# Drop the original column now
df.drop(['p_throws'], axis = 1, inplace = True)

df.drop(['stand'], axis = 1, inplace = True)

df.drop(['release_pos_y'], axis = 1, inplace = True)

In [None]:
# df.drop(['at_bat_number'], axis = 1, inplace = True)

In [None]:
# Sort by date and within date, sort by player name and then sort by chronological order of in game at bats and pitch number
df.sort_values(by = ['game_date', 'player_name', 'at_bat_number', 'pitch_number'], inplace = True, ignore_index = True)

In [None]:
df.head(25)[['balls', 'strikes', 'at_bat_number']]

In [None]:
df.to_csv('pitches_clean16.csv')

***

In [None]:
df = pd.read_csv('pitches13-15.csv', index_col = 0)

In [None]:
# Don't know why the index col won't get read in as index
df.drop(['index'], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
df['game_date'].sort_values()

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
# No pitch type data so drop, usually also no speed data or position data
df.dropna(subset=['pitch_type'], inplace = True)

# No position data for pitches missing speed either
df.dropna(subset=['release_speed'], inplace = True)

# Field shifts not important for scope of my project
df.drop(['if_fielding_alignment', 'of_fielding_alignment'], axis = 1, inplace = True)

# No release position means no zone info, therefore can't be used to predict pitch location
df.dropna(subset=['release_pos_x'], inplace = True)

In [None]:
# Start dropping unneeded columns
df.drop(['spin_dir', 'release_pos_x', 'release_pos_z', 'spin_rate_deprecated', 'break_angle_deprecated', 'home_team', 
         'away_team', 'hit_location', 'game_year', 'pfx_x', 'pfx_z', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 
         'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'hit_distance_sc', 
         'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk'], 
        axis =1 , inplace = True)

# Start iloc dropping bulk clustered unneeded columns at the end
df = df.iloc[:,:-4]

# Keep dropping unneeded columns
df.drop(['pitcher.1', 'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9','estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
       'launch_speed_angle', 'home_score', 'away_score'], 
        axis =1 , inplace = True)

In [None]:
df.fillna({'on_3b':0, 'on_2b': 0, 'on_1b':0}, inplace = True)

# Batted ball in play not relevant to scope of capstone
df.drop(['bb_type'], axis = 1, inplace = True)

# Description is too specific for purpose of capstone
df.drop(['des'], axis = 1, inplace = True)

df.drop(['break_length_deprecated'], axis = 1, inplace = True)

df['description'].value_counts()

# Description is good enough for my purposes
df.drop(['events'], axis = 1, inplace = True)

In [None]:
# All from the same game (weird), just drop

df[df['pitch_name'].isna()]

In [None]:
df.dropna(subset=['pitch_name'], inplace = True)

In [None]:
# All NaN's filled in
df.isna().sum()

In [None]:
df.reset_index(drop = True, inplace = True)

# Turn into binary
df['on_3b'] = np.where(df['on_3b'] == 0, 0, 1)
df['on_2b'] = np.where(df['on_2b'] == 0, 0, 1)
df['on_1b'] = np.where(df['on_1b'] == 0, 0, 1)

# Binarize regular season/playoff games
df['reg_season'] = np.where(df['game_type'] == 'R', 1, 0)

df['post_season'] = np.where(np.isin(df['game_type'], ['F','D','L','W']), 1, 0)

# Drop the original game types columns now
df.drop(['game_type'], axis = 1, inplace = True)

# Binarize pitcher handedness and batter handedness
df['p_left'] = np.where(df['p_throws']=='L', 1, 0)
df['p_right'] = np.where(df['p_throws']=='R', 1, 0)

df['bat_left'] = np.where(df['stand']=='L', 1, 0)
df['bat_right'] = np.where(df['stand']=='R', 1, 0)

print(df['p_right'].value_counts())
print(df['p_left'].value_counts())
print(df['bat_left'].value_counts())
print(df['bat_right'].value_counts())

In [None]:
# Drop the original column now
df.drop(['p_throws'], axis = 1, inplace = True)

df.drop(['stand'], axis = 1, inplace = True)

df.drop(['release_pos_y'], axis = 1, inplace = True)

In [None]:
# df.drop(['at_bat_number'], axis = 1, inplace = True)

In [None]:
# Sort by date and within date, sort by player name and then sort by chronological order of in game at bats and pitch number
df.sort_values(by = ['game_date', 'player_name', 'at_bat_number', 'pitch_number'], inplace = True, ignore_index = True)

In [None]:
df.head(25)[['balls', 'strikes', 'at_bat_number']]

In [None]:
# Statcast only began in 2015 it seems
df['game_date'].sort_values()

In [None]:
df.to_csv('pitches_clean15.csv')

***

In [None]:
df = pd.read_csv('pitches.csv', index_col = 0)

In [None]:
# Don't know why the index col won't get read in as index
df.drop(['index'], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
# No pitch type data so drop, usually also no speed data or position data
df.dropna(subset=['pitch_type'], inplace = True)

# No position data for pitches missing speed either
df.dropna(subset=['release_speed'], inplace = True)

# Field shifts not important for scope of my project
df.drop(['if_fielding_alignment', 'of_fielding_alignment'], axis = 1, inplace = True)

# No release position means no zone info, therefore can't be used to predict pitch location
df.dropna(subset=['release_pos_x'], inplace = True)

In [None]:
# Start dropping unneeded columns
df.drop(['spin_dir', 'release_pos_x', 'release_pos_z', 'spin_rate_deprecated', 'break_angle_deprecated', 'home_team', 
         'away_team', 'hit_location', 'game_year', 'pfx_x', 'pfx_z', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 
         'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'hit_distance_sc', 
         'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk'], 
        axis =1 , inplace = True)

# Start iloc dropping bulk clustered unneeded columns at the end
df = df.iloc[:,:-4]

# Keep dropping unneeded columns
df.drop(['pitcher.1', 'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9','estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
       'launch_speed_angle', 'home_score', 'away_score'], 
        axis =1 , inplace = True)

In [None]:
df.fillna({'on_3b':0, 'on_2b': 0, 'on_1b':0}, inplace = True)

# Batted ball in play not relevant to scope of capstone
df.drop(['bb_type'], axis = 1, inplace = True)

# Description is too specific for purpose of capstone
df.drop(['des'], axis = 1, inplace = True)

df.drop(['break_length_deprecated'], axis = 1, inplace = True)

df['description'].value_counts()

# Description is good enough for my purposes
df.drop(['events'], axis = 1, inplace = True)

In [None]:
# All from the same game (weird), just drop

df[df['pitch_name'].isna()]

In [None]:
df.dropna(subset=['pitch_name'], inplace = True)

In [None]:
# All NaN's filled in
df.isna().sum()

In [None]:
df.reset_index(drop = True, inplace = True)

# Turn into binary
df['on_3b'] = np.where(df['on_3b'] == 0, 0, 1)
df['on_2b'] = np.where(df['on_2b'] == 0, 0, 1)
df['on_1b'] = np.where(df['on_1b'] == 0, 0, 1)

# Binarize regular season/playoff games
df['reg_season'] = np.where(df['game_type'] == 'R', 1, 0)

df['post_season'] = np.where(np.isin(df['game_type'], ['F','D','L','W']), 1, 0)

# Drop the original game types columns now
df.drop(['game_type'], axis = 1, inplace = True)

# Binarize pitcher handedness and batter handedness
df['p_left'] = np.where(df['p_throws']=='L', 1, 0)
df['p_right'] = np.where(df['p_throws']=='R', 1, 0)

df['bat_left'] = np.where(df['stand']=='L', 1, 0)
df['bat_right'] = np.where(df['stand']=='R', 1, 0)

print(df['p_right'].value_counts())
print(df['p_left'].value_counts())
print(df['bat_left'].value_counts())
print(df['bat_right'].value_counts())

In [None]:
# Drop the original column now
df.drop(['p_throws'], axis = 1, inplace = True)

df.drop(['stand'], axis = 1, inplace = True)

df.drop(['release_pos_y'], axis = 1, inplace = True)

In [None]:
# df.drop(['at_bat_number'], axis = 1, inplace = True)

In [None]:
# Sort by date and within date, sort by player name and then sort by chronological order of in game at bats and pitch number
df.sort_values(by = ['game_date', 'player_name', 'at_bat_number', 'pitch_number'], inplace = True, ignore_index = True)

In [None]:
df.head(25)[['balls', 'strikes', 'at_bat_number']]

In [None]:
df['game_date'].sort_values()

In [None]:
df.to_csv('pitches_clean17-19.csv')

***

## MERGING

In [322]:
pitch_1719 = pd.read_csv('pitches_clean17-19.csv', index_col=0)

pitch_1315 = pd.read_csv('pitches_clean13-15.csv', index_col=0)

pitch16 = pd.read_csv('pitches_clean16.csv', index_col = 0)

  mask |= (ar1 == a)


In [323]:
merged = pd.concat([pitch_1315, pitch16, pitch_1719], axis=0, ignore_index = True, sort = False)

In [324]:
merged['game_date'].sort_values()


0          2015-04-05
205        2015-04-05
204        2015-04-05
203        2015-04-05
202        2015-04-05
              ...    
3615228    2019-10-30
3615227    2019-10-30
3615226    2019-10-30
3615233    2019-10-30
3615436    2019-10-30
Name: game_date, Length: 3615437, dtype: object

In [325]:
# Order should be good to go as is given how they were concatenated
#merged.sort_values(by=['game_date'], inplace = True)
merged.reset_index(drop = True, inplace = True)
merged

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,description,zone,type,balls,...,pitch_number,pitch_name,bat_score,fld_score,reg_season,post_season,p_left,p_right,bat_left,bat_right
0,SI,2015-04-05,90.1,Adam Wainwright,451594.0,425794.0,ball,13.0,B,0.0,...,1.0,Sinker,0.0,1.0,1,0,0,1,1,0
1,FC,2015-04-05,88.1,Adam Wainwright,451594.0,425794.0,hit_into_play_no_out,5.0,X,1.0,...,2.0,Cutter,0.0,1.0,1,0,0,1,1,0
2,FF,2015-04-05,92.2,Adam Wainwright,624585.0,425794.0,foul,5.0,S,0.0,...,1.0,4-Seam Fastball,0.0,1.0,1,0,0,1,0,1
3,CU,2015-04-05,76.6,Adam Wainwright,624585.0,425794.0,blocked_ball,14.0,B,0.0,...,2.0,Curveball,0.0,1.0,1,0,0,1,0,1
4,FC,2015-04-05,89.9,Adam Wainwright,624585.0,425794.0,hit_into_play,9.0,X,1.0,...,4.0,Cutter,0.0,1.0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3615432,FF,2019-10-30,89.1,Zack Greinke,665742.0,425844.0,ball,11.0,B,0.0,...,1.0,4-Seam Fastball,1.0,2.0,0,1,0,1,1,0
3615433,CU,2019-10-30,71.1,Zack Greinke,665742.0,425844.0,swinging_strike,11.0,S,1.0,...,2.0,Curveball,1.0,2.0,0,1,0,1,1,0
3615434,CH,2019-10-30,88.3,Zack Greinke,665742.0,425844.0,ball,13.0,B,1.0,...,3.0,Changeup,1.0,2.0,0,1,0,1,1,0
3615435,CH,2019-10-30,88.0,Zack Greinke,665742.0,425844.0,ball,8.0,B,2.0,...,4.0,Changeup,1.0,2.0,0,1,0,1,1,0


In [326]:
merged.head(25)[['balls', 'strikes', 'description', 'at_bat_number']]

Unnamed: 0,balls,strikes,description,at_bat_number
0,0.0,0.0,ball,6.0
1,1.0,0.0,hit_into_play_no_out,6.0
2,0.0,0.0,foul,7.0
3,0.0,1.0,blocked_ball,7.0
4,1.0,2.0,hit_into_play,7.0
5,0.0,0.0,blocked_ball,8.0
6,1.0,0.0,swinging_strike,8.0
7,1.0,1.0,called_strike,8.0
8,1.0,2.0,blocked_ball,8.0
9,2.0,2.0,called_strike,8.0


In [327]:
# Not sure if drop yet
#merged.drop(['at_bat_number'], axis = 1, inplace = True)

In [328]:
merged.to_csv('pitches_finalv2.csv')

***

## Data Processing / Feature Engineering

In [329]:
df = pd.read_csv('pitches_finalv2.csv', index_col = 0)

  mask |= (ar1 == a)


In [330]:
# Start converting needless float columns to int to reduce file size
df['zone'] = df['zone'].astype('int')

df['balls'] = df['balls'].astype('int')
df['strikes'] = df['strikes'].astype('int')

df['bat_score'] = df['bat_score'].astype('int')
df['fld_score'] = df['fld_score'].astype('int')

df['pitch_number'] = df['pitch_number'].astype('int')
df['inning'] = df['inning'].astype('int')
df['outs_when_up'] = df['outs_when_up'].astype('int')


In [331]:
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,description,zone,type,balls,...,pitch_number,pitch_name,bat_score,fld_score,reg_season,post_season,p_left,p_right,bat_left,bat_right
0,SI,2015-04-05,90.1,Adam Wainwright,451594.0,425794.0,ball,13,B,0,...,1,Sinker,0,1,1,0,0,1,1,0
1,FC,2015-04-05,88.1,Adam Wainwright,451594.0,425794.0,hit_into_play_no_out,5,X,1,...,2,Cutter,0,1,1,0,0,1,1,0
2,FF,2015-04-05,92.2,Adam Wainwright,624585.0,425794.0,foul,5,S,0,...,1,4-Seam Fastball,0,1,1,0,0,1,0,1
3,CU,2015-04-05,76.6,Adam Wainwright,624585.0,425794.0,blocked_ball,14,B,0,...,2,Curveball,0,1,1,0,0,1,0,1
4,FC,2015-04-05,89.9,Adam Wainwright,624585.0,425794.0,hit_into_play,9,X,1,...,4,Cutter,0,1,1,0,0,1,0,1


In [333]:
# Create new column for prev pitch where it is same as pitch_name but shifted by down 1
df['prev_pitch'] = df['pitch_name'].shift(1)

# Fill in 'None' where there is a 0-0 count, otherwise leave alone
df['prev_pitch'] = np.where((df['balls'] == 0) & (df['strikes'] == 0), 'None', df['prev_pitch'])

## NOTE: This approach is inaccurate for the rare cases where a pitcher (or batter) is subbed out mid at-bat
## The above situation only occurs in case of injuries or ejections and are rare
## The best way I know to address/check for this condition would be a really long for loop (loop through 3 mil+ entries)
## Also could raise issues where rows have been dropped, causing 2 unrelated pitcher/batter combos to be adjacent
## But for the most part dropped rows came in batches (ie. same game, same year, same player etc)
## Elected to not worry about these outlier scenarios for now given how long the for loop takes to run (2hr+)

In [334]:
df

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,description,zone,type,balls,...,pitch_name,bat_score,fld_score,reg_season,post_season,p_left,p_right,bat_left,bat_right,prev_pitch
0,SI,2015-04-05,90.1,Adam Wainwright,451594.0,425794.0,ball,13,B,0,...,Sinker,0,1,1,0,0,1,1,0,
1,FC,2015-04-05,88.1,Adam Wainwright,451594.0,425794.0,hit_into_play_no_out,5,X,1,...,Cutter,0,1,1,0,0,1,1,0,Sinker
2,FF,2015-04-05,92.2,Adam Wainwright,624585.0,425794.0,foul,5,S,0,...,4-Seam Fastball,0,1,1,0,0,1,0,1,
3,CU,2015-04-05,76.6,Adam Wainwright,624585.0,425794.0,blocked_ball,14,B,0,...,Curveball,0,1,1,0,0,1,0,1,4-Seam Fastball
4,FC,2015-04-05,89.9,Adam Wainwright,624585.0,425794.0,hit_into_play,9,X,1,...,Cutter,0,1,1,0,0,1,0,1,Curveball
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3615432,FF,2019-10-30,89.1,Zack Greinke,665742.0,425844.0,ball,11,B,0,...,4-Seam Fastball,1,2,0,1,0,1,1,0,
3615433,CU,2019-10-30,71.1,Zack Greinke,665742.0,425844.0,swinging_strike,11,S,1,...,Curveball,1,2,0,1,0,1,1,0,4-Seam Fastball
3615434,CH,2019-10-30,88.3,Zack Greinke,665742.0,425844.0,ball,13,B,1,...,Changeup,1,2,0,1,0,1,1,0,Curveball
3615435,CH,2019-10-30,88.0,Zack Greinke,665742.0,425844.0,ball,8,B,2,...,Changeup,1,2,0,1,0,1,1,0,Changeup


In [335]:
pd.options.display.max_rows = 999
df.head(150)[['player_name', 'pitch_name', 'type', 'description', 'balls', 'strikes', 'prev_pitch']]

Unnamed: 0,player_name,pitch_name,type,description,balls,strikes,prev_pitch
0,Adam Wainwright,Sinker,B,ball,0,0,
1,Adam Wainwright,Cutter,X,hit_into_play_no_out,1,0,Sinker
2,Adam Wainwright,4-Seam Fastball,S,foul,0,0,
3,Adam Wainwright,Curveball,B,blocked_ball,0,1,4-Seam Fastball
4,Adam Wainwright,Cutter,X,hit_into_play,1,2,Curveball
5,Adam Wainwright,Curveball,B,blocked_ball,0,0,
6,Adam Wainwright,Cutter,S,swinging_strike,1,0,Curveball
7,Adam Wainwright,4-Seam Fastball,S,called_strike,1,1,Cutter
8,Adam Wainwright,Curveball,B,blocked_ball,1,2,4-Seam Fastball
9,Adam Wainwright,Cutter,S,called_strike,2,2,Curveball


In [336]:
# Drop now that they're unnecessary columns
df.drop(['batter', 'pitcher'], axis = 1, inplace = True)

In [337]:
df['pitch_type'].value_counts()

FF    1291593
SL     582418
FT     398644
CH     376784
SI     302925
CU     297748
FC     201485
KC      88897
FS      54731
KN      11505
IN       6233
EP        868
FO        842
PO        620
SC        113
UN         21
FA         10
Name: pitch_type, dtype: int64

In [338]:
# Redundant to pitch_name
df.drop(['pitch_type'], axis = 1, inplace = True)

In [339]:
df.dtypes

game_date         object
release_speed    float64
player_name       object
description       object
zone               int32
type              object
balls              int32
strikes            int32
plate_x          float64
plate_z          float64
on_3b              int64
on_2b              int64
on_1b              int64
outs_when_up       int32
inning             int32
sz_top           float64
sz_bot           float64
at_bat_number    float64
pitch_number       int32
pitch_name        object
bat_score          int32
fld_score          int32
reg_season         int64
post_season        int64
p_left             int64
p_right            int64
bat_left           int64
bat_right          int64
prev_pitch        object
dtype: object

In [340]:
# File size reduction: 90 MB
df.to_csv('pitches_finalv3.csv')

In [341]:
df = pd.read_csv('pitches_finalv3.csv', index_col = 0)

  mask |= (ar1 == a)


In [342]:
df.head()

Unnamed: 0,game_date,release_speed,player_name,description,zone,type,balls,strikes,plate_x,plate_z,...,pitch_name,bat_score,fld_score,reg_season,post_season,p_left,p_right,bat_left,bat_right,prev_pitch
0,2015-04-05,90.1,Adam Wainwright,ball,13,B,0,0,-1.585,1.72,...,Sinker,0,1,1,0,0,1,1,0,
1,2015-04-05,88.1,Adam Wainwright,hit_into_play_no_out,5,X,1,0,0.008,2.602,...,Cutter,0,1,1,0,0,1,1,0,Sinker
2,2015-04-05,92.2,Adam Wainwright,foul,5,S,0,0,0.015,2.287,...,4-Seam Fastball,0,1,1,0,0,1,0,1,
3,2015-04-05,76.6,Adam Wainwright,blocked_ball,14,B,0,1,1.703,0.22,...,Curveball,0,1,1,0,0,1,0,1,4-Seam Fastball
4,2015-04-05,89.9,Adam Wainwright,hit_into_play,9,X,1,2,0.788,1.808,...,Cutter,0,1,1,0,0,1,0,1,Curveball


In [343]:
# Simplify 4S and 2S Fastballs as just 'Fastball'
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Fastball"), 'Fastball', df['pitch_name'])

In [344]:
df.head()

Unnamed: 0,game_date,release_speed,player_name,description,zone,type,balls,strikes,plate_x,plate_z,...,pitch_name,bat_score,fld_score,reg_season,post_season,p_left,p_right,bat_left,bat_right,prev_pitch
0,2015-04-05,90.1,Adam Wainwright,ball,13,B,0,0,-1.585,1.72,...,Sinker,0,1,1,0,0,1,1,0,
1,2015-04-05,88.1,Adam Wainwright,hit_into_play_no_out,5,X,1,0,0.008,2.602,...,Cutter,0,1,1,0,0,1,1,0,Sinker
2,2015-04-05,92.2,Adam Wainwright,foul,5,S,0,0,0.015,2.287,...,Fastball,0,1,1,0,0,1,0,1,
3,2015-04-05,76.6,Adam Wainwright,blocked_ball,14,B,0,1,1.703,0.22,...,Curveball,0,1,1,0,0,1,0,1,4-Seam Fastball
4,2015-04-05,89.9,Adam Wainwright,hit_into_play,9,X,1,2,0.788,1.808,...,Cutter,0,1,1,0,0,1,0,1,Curveball


In [347]:
# Simplify 4S and 2S Fastballs again
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Fastball"), 'Fastball', df['prev_pitch'])

In [345]:
df['pitch_name'].value_counts()

Fastball            1690247
Slider               582418
Changeup             376784
Sinker               302925
Curveball            297748
Cutter               201485
Knuckle Curve         88897
Split Finger          54731
Knuckle Ball          11505
Intentional Ball       6233
Eephus                  868
Forkball                842
Pitch Out               620
Screwball               113
Unknown                  21
Name: pitch_name, dtype: int64

In [359]:
df['prev_pitch'].value_counts()

Fastball            1265178
None                 929952
Slider               428610
Changeup             273050
Curveball            227939
Sinker               221369
Cutter               148208
Knuckle Curve         67234
Split Finger          39105
Knuckle Ball           8485
Intentional Ball       4427
Eephus                  598
Forkball                595
Pitch Out               592
Screwball                81
Unknown                  14
Name: prev_pitch, dtype: int64

In [349]:
df.to_csv('pitches_final_fb.csv')

***

In [361]:
df = pd.read_csv('pitches_final_fb.csv',index_col= 0)

  mask |= (ar1 == a)


In [362]:
# Lots of different pitches
df['prev_pitch'].value_counts()

Fastball            1265178
None                 929952
Slider               428610
Changeup             273050
Curveball            227939
Sinker               221369
Cutter               148208
Knuckle Curve         67234
Split Finger          39105
Knuckle Ball           8485
Intentional Ball       4427
Eephus                  598
Forkball                595
Pitch Out               592
Screwball                81
Unknown                  14
Name: prev_pitch, dtype: int64

In [363]:
df[df['pitch_name'].str.contains("Fastball|Slider")]['pitch_name'].value_counts()

Fastball    1690247
Slider       582418
Name: pitch_name, dtype: int64

In [364]:
# Simplify classes to fastballs, off-speed, breaking balls, junk and other
# Classification based on https://en.wikipedia.org/wiki/Breaking_ball

# Add Cutters and Sinkers to fastballs
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Cutter|Sinker"), 'Fastball', df['pitch_name'])
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Cutter|Sinker"), 'Fastball', df['prev_pitch'])


# Offspeed
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Changeup|Split Finger|Forkball"), 'Off-speed', df['pitch_name'])
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Changeup|Split Finger|Forkball"), 'Off-speed', df['prev_pitch'])

# Breaking Balls
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Slider|Curveball|Knuckle Curve|Screwball"), 
                            'Breaking Ball', df['pitch_name'])
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Slider|Curveball|Knuckle Curve|Screwball"), 
                            'Breaking Ball', df['prev_pitch'])

# Junk
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Knuckle Ball|Eephus"), 'Junk', df['pitch_name'])
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Knuckle Ball|Eephus"), 'Junk', df['prev_pitch'])

# Other
df['pitch_name'] = np.where(df['pitch_name'].str.contains("Intentional Ball|Pitch Out|Unknown"), 'Other', df['pitch_name'])
df['prev_pitch'] = np.where(df['prev_pitch'].str.contains("Intentional Ball|Pitch Out|Unknown"), 'Other', df['prev_pitch'])

df['pitch_name'].value_counts()

Fastball         2194657
Breaking Ball     969176
Off-speed         432357
Junk               12373
Other               6874
Name: pitch_name, dtype: int64

In [365]:
df.to_csv('pitches_final_simplified.csv')