In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
batters_og = pd.read_csv('../data/processed/batters/2015_to_2019_batter_data_by_ab.csv', index_col=0)

In [3]:
batters_og.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,team
0,CU,2019-09-29,80.9,-1.3756,6.2551,Kaleb Cowart,592230,501789,field_out,hit_into_play,...,8,5,8,8,5,5,8,Infield shift,Standard,laa2019
1,FF,2019-09-29,93.5,-0.0852,6.6552,Brian Goodwin,571718,606965,force_out,hit_into_play,...,8,5,8,8,5,5,8,Infield shift,Standard,laa2019
2,CH,2019-09-29,82.4,-0.1186,6.6827,David Fletcher,664058,606965,single,hit_into_play_no_out,...,8,5,8,8,5,5,8,Standard,Standard,laa2019
3,FF,2019-09-29,91.9,-0.1843,6.6535,Tommy La Stella,600303,606965,field_out,hit_into_play,...,8,5,8,8,5,5,8,Infield shift,Standard,laa2019
4,CH,2019-09-29,82.1,-0.237,6.6769,Justin Bour,571506,606965,field_out,hit_into_play,...,8,5,8,8,5,5,8,Infield shift,Standard,laa2019


In [4]:
batters_og.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

In [5]:
keep_cols = ['game_date', 'player_name', 'batter', 'events', 'description', 'des',
            'bb_type', 'sv_id', 'woba_value', 'woba_denom', 'babip_value', 'iso_value']

kill_cols = []
for c in batters_og.columns:
    if c in keep_cols:
        pass
    else:
        kill_cols.append(c)

In [6]:
batters = batters_og.drop(columns = kill_cols)
batters.head()

Unnamed: 0,game_date,player_name,batter,events,description,des,bb_type,sv_id,woba_value,woba_denom,babip_value,iso_value
0,2019-09-29,Kaleb Cowart,592230,field_out,hit_into_play,Kaleb Cowart lines out to center fielder Jake ...,line_drive,190929_223922,0.0,1.0,0,0
1,2019-09-29,Brian Goodwin,571718,force_out,hit_into_play,"Brian Goodwin grounds into a force out, second...",ground_ball,190929_223525,0.0,1.0,0,0
2,2019-09-29,David Fletcher,664058,single,hit_into_play_no_out,David Fletcher singles on a ground ball to rig...,ground_ball,190929_223331,0.9,1.0,1,0
3,2019-09-29,Tommy La Stella,600303,field_out,hit_into_play,Tommy La Stella flies out to center fielder Ja...,fly_ball,190929_223222,0.0,1.0,0,0
4,2019-09-29,Justin Bour,571506,field_out,hit_into_play,Justin Bour flies out to center fielder Jake M...,fly_ball,190929_221955,0.0,1.0,0,0


## Capture at bat events by type
Types include:
- At bat
- Single
- Double
- Triple
- Home run
- Strike outs
- Walks

In [7]:
batters.events.unique()

array(['field_out', 'force_out', 'single', 'double', 'strikeout', 'walk',
       'home_run', 'hit_by_pitch', 'grounded_into_double_play',
       'field_error', 'sac_fly', 'double_play', 'fielders_choice',
       'triple', 'sac_bunt', 'caught_stealing_2b', 'catcher_interf',
       'fielders_choice_out', 'strikeout_double_play',
       'sac_fly_double_play', 'triple_play', 'intent_walk',
       'caught_stealing_home', 'batter_interference',
       'caught_stealing_3b', 'sac_bunt_double_play'], dtype=object)

In [8]:
# Capture at bats
batters['ab'] = batters['events'].apply(lambda e: 0 if e == 'walk' or \
                                        e == 'hit_by_pitch' or \
                                        e == 'sac_fly' or \
                                        e == 'sac_bunt' or \
                                        e == 'catcher_interf' or \
                                        e == 'sac_fly_double_play' or \
                                        e == 'sac_bunt_double_play'
                                        else 1)

In [9]:
# Capture strikeouts
batters['k'] = batters['events'].apply(lambda e: 1 if e == 'strikeout' or e == 'strikeout_double_play' else 0)

In [10]:
# Capture walks
batters['walk'] = batters['events'].apply(lambda e: 1 if e == 'walk' else 0)

In [11]:
# Capture singles
batters['single'] = batters['events'].apply(lambda e: 1 if e == 'single' else 0)

In [12]:
# Capture doubles
batters['double'] = batters['events'].apply(lambda e: 1 if e == 'double' else 0)

In [13]:
# Capture triples
batters['triple'] = batters['events'].apply(lambda e: 1 if e == 'triple' else 0)

In [14]:
# Capture home runs
batters['hr'] = batters['events'].apply(lambda e: 1 if e == 'home_run' else 0)

In [15]:
# Capture sacrifices
batters['sac'] = batters['events'].apply(lambda e: 1 if e == 'sac_fly' or \
                                         e == 'sac_bunt' or \
                                         e == 'sac_fly_double_play' or \
                                         e == 'sac_bunt_double_play'\
                                         else 0)

## Type of contact per each hitter
Types include:
- Line drive
- Ground_ball
- Fly ball
- Popup

In [16]:
batters.loc[batters.bb_type.isnull()];

In [17]:
# Capture line_drives
batters['line_drive'] = batters['bb_type'].apply(lambda e: 1 if e == 'line_drive' else 0)

In [18]:
# Capture ground balls
batters['ground_ball'] = batters['bb_type'].apply(lambda e: 1 if e == 'ground_ball' else 0)

In [19]:
# Capture line_drives
batters['fly_ball'] = batters['bb_type'].apply(lambda e: 1 if e == 'fly_ball' else 0)

In [20]:
# Capture line_drives
batters['popup'] = batters['bb_type'].apply(lambda e: 1 if e == 'popup' else 0)

In [26]:
batters;

## Generate RBIs

In [37]:
# batters['des'].apply(lambda r: str(r).count('scores'))
batters['rbi'] = batters['des'].apply(lambda r: str(r).count('scores'))

## Clean up data frame

In [38]:
batters.columns

Index(['game_date', 'player_name', 'batter', 'events', 'description', 'des',
       'bb_type', 'sv_id', 'woba_value', 'woba_denom', 'babip_value',
       'iso_value', 'ab', 'k', 'walk', 'single', 'double', 'triple', 'hr',
       'sac', 'line_drive', 'ground_ball', 'fly_ball', 'popup', 'rbi'],
      dtype='object')

In [39]:
kill_cols_v2 = ['events', 'description', 'des', 'bb_type', 'woba_value', 'woba_denom', 'babip_value', 'iso_value']

In [40]:
batters = batters.drop(columns = kill_cols_v2)

In [41]:
outfile = open('../data/processed/batters/2015_to_2019_batter_ab_cleaned.pickle','wb')
pickle.dump(batters, outfile)
outfile.close()

In [42]:
batters.game_date = pd.to_datetime(batters.game_date)
batters.sort_values(by=['batter', 'game_date'])

Unnamed: 0,game_date,player_name,batter,sv_id,ab,k,walk,single,double,triple,hr,sac,line_drive,ground_ball,fly_ball,popup,rbi
6131,2015-04-06,Bartolo Colon,112526,150406_172239,1,1,0,0,0,0,0,0,0,0,0,0,0
6140,2015-04-06,Bartolo Colon,112526,150406_164546,1,0,0,0,0,0,0,0,1,0,0,0,0
5943,2015-04-12,Bartolo Colon,112526,150412_150130,1,1,0,0,0,0,0,0,0,0,0,0,0
5952,2015-04-12,Bartolo Colon,112526,150412_143304,1,0,0,1,0,0,0,0,1,0,0,0,1
5961,2015-04-12,Bartolo Colon,112526,150412_135929,1,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4559,2019-05-12,Nick Margevicius,676606,190512_195709,1,0,0,1,0,0,0,0,1,0,0,0,0
4386,2019-05-18,Nick Margevicius,676606,190519_013713,1,1,0,0,0,0,0,0,0,0,0,0,0
3943,2019-06-01,Nick Margevicius,676606,190602_024752,1,0,0,0,0,0,0,0,0,1,0,0,0
3723,2019-06-07,Nick Margevicius,676606,190608_024737,1,0,0,0,0,0,0,0,0,1,0,0,0


In [43]:
numeric_cols = ['ab', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'sac', 
                'line_drive', 'ground_ball', 'fly_ball', 'popup', 'rbi']

In [44]:
batters_game = batters.groupby(by = ['batter', 'game_date', 'player_name'], axis = 0).sum()

In [45]:
batters_game.loc[batters_game.index.get_level_values(2) == 'Bryce Harper']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ab,k,walk,single,double,triple,hr,sac,line_drive,ground_ball,fly_ball,popup,rbi
batter,game_date,player_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
547180,2015-04-06,Bryce Harper,4,1,0,1,0,0,1,0,1,1,1,0,0
547180,2015-04-08,Bryce Harper,4,1,0,2,0,0,0,0,1,1,0,0,0
547180,2015-04-09,Bryce Harper,4,3,0,0,0,0,0,0,0,0,1,0,0
547180,2015-04-10,Bryce Harper,4,3,0,1,0,0,0,0,0,1,0,0,0
547180,2015-04-11,Bryce Harper,4,1,1,0,0,0,0,0,1,1,0,0,0
547180,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547180,2019-09-25,Bryce Harper,4,1,0,1,1,0,0,0,1,1,1,0,0
547180,2019-09-26,Bryce Harper,4,3,0,0,0,0,0,0,0,1,0,0,0
547180,2019-09-27,Bryce Harper,6,1,0,0,1,1,0,1,2,2,2,0,2
547180,2019-09-28,Bryce Harper,4,0,0,0,0,0,1,0,0,0,2,2,2


In [46]:
batters_game.to_csv('../data/processed/batters/2015_to_2019_batters_by_game_grouped.csv')

In [47]:
outfile = open('../data/processed/batters/2015_to_2019_batter_ab_cleaned_grouped.pickle','wb')
pickle.dump(batters_game ,outfile)
outfile.close()

outfile = open('../data/processed/batters/2015_to_2019_batter_ab_cleaned.pickle','wb')
pickle.dump(batters_game ,outfile)
outfile.close()