In [2]:
# Import required packages
import pandas as pd
import numpy as np
%matplotlib inline
import time
import pickle

In [7]:
# Function that modifies baseball savant url for each team in order to get all batter data per date in 2019
def url_per_team(front_url, year, mid_url, team, back_url):
    team_url = front_url + year + mid_url + team + back_url
    team_url = team_url.replace(' ', '')
    return team_url

In [10]:
# Create url components required to reference baseballsavant's data

front_url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&\
                hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea='

years = ['2019', '2018', '2017', '2016', '2015']

mid_url = '%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&\
                game_date_gt=&game_date_lt=&hfInfield=&team='

back_url = '&position=&hfOutfield=&hfRO=&\
                home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&\
                group_by=name-date&sort_col=ba&player_event_sort=h_launch_speed&sort_order=desc&\
                min_pas=0&chk_stats_pa=on&chk_stats_abs=on&chk_stats_hits=on&chk_stats_singles=on&\
                chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&\
                chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_babip=on&\
                chk_stats_iso=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_slg=on&\
                chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&'

teams = ['LAA', 'HOU', 'OAK', 'TOR', 'ATL', 'MIL', 'STL', 'CHC', 'ARI', 'LAD', 'SF', 'CLE', 'SEA', \
            'MIA', 'NYM', 'WSH', 'BAL', 'SD', 'PHI', 'PIT', 'TEX', 'TB', 'BOS', 'CIN', 'COL', 'KC', \
            'DET', 'MIN', 'CWS', 'NYY']

In [25]:
# Create dictionary of team names and baseaball savant urls

team_urls_dict = {}

for team in teams:
    for year in years:
        name_lower = team.lower()
        team_url = url_per_team(front_url, year, mid_url, team, back_url)
        team_urls_dict.update({'{}{}'.format(name_lower, year): team_url})

In [26]:
# Check to see if dictionary works and it does
team_urls_dict['pit2016']

'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2016%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfInfield=&team=PIT&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name-date&sort_col=ba&player_event_sort=h_launch_speed&sort_order=desc&min_pas=0&chk_stats_pa=on&chk_stats_abs=on&chk_stats_hits=on&chk_stats_singles=on&chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_babip=on&chk_stats_iso=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_slg=on&chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&'

In [27]:
# Test scrape for LA Angels batter data and it works!

# laa = pd.read_csv(team_urls_dict['laa'])
# laa

In [28]:
# Create a function that takes a team name and url
# It captues csv batter data for that team
# Adds a column with the team name
# Saves df to a dataframe
# It returns the team id (lowercase) and the batters data dataframe dictionary

def get_batters_data(team, url):
    batter_df = pd.read_csv(url)
    batter_df['team'] = team
    return team, batter_df

In [29]:
# # Create dictionary with which to save all batter dataframes acquired from baseball savant
batter_df_dict = {}
teams_left = 29 * 5

for team in team_urls_dict:
    team, batter_df = get_batters_data(team, team_urls_dict[team])
    batter_df_dict.update({team: batter_df})
    print('Just snagged batter data for {}, {} of 30 teams remaining...'.format(team, teams_left))
    teams_left -= 1
    time.sleep(20)
    
# # After snagging data for half the league (15 teams) hit a 502 error
# # Disabling code so webscraping does not rerun

Just snagged batter data for laa2019, 145 of 30 teams remaining...
Just snagged batter data for laa2018, 144 of 30 teams remaining...
Just snagged batter data for laa2017, 143 of 30 teams remaining...
Just snagged batter data for laa2016, 142 of 30 teams remaining...
Just snagged batter data for laa2015, 141 of 30 teams remaining...
Just snagged batter data for hou2019, 140 of 30 teams remaining...
Just snagged batter data for hou2018, 139 of 30 teams remaining...
Just snagged batter data for hou2017, 138 of 30 teams remaining...
Just snagged batter data for hou2016, 137 of 30 teams remaining...
Just snagged batter data for hou2015, 136 of 30 teams remaining...
Just snagged batter data for oak2019, 135 of 30 teams remaining...
Just snagged batter data for oak2018, 134 of 30 teams remaining...
Just snagged batter data for oak2017, 133 of 30 teams remaining...
Just snagged batter data for oak2016, 132 of 30 teams remaining...
Just snagged batter data for oak2015, 131 of 30 teams remainin

Just snagged batter data for col2015, 21 of 30 teams remaining...
Just snagged batter data for kc2019, 20 of 30 teams remaining...
Just snagged batter data for kc2018, 19 of 30 teams remaining...
Just snagged batter data for kc2017, 18 of 30 teams remaining...
Just snagged batter data for kc2016, 17 of 30 teams remaining...
Just snagged batter data for kc2015, 16 of 30 teams remaining...
Just snagged batter data for det2019, 15 of 30 teams remaining...
Just snagged batter data for det2018, 14 of 30 teams remaining...
Just snagged batter data for det2017, 13 of 30 teams remaining...
Just snagged batter data for det2016, 12 of 30 teams remaining...
Just snagged batter data for det2015, 11 of 30 teams remaining...
Just snagged batter data for min2019, 10 of 30 teams remaining...
Just snagged batter data for min2018, 9 of 30 teams remaining...
Just snagged batter data for min2017, 8 of 30 teams remaining...
Just snagged batter data for min2016, 7 of 30 teams remaining...
Just snagged batte

In [30]:
# New loop to grab teams not acquired prior to the 502 error thrown above

for team in team_urls_dict:
    if team in batter_df_dict:
        print('Skipping {}, already grabbed it'.format(team))
    else:
        team, batter_df = get_batters_data(team, team_urls_dict[team])
        batter_df_dict.update({team: batter_df})
        print('Just snagged batter data for {}'.format(team))
        time.sleep(20)

Skipping laa2019, already grabbed it
Skipping laa2018, already grabbed it
Skipping laa2017, already grabbed it
Skipping laa2016, already grabbed it
Skipping laa2015, already grabbed it
Skipping hou2019, already grabbed it
Skipping hou2018, already grabbed it
Skipping hou2017, already grabbed it
Skipping hou2016, already grabbed it
Skipping hou2015, already grabbed it
Skipping oak2019, already grabbed it
Skipping oak2018, already grabbed it
Skipping oak2017, already grabbed it
Skipping oak2016, already grabbed it
Skipping oak2015, already grabbed it
Skipping tor2019, already grabbed it
Skipping tor2018, already grabbed it
Skipping tor2017, already grabbed it
Skipping tor2016, already grabbed it
Skipping tor2015, already grabbed it
Skipping atl2019, already grabbed it
Skipping atl2018, already grabbed it
Skipping atl2017, already grabbed it
Skipping atl2016, already grabbed it
Skipping atl2015, already grabbed it
Skipping mil2019, already grabbed it
Skipping mil2018, already grabbed it
S

In [31]:
batter_df_dict['cws2016']

Unnamed: 0,pitches,player_id,player_name,game_date,total_pitches,pitch_percent,ba,iso,babip,slg,...,eff_min_vel,release_extension,pos3_int_start_distance,pos4_int_start_distance,pos5_int_start_distance,pos6_int_start_distance,pos7_int_start_distance,pos8_int_start_distance,pos9_int_start_distance,team
0,5,543776,JB Shuck,2016-09-12,5,100.0,,,,,...,-0.4,6.60,118.0,148.0,116.0,146.0,287.0,318.0,282.0,cws2016
1,6,408047,Justin Morneau,2016-09-20,6,100.0,,,,,...,0.5,7.05,121.0,150.0,112.0,154.0,279.0,308.0,284.0,cws2016
2,14,543434,Brett Lawrie,2016-04-20,14,100.0,,,,,...,-2.1,5.45,108.0,136.0,98.0,137.0,299.0,318.0,278.0,cws2016
3,4,641313,Tim Anderson,2016-08-04,4,100.0,,,,,...,-0.3,5.94,108.0,155.0,100.0,146.0,314.0,320.0,277.0,cws2016
4,4,408047,Justin Morneau,2016-09-04,4,100.0,,,,,...,-0.3,6.09,85.0,132.0,106.0,147.0,290.0,313.0,293.0,cws2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,7,502009,Mat Latos,2016-05-31,7,100.0,0.0,0.0,,0.0,...,-0.3,6.26,111.0,152.0,111.0,149.0,290.0,303.0,273.0,cws2016
1533,7,425900,Dioner Navarro,2016-08-07,7,100.0,0.0,0.0,0.0,0.0,...,-1.0,5.99,115.0,154.0,129.0,150.0,275.0,308.0,299.0,cws2016
1534,11,466320,Melky Cabrera,2016-04-04,11,100.0,0.0,0.0,0.0,0.0,...,0.1,6.07,113.0,151.0,108.0,151.0,278.0,312.0,300.0,cws2016
1535,13,466320,Melky Cabrera,2016-04-05,13,100.0,0.0,0.0,0.0,0.0,...,-0.6,6.29,109.0,147.0,111.0,149.0,276.0,318.0,307.0,cws2016


In [42]:
# Loop through batter_df_dict and concatitate all dfs into one master file
all_batters_df = {}

for team in batter_df_dict.keys():
    if team == 'laa2019':
        all_batters_df = batter_df_dict[team]
    else:
        all_batters_df = pd.concat([all_batters_df, batter_df_dict[team]])

In [43]:
# Looks good, let's save as a csv and continue working in the next notebook
all_batters_df.to_csv('../data/processed/batters/2019_to_2015_batter_data_by_game.csv')

In [44]:
outfile = open('../data/processed/batters/raw_batter_data_by_game_2015_to_2019.pickle','wb')
pickle.dump(all_batters_df ,outfile)
outfile.close()