In [1]:
# Import required packages
import pandas as pd
import numpy as np
%matplotlib inline
import time
import pickle

In [2]:
# Function that modifies baseball savant url for each team in order to get all batter data per date in 2019
def url_per_team(front_url, team, back_url):
    """Return the search url for a single team.

    The team abbreviation is placed between ``front_url`` and ``back_url`` and
    every space character is then stripped from the joined string, so url
    pieces that carry indentation whitespace still yield a clean url.
    """
    return ''.join((front_url, team, back_url)).replace(' ', '')

In [3]:
# Create url components required to reference baseballsavant's data

# Everything in the statcast_search csv url that comes before the team abbreviation.
# Adjacent string literals inside parentheses are joined by Python itself, so
# (unlike a backslash continuation inside one literal) no indentation whitespace
# ends up inside the url.
front_url = (
    'https://baseballsavant.mlb.com/statcast_search/csv?all=true&'
    'hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2019%7C&'
    'hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&'
    'game_date_gt=&game_date_lt=&hfInfield=&team='
)

# Everything in the url that comes after the team abbreviation
# (grouping, sorting and the chk_stats_* columns to include).
back_url = (
    '&position=&hfOutfield=&hfRO=&'
    'home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&'
    'group_by=name-date&sort_col=ba&player_event_sort=h_launch_speed&sort_order=desc&'
    'min_pas=0&chk_stats_pa=on&chk_stats_abs=on&chk_stats_hits=on&chk_stats_singles=on&'
    'chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&'
    'chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_babip=on&'
    'chk_stats_iso=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_slg=on&'
    'chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&'
)

# Team abbreviations substituted into the url's team= parameter, one request per team.
# No line-continuation backslashes are needed inside the brackets.
teams = [
    'LAA', 'HOU', 'OAK', 'TOR', 'ATL', 'MIL', 'STL', 'CHC', 'ARI', 'LAD', 'SF', 'CLE', 'SEA',
    'MIA', 'NYM', 'WSH', 'BAL', 'SD', 'PHI', 'PIT', 'TEX', 'TB', 'BOS', 'CIN', 'COL', 'KC',
    'DET', 'MIN', 'CWS', 'NYY',
]

In [4]:
# Create dictionary of team names and baseball savant urls

# Key each url by the lowercase team abbreviation
team_urls_dict = {
    abbrev.lower(): url_per_team(front_url, abbrev, back_url)
    for abbrev in teams
}

In [5]:
# Sanity check: display the 'pit' entry to confirm the dictionary is keyed by
# lowercase team abbreviation and that the assembled url contains no spaces
team_urls_dict['pit']

'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2019%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfInfield=&team=PIT&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name-date&sort_col=ba&player_event_sort=h_launch_speed&sort_order=desc&min_pas=0&chk_stats_pa=on&chk_stats_abs=on&chk_stats_hits=on&chk_stats_singles=on&chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_babip=on&chk_stats_iso=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_slg=on&chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&'

In [6]:
# Test scrape for LA Angels batter data and it works!

# laa = pd.read_csv(team_urls_dict['laa'])
# laa

Unnamed: 0,pitches,player_id,player_name,game_date,total_pitches,pitch_percent,ba,iso,babip,slg,...,takes,eff_min_vel,release_extension,pos3_int_start_distance,pos4_int_start_distance,pos5_int_start_distance,pos6_int_start_distance,pos7_int_start_distance,pos8_int_start_distance,pos9_int_start_distance
0,5,405395,Albert Pujols,2019-04-13,5,100.0,,,,,...,5,-1.1,5.68,112.0,166.0,124.0,169.0,300.0,326.0,289.0
1,5,621493,Taylor Ward,2019-07-23,5,100.0,,,,,...,5,-1.1,5.45,116.0,157.0,108.0,143.0,292.0,313.0,281.0
2,5,571718,Brian Goodwin,2019-08-24,5,100.0,,,,,...,4,-0.4,6.20,121.0,150.0,104.0,153.0,278.0,315.0,311.0
3,8,405395,Albert Pujols,2019-05-20,8,100.0,,,,,...,5,-1.4,5.48,86.0,146.0,120.0,143.0,313.0,331.0,294.0
4,5,660271,Shohei Ohtani,2019-07-24,5,100.0,,,,,...,4,0.1,6.15,129.0,183.0,126.0,152.0,302.0,324.0,312.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600,18,594777,Kole Calhoun,2019-07-06,18,100.0,0.0,0.0,0.0,0.0,...,10,-0.5,6.11,121.0,175.0,111.0,152.0,292.0,315.0,298.0
1601,3,665120,Jared Walsh,2019-09-11,3,100.0,0.0,0.0,,0.0,...,2,-1.1,5.55,120.0,176.0,119.0,155.0,307.0,328.0,301.0
1602,19,594777,Kole Calhoun,2019-07-12,19,100.0,0.0,0.0,0.0,0.0,...,7,-1.3,5.45,119.0,168.0,120.0,150.0,297.0,321.0,289.0
1603,10,665120,Jared Walsh,2019-09-10,10,100.0,0.0,0.0,0.0,0.0,...,5,-0.7,5.67,118.0,179.0,125.0,155.0,294.0,328.0,301.0


In [None]:
# Create a function that takes a team name and url
# It captures csv batter data for that team
# Loads it into a dataframe
# Adds a column with the team name
# It returns the team id (lowercase, exactly as passed in) and the batter data dataframe

def get_batters_data(team, url):
    """Read one team's batter csv and tag every row with the team.

    Parameters
    ----------
    team : value stored in the added ``team`` column and returned unchanged.
    url : anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    tuple
        ``(team, dataframe)`` where the dataframe has a ``team`` column set to
        ``team`` on every row.
    """
    team_batters = pd.read_csv(url)
    return team, team_batters.assign(team=team)

In [None]:
# Create dictionary with which to save all batter dataframes acquired from baseball savant
batter_df_dict = {}
# Derive the progress counter from the url dictionary so the message stays
# correct if the team list ever changes (previously hard-coded as 29 / 30)
total_teams = len(team_urls_dict)

for fetched_count, (team, url) in enumerate(team_urls_dict.items(), start=1):
    team, batter_df = get_batters_data(team, url)
    batter_df_dict[team] = batter_df
    teams_left = total_teams - fetched_count
    print('Just snagged batter data for {}, {} of {} teams remaining...'.format(team, teams_left, total_teams))
    # Pause between requests; there is nothing to wait for after the last team
    if teams_left > 0:
        time.sleep(30)
    
# # After snagging data for half the league (15 teams) hit a 502 error
# # Disabling code so webscraping does not rerun

In [None]:
# New loop to grab teams not acquired prior to the 502 error thrown above

# Only request teams that are still missing from batter_df_dict
for team, url in team_urls_dict.items():
    if team in batter_df_dict:
        print('Skipping {}, already grabbed it'.format(team))
        continue
    fetched_team, fetched_df = get_batters_data(team, url)
    batter_df_dict[fetched_team] = fetched_df
    print('Just snagged batter data for {}'.format(fetched_team))
    # Pause between requests
    time.sleep(20)

In [None]:
# Spot-check the dataframe stored under the 'cws' key
batter_df_dict['cws']

In [None]:
# Concatenate every team's dataframe in batter_df_dict into one master dataframe.
# A single pd.concat call does not depend on which team happens to be the first
# key in the dictionary (the old loop only initialised the result when it saw
# 'laa') and avoids re-copying the growing frame on every iteration.
# Row indices of the individual frames are kept as-is (ignore_index is not set).
all_batters_df = pd.concat(list(batter_df_dict.values()))

In [None]:
# Looks good, let's save as a csv and continue working in the next notebook
# NOTE(review): the path is relative to the notebook's working directory and
# ../data/processed is assumed to already exist -- confirm before running
all_batters_df.to_csv('../data/processed/2019_batter_data_by_game.csv')

In [None]:
# Also persist the combined dataframe as a pickle. The with-block guarantees
# the file handle is closed even if pickle.dump raises part-way through.
with open('../data/processed/batters/raw_batter_data_by_game_2019.pickle', 'wb') as outfile:
    pickle.dump(all_batters_df, outfile)