In [36]:
import datetime
from io import StringIO
from itertools import count
from itertools import groupby
try:
    import xml.etree.cElementTree as et
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as et

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
class Game:
    """Scrape the MLBAM Gameday XML files and Statcast pitch data for one game.

    A game is identified by a 30-character id of the form
    ``gid_YYYY_MM_DD_<away><league>_<home><league>_<gamenum>``. ``scrape``
    walks the game's directory listing, flattens every XML document into
    rows, writes one TSV per XML node type, and finally writes the Statcast
    table for both teams.
    """

    # Maps the three-letter team code embedded in a game id to the team
    # abbreviation sent in the Statcast search query.
    mlabam_to_savant_team = {'ana': 'LAA',
                             'hou': 'HOU',
                             'oak': 'OAK',
                             'tor': 'TOR',
                             'atl': 'ATL',
                             'mil': 'MIL',
                             'sln': 'STL',
                             'chn': 'CHC',
                             'ari': 'ARI',
                             'lan': 'LAD',
                             'sfn': 'SF',
                             'cle': 'CLE',
                             'sea': 'SEA',
                             'mia': 'MIA',
                             'nyn': 'NYM',
                             'was': 'WSH',
                             'bal': 'BAL',
                             'sdn': 'SD',
                             'phi': 'PHI',
                             'pit': 'PIT',
                             'tex': 'TEX',
                             'tba': 'TB',
                             'bos': 'BOS',
                             'cin': 'CIN',
                             'col': 'COL',
                             'kca': 'KC',
                             'det': 'DET',
                             'min': 'MIN',
                             'cha': 'CWS',
                             'nya': 'NYY'
                             }

    def __init__(self, gid, write_folder, include=None, exclude=None):
        """Store the scrape configuration for one game.

        Args:
            gid: Game id string (see the class docstring for the format).
            write_folder: Prefix prepended verbatim to every output file
                name, so a directory must end with a path separator.
            include: Optional substring (or list of substrings); only links
                containing one of them are scraped.
            exclude: Optional substring (or list of substrings); links
                containing any of them are skipped.

        Raises:
            ValueError: If ``gid`` fails ``_validate_id``.
        """
        if not self._validate_id(gid):
            # ValueError is a subclass of Exception, so callers that caught
            # the previous bare Exception keep working.
            raise ValueError('{} is not a valid gameid. it must be before today and of the form \"gid_YYYY_MM_DD_<away><league>_<home><league>_<gamenum>\"'.format(gid))
        self.gid = gid
        self.url = self._make_mlbam_base_path(gid)
        self.include = include
        self.exclude = exclude
        self.write_folder = write_folder

    @staticmethod
    def _validate_id(gid):
        """Return True when ``gid`` is a 30-character id with a usable date.

        The date embedded at ``gid[4:14]`` must parse as ``YYYY_MM_DD`` and
        fall between 2008-01-01 and today, inclusive. Anything else,
        including a date that does not parse, yields False.
        """
        if not isinstance(gid, str) or len(gid) != 30:
            return False
        try:
            game_date = datetime.datetime.strptime(gid[4:14], '%Y_%m_%d').date()
        except ValueError:
            # A malformed date is an invalid id, not a crash.
            return False
        return datetime.date(2008, 1, 1) <= game_date <= datetime.date.today()

    @staticmethod
    def _make_mlbam_base_path(gid):
        """Build the Gameday directory URL for ``gid`` from its date fields."""
        url_template = 'http://gd2.mlb.com/components/game/mlb/year_{}/month_{}/day_{}/{}/'
        # NOTE(review): asserts are stripped under ``python -O``; they are
        # kept so the exception type seen by callers does not change.
        assert(gid[4:8].isnumeric())
        assert(gid[9:11].isnumeric())
        assert(gid[12:14].isnumeric())
        return url_template.format(gid[4:8], gid[9:11], gid[12:14], gid)

    def get_links_in_game(self, exclude=None, include=None):
        """Collect the URLs of all ``.xml``/``.plist`` files under the game.

        Every link that does not end in ``xml`` or ``plist`` is fetched and
        searched for further links.

        Args:
            exclude: Substring or list of substrings; matching links are
                dropped.
            include: Substring or list of substrings; only matching links
                are kept.

        Returns:
            List of file URLs in the order they were discovered.

        Raises:
            Exception: If both ``exclude`` and ``include`` are given.
        """
        if exclude is not None and not isinstance(exclude, list):
            exclude = [exclude]
        if include is not None and not isinstance(include, list):
            include = [include]

        if include is not None and exclude is not None:
            raise Exception('only exclude OR include allowed')

        def is_wanted(path):
            # At most one of the two filters is set at this point.
            if exclude is not None:
                return not any(x in path for x in exclude)
            if include is not None:
                return any(x in path for x in include)
            return True

        target = []

        def recurse_links(base_path):
            response = requests.get(base_path)
            soup = BeautifulSoup(response.text, 'html.parser')

            # The first anchor is skipped -- presumably the parent-directory
            # link of the listing page; confirm against the server output.
            for link in soup.find_all('a')[1:]:
                href = link.get('href')
                if href is None:
                    # An anchor without href cannot be followed.
                    continue
                new_base_path = base_path + href
                if new_base_path.split('.')[-1] in {'xml', 'plist'}:
                    if is_wanted(new_base_path):
                        target.append(new_base_path)
                else:
                    recurse_links(new_base_path)

        recurse_links(self.url)
        return target

    @staticmethod
    def get_xml_tree(path):
        """Parse the XML document at ``path`` and return its root element.

        Paths starting with ``http`` are downloaded with ``requests``; any
        other path is parsed as a local file.
        """
        if path.startswith('http'):
            response = requests.get(path)
            return et.fromstring(response.text)
        return et.parse(path).getroot()

    def xml_to_table(self, root_node, indexes=None):
        """Flatten an XML tree into a list of row dictionaries.

        Each element becomes one row holding the game id, its tag
        (``node_type``), a per-tag running ``key``, the tag and key of its
        parent, its stripped text, and all of its XML attributes. Attributes
        are merged last, so an attribute named like one of the fixed columns
        overrides that column. Children are emitted before their parent.

        Args:
            root_node: Root element of the tree to flatten.
            indexes: Optional dict mapping tag -> ``itertools.count``. Pass
                the same dict to several calls to keep keys unique across
                documents; by default every call starts counting at 0.

        Returns:
            List of row dictionaries.
        """
        if indexes is None:
            indexes = {}
        target = []

        def next_key(tag):
            if tag not in indexes:
                indexes[tag] = count()
            return next(indexes[tag])

        def recurse_xml(node, referer=None, referer_key=None):
            key = next_key(node.tag)

            # Iterating the element yields its direct children
            # (Element.getchildren() was removed in Python 3.9).
            for child in node:
                recurse_xml(child, node.tag, key)

            target.append({
                'gid': self.gid,
                'node_type': node.tag,
                'key': key,
                'parent_table': referer,
                'parent_key': referer_key,
                'text': (node.text or '').strip(),
                **node.attrib
            })

        recurse_xml(root_node)
        return target

    def write_tables(self, rows, rootpath, group_key='node_type'):
        """Write ``rows`` to one TSV file per distinct ``group_key`` value.

        Files are named ``<rootpath><value>.tsv`` and the grouping column is
        dropped from each table. An existing file of the same name is
        overwritten.
        """
        def group_func(row):
            return row[group_key]

        # groupby only merges adjacent items, hence the sort on the same key.
        for key, group in groupby(sorted(rows, key=group_func), key=group_func):
            table = pd.DataFrame(list(group))
            table = table.drop(group_key, axis=1)
            table.to_csv('{}{}.tsv'.format(rootpath, key), sep='\t', index=False)

    def get_statcast_data(self):
        """Download the Statcast pitch table for both teams on the game date.

        Returns:
            DataFrame with the home team's rows followed by the away team's.

        Raises:
            KeyError: If a team code in the game id is missing from
                ``mlabam_to_savant_team``.
        """
        year = self.gid[4:8]
        month = self.gid[9:11]
        day = self.gid[12:14]
        away = self.mlabam_to_savant_team[self.gid[15:18]]
        home = self.mlabam_to_savant_team[self.gid[22:25]]

        date = '-'.join([year, month, day])
        savant_url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfZ=&hfGT=R%7C&hfPR=&hfAB=&stadium=&hfBBT=&hfBBL=&hfC=&season={}&player_type=batter&hfOuts=&pitcher_throws=&batter_stands=&start_speed_gt=&start_speed_lt=&perceived_speed_gt=&perceived_speed_lt=&spin_rate_gt=&spin_rate_lt=&exit_velocity_gt=&exit_velocity_lt=&launch_angle_gt=&launch_angle_lt=&distance_gt=&distance_lt=&batted_ball_angle_gt=&batted_ball_angle_lt=&game_date_gt={}&game_date_lt={}&team={}&position=&hfRO=&home_road=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&sort_order=desc&min_abs=0&xba_gt=&xba_lt=&px1=&px2=&pz1=&pz2=&type=details&'

        tables = []
        for team in (home, away):
            statcast = requests.get(savant_url.format(year, date, date, team))
            tables.append(pd.read_csv(StringIO(statcast.text)))

        return pd.concat(tables)

    def scrape(self):
        """Scrape every selected Gameday file plus Statcast and write TSVs.

        Rows from all XML files are collected first and written once, so
        files that share a node type end up in the same table instead of
        overwriting each other. Key counters are shared across files so
        ``key``/``parent_key`` stay unique within each table.
        """
        links = self.get_links_in_game(self.exclude, self.include)
        indexes = {}
        rows = []
        for link in links:
            tree = self.get_xml_tree(link)
            rows.extend(self.xml_to_table(tree, indexes))
        self.write_tables(rows, self.write_folder)

        statcast_table = self.get_statcast_data()
        statcast_table.to_csv('{}{}.tsv'.format(self.write_folder, 'statcast'), sep='\t', index=False)


In [None]:
# Scrape one sample game (Mets at Padres, 2016-05-07, game 1) into the data folder.
test = Game(
    gid='gid_2016_05_07_nynmlb_sdnmlb_1',
    write_folder='/home/ubuntu/src/pitchfx_statcast_scrape/data/',
)
test.scrape()

In [6]:
%%prun
# Run a full scrape of the game under the %%prun cell magic on the preceding line.
test.scrape()

 

In [46]:
# Three-letter team code found in a game id -> abbreviation used in the
# Statcast search query.
mlabam_to_savant_team = {
    'ana': 'LAA',
    'hou': 'HOU',
    'oak': 'OAK',
    'tor': 'TOR',
    'atl': 'ATL',
    'mil': 'MIL',
    'sln': 'STL',
    'chn': 'CHC',
    'ari': 'ARI',
    'lan': 'LAD',
    'sfn': 'SF',
    'cle': 'CLE',
    'sea': 'SEA',
    'mia': 'MIA',
    'nyn': 'NYM',
    'was': 'WSH',
    'bal': 'BAL',
    'sdn': 'SD',
    'phi': 'PHI',
    'pit': 'PIT',
    'tex': 'TEX',
    'tba': 'TB',
    'bos': 'BOS',
    'cin': 'CIN',
    'col': 'COL',
    'kca': 'KC',
    'det': 'DET',
    'min': 'MIN',
    'cha': 'CWS',
    'nya': 'NYY',
}

In [63]:
gid = 'gid_2016_05_07_nynmlb_sdnmlb_1'

def get_statcast_data(self):
    """Download the Statcast pitch table for both teams on a game's date.

    Args:
        self: Either a game id string of the form
            ``gid_YYYY_MM_DD_<away><league>_<home><league>_<gamenum>`` or
            any object exposing that string as a ``gid`` attribute.

    Returns:
        DataFrame with the home team's rows followed by the away team's.

    Raises:
        KeyError: If a team code in the id is missing from
            ``mlabam_to_savant_team``.
    """
    # The function is called with a bare id string in this file, so fall
    # back to the argument itself when it carries no ``gid`` attribute.
    game_id = getattr(self, 'gid', self)

    year = game_id[4:8]
    month = game_id[9:11]
    day = game_id[12:14]
    away = mlabam_to_savant_team[game_id[15:18]]
    home = mlabam_to_savant_team[game_id[22:25]]

    date = '-'.join([year, month, day])
    savant_url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfZ=&hfGT=R%7C&hfPR=&hfAB=&stadium=&hfBBT=&hfBBL=&hfC=&season={}&player_type=batter&hfOuts=&pitcher_throws=&batter_stands=&start_speed_gt=&start_speed_lt=&perceived_speed_gt=&perceived_speed_lt=&spin_rate_gt=&spin_rate_lt=&exit_velocity_gt=&exit_velocity_lt=&launch_angle_gt=&launch_angle_lt=&distance_gt=&distance_lt=&batted_ball_angle_gt=&batted_ball_angle_lt=&game_date_gt={}&game_date_lt={}&team={}&position=&hfRO=&home_road=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&sort_order=desc&min_abs=0&xba_gt=&xba_lt=&px1=&px2=&pz1=&pz2=&type=details&'

    tables = []
    for team in (home, away):
        statcast = requests.get(savant_url.format(year, date, date, team))
        tables.append(pd.read_csv(StringIO(statcast.text)))

    return pd.concat(tables)

In [64]:
# Fetch the Statcast table for the sample game id defined in the previous cell.
mytab = get_statcast_data(gid)

In [65]:
# Display the pitches ordered by inning, then batter; the result is not assigned, so mytab keeps its original order.
mytab.sort_values(['inning', 'batter'])

Unnamed: 0,pitch_type,pitch_id,game_date,start_speed,x0,z0,player_name,batter,pitcher,events,...,az,sz_top,sz_bot,hit_distance_sc,hit_speed,hit_angle,effective_speed,release_spin_rate,release_extension,game_pk
61,FF,11,2016-05-07,91.76,-1.537,5.914,David Wright,431151,448306,Walk,...,-13.11,3.56,1.64,,,,93.49,2227.00,7.08,447317
62,CH,13,2016-05-07,84.25,-1.492,6.038,David Wright,431151,448306,Walk,...,-24.39,3.52,1.64,,,,84.64,1637.00,6.43,447317
63,CH,15,2016-05-07,84.97,-1.281,5.980,David Wright,431151,448306,Walk,...,-22.89,3.56,1.68,,,,85.88,1773.00,6.81,447317
64,FF,10,2016-05-07,91.44,-1.578,5.933,David Wright,431151,448306,Walk,...,-15.87,3.52,1.64,,,,92.53,2177.00,6.90,447317
65,KC,12,2016-05-07,77.01,-1.767,6.309,David Wright,431151,448306,Walk,...,-37.28,3.56,1.64,,,,76.83,2381.00,6.07,447317
66,FF,14,2016-05-07,91.77,-1.476,5.883,David Wright,431151,448306,Walk,...,-14.77,3.55,1.64,,,,92.47,2240.00,6.71,447317
57,CH,4,2016-05-07,82.74,-1.534,6.057,Curtis Granderson,434158,448306,Flyout,...,-25.05,3.09,1.59,,,,83.35,1739.00,6.62,447317
58,FC,6,2016-05-07,88.70,-1.696,6.027,Curtis Granderson,434158,448306,Flyout,...,-23.01,3.18,1.54,362.12,98.75,38.89,89.86,2218.00,6.74,447317
59,FF,3,2016-05-07,91.02,-1.793,5.949,Curtis Granderson,434158,448306,Flyout,...,-14.85,3.12,1.54,,,,92.64,2157.00,6.98,447317
60,CH,5,2016-05-07,83.95,-1.440,5.959,Curtis Granderson,434158,448306,Flyout,...,-22.34,3.11,1.59,,,,84.68,1777.00,6.69,447317


In [35]:
# Parse a Statcast CSV response body into a DataFrame.
# NOTE(review): `statcast` is not defined at module level anywhere in the visible file --
# presumably a response object left over from an earlier interactive session; confirm.
pd.read_csv(StringIO(statcast.text))

Unnamed: 0,pitch_type,pitch_id,game_date,start_speed,x0,z0,player_name,batter,pitcher,events,...,az,sz_top,sz_bot,hit_distance_sc,hit_speed,hit_angle,effective_speed,release_spin_rate,release_extension,game_pk
0,FF,324,2016-05-27,94.96,-1.226,6.029,Chris Gimenez,460269,605541,Walk,...,-15.80,3.39,1.47,,,,94.51,2162.00,5.88,447579
1,SL,325,2016-05-27,84.06,-1.150,5.957,Chris Gimenez,460269,605541,Walk,...,-33.64,3.33,1.47,,,,83.70,0.00,5.84,447579
2,FT,321,2016-05-27,91.77,-0.811,6.202,Chris Gimenez,460269,605541,Walk,...,-23.04,3.40,1.56,,,,91.74,1922.00,6.00,447579
3,FT,322,2016-05-27,90.91,-1.007,6.113,Chris Gimenez,460269,605541,Walk,...,-23.52,3.52,1.59,,,,90.53,2018.00,5.88,447579
4,FF,323,2016-05-27,93.08,-0.970,6.439,Chris Gimenez,460269,605541,Walk,...,-13.51,3.33,1.47,,,,92.23,2088.00,5.62,447579
5,FF,326,2016-05-27,96.49,-1.215,6.116,Chris Gimenez,460269,605541,Walk,...,-12.36,3.36,1.47,,,,96.15,2167.00,5.92,447579
6,FF,331,2016-05-27,93.23,-0.867,6.282,Rajai Davis,434658,605541,Single,...,-20.10,3.40,1.47,,,,93.26,1879.00,6.09,447579
7,FF,333,2016-05-27,94.53,-0.832,6.002,Rajai Davis,434658,605541,Single,...,-21.78,3.24,1.55,,,,94.78,1962.00,6.29,447579
8,FF,334,2016-05-27,96.20,-1.127,5.961,Rajai Davis,434658,605541,Single,...,-13.68,3.37,1.46,,,,96.51,2106.00,6.28,447579
9,FF,330,2016-05-27,91.97,-0.762,6.071,Rajai Davis,434658,605541,Single,...,-17.42,3.24,1.62,,,,92.19,1848.00,6.17,447579


In [18]:
from io import StringIO


Index(['pitch_type', 'pitch_id', 'game_date', 'start_speed', 'x0', 'z0',
       'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir',
       'spin_rate', 'break_angle', 'break_length', 'zone', 'des', 'game_type',
       'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location',
       'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'px',
       'pz', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'inning_topbot', 'hc_x', 'hc_y', 'tfs', 'tfs_zulu', 'catcher', 'umpire',
       'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'hit_speed', 'hit_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk'],
      dtype='object')