In [29]:
import requests
import xml.etree.cElementTree as et
from os import path
from bs4 import BeautifulSoup

In [33]:
ROOTPATH = '/home/ubuntu/baseballdata/'

def xml_recurse(table_name, node, gid, referer='NONE', depth=0):
    subtables = list(set([child.tag for child in node.getchildren()])) or ['NONE']
    attribs = list()
    text = ''
    
    if bool(node.attrib):
        attribs = list(node.attrib.values())
    if node.text:
        text = node.text
        
    for child in node.getchildren():
        xml_recurse(table_name, child, gid, node.tag, depth=depth+1)
    
    write_node(table_name, xmltree.tag, node.tag, gid, subtables, referer, attribs, text.strip())
            
def write_node(table_name, rootnode, tag, gid, subtables, parent, attributes, text):
    write_path = '{}.{}.{}.txt'.format(ROOTPATH + table_name, rootnode, tag)
    attributes = '\t'.join(attributes)
    for subtable in subtables:
        with open(write_path, 'a+') as tmp:
            tmp.write('\t'.join([gid, subtable, parent, attributes, text]) + '\n')

In [36]:
def get_links_in_game(path, depth=0):
    response = requests.get(path)
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a')[1:]:
        mylink = link.get('href')
        if mylink.split('.')[-1] in {'xml', 'plist'}:
            table_name = ''
            if depth == 0:
                table_name = (path[96:] + mylink).split('.')[0]
            elif depth == 1:
                table_name = (path[96:] + mylink).split('.')[0].split('/')[0]
            else:
                table_name = (path[96:] + mylink).split('.')[0].split('/')[1] + '_' + (path[96:] + mylink).split('.')[0].split('/')[3]                
            response = requests.get(path + mylink)
            xmltree = et.fromstring(response.text)
            gameid = path + mylink
            gameid = gameid.split('/')[9]
            xml_recurse(table_name, xmltree, gameid)
        else:
            get_links_in_game(path + mylink, depth+1)

In [37]:
get_links_in_game('http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_07/gid_2016_05_07_oakmlb_balmlb_1/')

NameError: name 'xmltree' is not defined

In [8]:
import datetime
import requests
from bs4 import BeautifulSoup
from itertools import count
import xml.etree.cElementTree as et

In [36]:
class MlbamGame:
    def __init__(self, gid):
        if self._validate_id(gid):
            self.gid = gid
            self.url = self._make_mlbam_base_path(gid)
        else:
            raise Exception('{} is not a valid gameid. it must be before today and of the form \"gid_YYYY_MM_DD_<away><league>_<home><league>_<gamenum>\"'.format(gid))

    @staticmethod
    def _validate_id(gid):
        if type(gid) != str:
            return False
        if len(gid) != 30:
            return False
        if datetime.datetime.strptime(gid[4:14], '%Y_%m_%d').date() > datetime.date.today():
            return False
        if datetime.datetime.strptime(gid[4:14], '%Y_%m_%d').date() < datetime.date(2008, 1, 1):
            return False
        return True
    
    @staticmethod
    def _make_mlbam_base_path(gid):
        url_template = 'http://gd2.mlb.com/components/game/mlb/year_{}/month_{}/day_{}/{}/'
        assert(gid[4:8].isnumeric())
        assert(gid[9:11].isnumeric())
        assert(gid[12:14].isnumeric())
        return(url_template.format(gid[4:8], gid[9:11], gid[12:14], gid))
    
    def get_links_in_game(self, exclude=None, include=None):
        if exclude is not None and type(exclude) is not list:
            exclude = [exclude]
        if include is not None and type(include) is not list:
            include = [include]

        if include is not None and exclude is not None:
            raise Exception('only exclude OR include allowed')

        def recurse_links(base_path):
            response = requests.get(base_path)
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a')[1:]:
                new_base_path = base_path + link.get('href')
                if new_base_path.split('.')[-1] in {'xml', 'plist'}:
                    if exclude is not None and not any([x in new_base_path for x in exclude]):
                        target.append(new_base_path)
                    elif include is not None and any([x in new_base_path for x in include]):
                        target.append(new_base_path)
                    elif exclude is None and include is None:
                        target.append(new_base_path)
                else:
                    recurse_links(new_base_path)

        target = list()
        recurse_links(self.url)
        return(target)

    @staticmethod
    def get_xml_tree(path):
        if path.startswith('http'):
            response = requests.get(path)
        xmltree = et.fromstring(response.text)
        return(xmltree)
    
    def xml_to_table(self, root_node):
        
        def get_key(node):
            if node in indexes:
                return(next(indexes[node]))
            else:
                indexes[node] = count()
                return(next(indexes[node]))

        def recurse_xml(node, referer=None, referer_key=None):
            key = get_key(node.tag)
            subtables = set([child.tag for child in node.getchildren()]) or 'NONE'
            attribs = dict()
            text = ''

            if bool(node.attrib):
                attribs = node.attrib
            if node.text:
                text = node.text

            for child in node.getchildren():
                recurse_xml(child, node.tag, key)

            target.append({
                    'gid': self.gid,
                    'node_type': node.tag,
                    'key': key,
                    'child_tables': subtables,
                    'parent_table': referer, 
                    'parent_key': referer_key,
                    'text': text.strip(),
                    **attribs
                })

        indexes = dict()
        target = list()
        recurse_xml(root_node)
        return(target)


In [39]:
test = MlbamGame('gid_2016_05_07_nynmlb_sdnmlb_1')
links = test.get_links_in_game(include='inning_all')
mytree = test.get_xml_tree(links[0])
rows = xml_to_table(mytree)

[{'ax': '-9.73',
  'ay': '23.021',
  'az': '-14.854',
  'break_angle': '31.0',
  'break_length': '4.1',
  'break_y': '23.9',
  'cc': '',
  'child_tables': 'NONE',
  'des': 'Ball',
  'des_es': 'Bola mala',
  'end_speed': '84.4',
  'event_num': '3',
  'id': '3',
  'key': 0,
  'mt': '',
  'nasty': '53',
  'node_type': 'pitch',
  'parent_key': 0,
  'parent_table': 'atbat',
  'pfx_x': '-5.39',
  'pfx_z': '9.56',
  'pitch_type': 'FF',
  'play_guid': 'fd7c34b2-23bc-4e0c-a29c-fc0e99c1c3fb',
  'px': '-1.713',
  'pz': '3.201',
  'spin_dir': '209.325',
  'spin_rate': '2180.331',
  'start_speed': '90.2',
  'sv_id': '160507_174232',
  'sz_bot': '1.54',
  'sz_top': '3.12',
  'text': '',
  'tfs': '004103',
  'tfs_zulu': '2016-05-08T00:41:03Z',
  'type': 'B',
  'type_confidence': '.847',
  'vx0': '2.058',
  'vy0': '-132.187',
  'vz0': '-4.404',
  'x': '182.3',
  'x0': '-1.793',
  'y': '152.35',
  'y0': '50.0',
  'z0': '5.949',
  'zone': '11'},
 {'ax': '-12.526',
  'ay': '20.631',
  'az': '-25.054',
  

In [29]:
def xml_to_table(self, root_node):

    def get_key(node):
        if node in indexes:
            return(next(indexes[node]))
        else:
            indexes[node] = count()
            return(next(indexes[node]))
    
    def recurse_xml(node, referer=None, referer_key=None):
        key = get_key(node.tag)
        subtables = set([child.tag for child in node.getchildren()]) or 'NONE'
        attribs = dict()
        text = ''

        if bool(node.attrib):
            attribs = node.attrib
        if node.text:
            text = node.text

        for child in node.getchildren():
            recurse_xml(child, node.tag, key)
            
        target.append({
                'gid': self.gid,
                'node_type': node.tag,
                'key': key,
                'child_tables': subtables,
                'parent_table': referer, 
                'parent_key': referer_key,
                'text': text.strip(),
                **attribs
            })
    
    indexes = dict()
    target = list()
    recurse_xml(root_node)
    return(target)

In [30]:
res = xml_to_table(mytree)