In [29]:
import requests
import xml.etree.ElementTree as et
from os import path
from bs4 import BeautifulSoup

In [33]:
ROOTPATH = '/home/ubuntu/baseballdata/'

def xml_recurse(table_name, node, gid, referer='NONE', depth=0):
    subtables = list(set([child.tag for child in node.getchildren()])) or ['NONE']
    attribs = list()
    text = ''
    
    if bool(node.attrib):
        attribs = list(node.attrib.values())
    if node.text:
        text = node.text
        
    for child in node.getchildren():
        xml_recurse(table_name, child, gid, node.tag, depth=depth+1)
    
    write_node(table_name, xmltree.tag, node.tag, gid, subtables, referer, attribs, text.strip())
            
def write_node(table_name, rootnode, tag, gid, subtables, parent, attributes, text):
    write_path = '{}.{}.{}.txt'.format(ROOTPATH + table_name, rootnode, tag)
    attributes = '\t'.join(attributes)
    for subtable in subtables:
        with open(write_path, 'a+') as tmp:
            tmp.write('\t'.join([gid, subtable, parent, attributes, text]) + '\n')

In [36]:
def get_links_in_game(path, depth=0):
    response = requests.get(path)
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a')[1:]:
        mylink = link.get('href')
        if mylink.split('.')[-1] in {'xml', 'plist'}:
            table_name = ''
            if depth == 0:
                table_name = (path[96:] + mylink).split('.')[0]
            elif depth == 1:
                table_name = (path[96:] + mylink).split('.')[0].split('/')[0]
            else:
                table_name = (path[96:] + mylink).split('.')[0].split('/')[1] + '_' + (path[96:] + mylink).split('.')[0].split('/')[3]                
            response = requests.get(path + mylink)
            xmltree = et.fromstring(response.text)
            gameid = path + mylink
            gameid = gameid.split('/')[9]
            xml_recurse(table_name, xmltree, gameid)
        else:
            get_links_in_game(path + mylink, depth+1)

In [37]:
get_links_in_game('http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_07/gid_2016_05_07_oakmlb_balmlb_1/')

NameError: name 'xmltree' is not defined

In [131]:
import datetime
from bs4 import BeautifulSoup

In [196]:
class MlbamGame:
    def __init__(self, gid):
        if self._validate_id(gid):
            self.gid = gid
            self.url = self._make_mlbam_base_path(gid)
        else:
            raise Exception('{} is not a valid gameid. it must be before today and of the form \"gid_YYYY_MM_DD_<away><league>_<home><league>_<gamenum>\"'.format(gid))

    @staticmethod
    def _validate_id(gid):
        if type(gid) != str:
            return False
        if len(gid) != 30:
            return False
        if datetime.datetime.strptime(gid[4:14], '%Y_%m_%d').date() > datetime.date.today():
            return False
        if datetime.datetime.strptime(gid[4:14], '%Y_%m_%d').date() < datetime.date(2008, 1, 1):
            return False
        return True
    
    @staticmethod
    def _make_mlbam_base_path(gid):
        url_template = 'http://gd2.mlb.com/components/game/mlb/year_{}/month_{}/day_{}/{}/'
        assert(gid[4:8].isnumeric())
        assert(gid[9:11].isnumeric())
        assert(gid[12:14].isnumeric())
        return(url_template.format(gid[4:8], gid[9:11], gid[12:14], gid))
    
    def get_links_in_game(self, exclude=None, include=None):
        if exclude is not None and type(exclude) is not list:
            exclude = [exclude]
        if include is not None and type(include) is not list:
            include = [include]

        if include is not None and exclude is not None:
            raise Exception('only exclude OR include allowed')

        def recurse_links(base_path):
            response = requests.get(base_path)
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a')[1:]:
                new_base_path = base_path + link.get('href')
                if new_base_path.split('.')[-1] in {'xml', 'plist'}:
                    if exclude is not None and not any([x in new_base_path for x in exclude]):
                        target.append(new_base_path)
                    elif include is not None and any([x in new_base_path for x in include]):
                        target.append(new_base_path)
                    elif exclude is None and include is None:
                        target.append(new_base_path)
                else:
                    recurse_links(new_base_path)

        target = list()
        recurse_links(self.url)
        return(target)

In [197]:
test = MlbamGame('gid_2016_05_10_clemlb_houmlb_1')
test.get_links_in_game(exclude=['batters'])

['http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/atv_game_events.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/atv_preview.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/atv_preview_noscores.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/atv_runScoringPlays.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/bench.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/benchO.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/bis_boxscore.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_10/gid_2016_05_10_clemlb_houmlb_1/boxscore.xml',
 'http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_

In [177]:
if None:
    print('x')

In [174]:
True == False

False