In [195]:
from collections import OrderedDict
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import re
import requests
import os

# scrape entire team batting table

In [177]:
class html_table_scraper:
    '''Given a baseball-reference.com url -- scrapes "TEAM BATTING" html table for any season/team'''
    
    def scrape_br_html_table(self,url):
        '''Scrapes "TEAM BATTING" html table into pandas data frame'''
        self.url = url

        #create bs object
        r = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(r, "lxml")

        #find table, find header, find rows
        table = soup.find('div', attrs={'class': 'overthrow table_container'})
        table_head = table.find('thead')
        table_body = table.find('tbody')

        #create table header list
        header = []    
        for th in table_head.findAll('th'):
            key = th.get_text()
            header.append(key)

        #find number of 'empty' rows
        endrows = 0
        for tr in table.findAll('tr'):
            if tr.findAll('th')[0].get_text() in (''):
                endrows += 1

        #find number of rows in table
        rows = len(table.findAll('tr'))
        rows -= endrows + 1  

        #create lists of row data, create ordered dictionary from header and row data...
        #...create list of dictionaries for data frame
        list_of_dicts = []
        for row in range(rows):
            the_row = []
            try:
                table_row = table.findAll('tr')[row]
                for tr in table_row:
                    value = tr.get_text()
                    the_row.append(value)
                od = OrderedDict(zip(header,the_row))
                list_of_dicts.append(od)
            except AttributeError:
                continue 

        #create df
        df = pd.DataFrame(list_of_dicts)

        #change column names to all uppercase for easy searching 
        df.columns = [col.upper() for col in df.columns]

        #strip all bad characters from NAME column to allow searching by name
        df['NAME'] = df['NAME'].str.replace(r'\(([^()]+)\)', '')
        df['NAME'] = df['NAME'].str.replace('*', '')
        df['NAME'] = df['NAME'].str.replace('#', '')

        #set index so it's easier to search
        df1 = df.set_index(keys='NAME')
        return df1

    def search_table(self,url):
        '''given a scraped baseball_reference html table -- searches table for name and statistic of a specific player.
            error handling prompts user to re-enter data if not found in data frame'''
        self.url = url

        df2= self.scrape_br_html_table(url)
        while True:
            try:
                #prompt user for input
                player = input('Enter a player Name:')
                stat = input('Enter a Statistic:') #MUST BE ALL UPPERCASE LETTERS
                #search df
                df3 = df2.loc[player][stat]
                #format output
                d={'Name': player, stat: df3}
                return d
            except KeyError as e:
                print('Error:', e, 'Check spelling')

                pass
            else: 
                break 
table_scraper = html_table_scraper() #instantiate class

In [178]:
url = 'http://www.baseball-reference.com/teams/BOS/2017.shtml'
table_scraper.scrape_br_html_table(url)

Unnamed: 0_level_0,RK,POS,AGE,G,PA,AB,R,H,2B,3B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sandy Leon,1,C,28,21,76,74.0,9.0,17.0,3.0,0.0,...,0.25,0.432,0.682,81.0,32.0,2.0,0.0,0.0,0.0,1.0
Mitch Moreland,2,1B,31,35,145,124.0,17.0,33.0,15.0,0.0,...,0.372,0.435,0.808,119.0,54.0,5.0,1.0,0.0,0.0,3.0
Dustin Pedroia,3,2B,33,31,138,119.0,11.0,34.0,6.0,0.0,...,0.365,0.387,0.752,105.0,46.0,3.0,0.0,1.0,2.0,1.0
Xander Bogaerts,4,SS,24,31,131,120.0,18.0,42.0,7.0,2.0,...,0.405,0.442,0.846,130.0,53.0,3.0,2.0,0.0,0.0,0.0
Pablo Sandoval,5,3B,30,17,67,61.0,7.0,13.0,1.0,0.0,...,0.269,0.377,0.646,73.0,23.0,3.0,0.0,0.0,1.0,0.0
Andrew Benintendi,6,LF,22,34,154,136.0,21.0,42.0,7.0,0.0,...,0.377,0.471,0.847,129.0,64.0,6.0,2.0,0.0,2.0,1.0
Jackie Bradley,7,CF,27,20,77,70.0,5.0,12.0,0.0,1.0,...,0.234,0.286,0.519,40.0,20.0,3.0,0.0,0.0,1.0,0.0
Mookie Betts,8,RF,24,32,143,127.0,18.0,38.0,12.0,0.0,...,0.378,0.512,0.889,139.0,65.0,4.0,1.0,0.0,0.0,1.0
Hanley Ramirez,9,DH,33,28,119,105.0,15.0,27.0,3.0,0.0,...,0.345,0.429,0.773,109.0,45.0,5.0,0.0,0.0,0.0,0.0
Chris Young,10,LF,33,26,90,79.0,13.0,21.0,6.0,0.0,...,0.356,0.418,0.773,109.0,33.0,1.0,0.0,0.0,0.0,0.0


# Scrape team abbreviations

In [6]:
#use this function to create a list of team names
def format_team_names():
    '''returns a list of team name abbreviations used in baseball reference tables'''
    
    names_link = 'http://www.baseball-reference.com/leagues/MLB/2016.shtml'
    df = pd.read_html(names_link, flavor='html5lib', attrs={'class': 'sortable'})

    df_names = df[0]

    teams= df_names['Tm'].tolist()
    del teams[-2:]
    team_names = ['/' + i  for i in teams]
    
    return team_names

In [23]:
team_names = format_team_names()
team_names

['/ARI',
 '/ATL',
 '/BAL',
 '/BOS',
 '/CHC',
 '/CHW',
 '/CIN',
 '/CLE',
 '/COL',
 '/DET',
 '/HOU',
 '/KCR',
 '/LAA',
 '/LAD',
 '/MIA',
 '/MIL',
 '/MIN',
 '/NYM',
 '/NYY',
 '/OAK',
 '/PHI',
 '/PIT',
 '/SDP',
 '/SEA',
 '/SFG',
 '/STL',
 '/TBR',
 '/TEX',
 '/TOR',
 '/WSN']

# Save csvs of team batting tables for specific team-year combinations

In [193]:
def create_links(stem, team_names, years, extension):
    
    '''creates links to be fed to bs4
    Args: str(steem)
          [team_name]
          [year]
          str(extension)'''

    links_lst = []
    for y in years:
        for n in team_names:
            links_lst.append(stem + n + y + extension)
            
    length_list  =len(links_lst)
    count = 0

    while count < length_list:
    
        df = table_scraper.scrape_br_html_table(links_lst[count])
        file_name = links_lst[count][40:48].replace('/', '_')
        df.to_csv('batting_' + file_name + '.csv')
        count += 1
        print('Saved csv for team_year - ' + file_name)

In [194]:
#create specific team-year combinations
stem = 'http://www.baseball-reference.com/teams'
years_lst = ['/2012', '/2013', '/2014', '/2015', '/2016', '/2017']
years_lst2 = ['/2016']
ext = '.shtml'
team_links = create_links(stem, team_names, years_lst2, ext)
team_links

Saved csv for team_year - ARI_2016
Saved csv for team_year - ATL_2016
Saved csv for team_year - BAL_2016
Saved csv for team_year - BOS_2016
Saved csv for team_year - CHC_2016
Saved csv for team_year - CHW_2016
Saved csv for team_year - CIN_2016
Saved csv for team_year - CLE_2016
Saved csv for team_year - COL_2016
Saved csv for team_year - DET_2016
Saved csv for team_year - HOU_2016
Saved csv for team_year - KCR_2016
Saved csv for team_year - LAA_2016
Saved csv for team_year - LAD_2016
Saved csv for team_year - MIA_2016
Saved csv for team_year - MIL_2016
Saved csv for team_year - MIN_2016
Saved csv for team_year - NYM_2016
Saved csv for team_year - NYY_2016
Saved csv for team_year - OAK_2016
Saved csv for team_year - PHI_2016
Saved csv for team_year - PIT_2016
Saved csv for team_year - SDP_2016
Saved csv for team_year - SEA_2016
Saved csv for team_year - SFG_2016
Saved csv for team_year - STL_2016
Saved csv for team_year - TBR_2016
Saved csv for team_year - TEX_2016
Saved csv for team_y

# before running all_player_names() download ALL BATTING TABLES for 1 season so there's  a list of players names to extract from

In [141]:
def all_player_names(fp):
    '''given a file path that has ALL batting stats for a specific team, 
       concatenates all csvs into a giant data frame '''
    
    #combine all csvs into 1 giant dataframe
    p = fp
    df_lst = []
    for r,d,f in os.walk(p):
        for n in f:
            fp = os.path.join(r,n)
            df1 = pd.read_csv(fp)
            df_lst.append(df1)

    stats_all = pd.concat(df_lst)
    
    all_player_names = stats_all['NAME'].tolist()
    sorted_names = sorted(all_player_names, key=lambda x: x.split(" ")[-1])
    return sorted_names

In [142]:
file_path = '/Users/Anthony/Desktop/python_projects/twitter/baseball/2016'
player_names = all_player_names(file_path)
player_names

['Fernando Abad',
 'Fernando Abad',
 'Jose Abreu',
 'A.J. Achter',
 'Dustin Ackley',
 'Cristhian Adames',
 'Austin Adams',
 'Matt Adams',
 'Tim Adleman',
 'Ehire Adrianza',
 'Jesus Aguilar',
 'Matt Albers',
 'Andrew Albers',
 'Hanser Alberto',
 'Al Alburquerque',
 'Arismendy Alcantara',
 'Raul Alcantara',
 'Scott Alexander',
 'Jorge Alfaro',
 'Cody Allen',
 'Abraham Almonte',
 'Albert Almora',
 'Yonder Alonso',
 'Dan Altavilla',
 'Aaron Altherr',
 'Jose Altuve',
 'Dario Alvarez',
 'Pedro Alvarez',
 'Dariel Alvarez',
 'Jose Alvarez',
 'Dario Alvarez',
 'Alexi Amarista',
 'Tim Anderson',
 'Cody Anderson',
 'Tyler Anderson',
 'Brett Anderson',
 'Chase Anderson',
 'Robert Andino',
 'Matt Andriese',
 'Elvis Andrus',
 'Dustin Antolin',
 'Norichika Aoki',
 'Jayson Aquino',
 'Elvis Araujo',
 'Chris Archer',
 'Oswaldo Arcia',
 'Orlando Arcia',
 'Oswaldo Arcia',
 'Oswaldo Arcia',
 'Oswaldo Arcia',
 'Nolan Arenado',
 'Shawn Armstrong',
 'Jonathan Aro',
 'Jake Arrieta',
 'Cody Asche',
 'Alec Asher

# filter players using master_list and first letter of last name 

In [182]:
def get_names_from_master_list(master_list, letter):
    '''get a list of players by first letter of last name'''

    new_lst = []
    for i in master_list:
        split = i.split(" ")
        if split[1].startswith(letter):
            new_lst.append(i)
    return new_lst

In [144]:
a_players = get_names_from_master_list(player_names, 'A')
a_players

['Fernando Abad',
 'Fernando Abad',
 'Jose Abreu',
 'A.J. Achter',
 'Dustin Ackley',
 'Cristhian Adames',
 'Austin Adams',
 'Matt Adams',
 'Tim Adleman',
 'Ehire Adrianza',
 'Jesus Aguilar',
 'Matt Albers',
 'Andrew Albers',
 'Hanser Alberto',
 'Al Alburquerque',
 'Arismendy Alcantara',
 'Raul Alcantara',
 'Scott Alexander',
 'Jorge Alfaro',
 'Cody Allen',
 'Abraham Almonte',
 'Albert Almora',
 'Yonder Alonso',
 'Dan Altavilla',
 'Aaron Altherr',
 'Jose Altuve',
 'Dario Alvarez',
 'Pedro Alvarez',
 'Dariel Alvarez',
 'Jose Alvarez',
 'Dario Alvarez',
 'Alexi Amarista',
 'Tim Anderson',
 'Cody Anderson',
 'Tyler Anderson',
 'Brett Anderson',
 'Chase Anderson',
 'Robert Andino',
 'Matt Andriese',
 'Elvis Andrus',
 'Dustin Antolin',
 'Norichika Aoki',
 'Jayson Aquino',
 'Elvis Araujo',
 'Chris Archer',
 'Oswaldo Arcia',
 'Orlando Arcia',
 'Oswaldo Arcia',
 'Oswaldo Arcia',
 'Oswaldo Arcia',
 'Nolan Arenado',
 'Shawn Armstrong',
 'Jonathan Aro',
 'Jake Arrieta',
 'Cody Asche',
 'Alec Asher

# create a dictionary of dictionaries from html tags, read into data frame, save to csv

In [196]:
def scrape_salary_table(player_url, save_csv=None):
    '''scrapes salary table as multi-index data frame, optional arg to save as csv'''
    
    
    #the table we cant is in comment section 
    page = requests.get(player_url).text
    table_code = page[page.find('<table class="sortable stats_table" id="br-salaries"'):]
    soup = BeautifulSoup(table_code, 'lxml')
    
    #second bs4 soup
    r = urllib.request.urlopen(player_url).read()
    normal_soup = BeautifulSoup(r)

    table_body  = soup.find('tbody')
    
    for i in normal_soup.find_all('h1'):
        for j in i:
            player_name = i.get_text()
            
    #this line is for salary
    sal = [j.get_text() for i in table_body.findAll('tr') for j in i.findAll('td') if j['data-stat'] == 'salary']
    salary_lst = [i.replace('$', '').replace('*', '') for i in sal]
    
    
    #this block is for years              
    years = table_body.findAll('th')                          
    years_lst = [i.get_text() for i in years]
    del years_lst[-1]
    
    #create a dictionary of dictionaries
    dd9 = {}
    dd9[player_name] = {}
    dd9[player_name]['years'] = years_lst
    dd9[player_name]['salary'] = salary_lst

    split = pd.DataFrame.from_dict(dd9, orient = 'index')
    
    #explodes a list into rows
    years_col = split.years.apply(lambda x: pd.Series(x)).unstack()
    salary_col= split.salary.apply(lambda x: pd.Series(x)).unstack()

    #concat one series to df then add other series to existing df
    df = years_col.to_frame()
    df['salary'] = salary_col
    df2 = df.reset_index()
    df3= df2.rename(columns = {0: 'years', 'level_1': 'name'})

    del df3['level_0']

    df4 = df3.set_index(keys = ['name', 'years'])
    
    if save_csv:
        df4.to_csv(player_name  +'.csv')
    return df4

In [190]:
url = 'http://www.baseball-reference.com/players/a/adamsma01.shtml'

scrape_salary_table(url, save_csv = True)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Unnamed: 0_level_0,Unnamed: 1_level_0,salary
name,years,Unnamed: 2_level_1
Matt Adams,2013,490000
Matt Adams,2014,516000
Matt Adams,2015,534000
Matt Adams,2016,1650000
Matt Adams,2017,2800000


# given a list of last names beginning with the same letter, compares it against appropriate br link to look for matches and scrape salary tables

In [198]:
def csv_by_letter(lookup_lastletter, players_lst, save = None):
    '''both args must be the same letter'''
    
    #compare player names to a href tags to see if a link exists for them 
    stem_players = 'http://www.baseball-reference.com'

    url = 'http://www.baseball-reference.com/a/'

    r = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(r)

    a_player_links = []
    for i in soup.find_all('a'):
        for name in players_lst:
            if name in i.text:
                filtered_links = stem_players + i['href']
                df = scrape_salary_table(filtered_links, save_csv = save)
                print(df)

In [200]:
uurl = 'http://www.baseball-reference.com/a/'
csv_by_letter(uurl, a_players[0:5], save = True)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


                        salary
name          years           
Fernando Abad                 
              2011     418,000
              2012     485,000
              2014     525,900
              2015   1,087,500
              2016   1,250,000
              2017   2,000,000
                        salary
name          years           
Fernando Abad                 
              2011     418,000
              2012     485,000
              2014     525,900
              2015   1,087,500
              2016   1,250,000
              2017   2,000,000
                        salary
name          years           
Fernando Abad                 
              2011     418,000
              2012     485,000
              2014     525,900
              2015   1,087,500
              2016   1,250,000
              2017   2,000,000
                        salary
name          years           
Fernando Abad                 
              2011     418,000
              2012     485,000
        

AttributeError: 'NoneType' object has no attribute 'findAll'