In [35]:
#!/usr/bin/env python
from collections import OrderedDict
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import re
import requests
import os

# scrapes any batting table

In [None]:
class html_table_scraper:
    '''Given a baseball-reference.com url -- scrapes "TEAM BATTING" html table for any season/team into a pandas data frame.
        Allows user to search pandas data frame for a specific player and statistic'''
    
    def scrape_br_html_table(self,url):
        '''Scrapes "TEAM BATTING" html table into pandas data frame'''
        self.url = url

        #create bs object
        r = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(r)

        #find table, find header, find rows
        table = soup.find('div', attrs={'class': 'overthrow table_container'})
        table_head = table.find('thead')
        table_body = table.find('tbody')

        #create table header list
        header = []    
        for th in table_head.findAll('th'):
            key = th.get_text()
            header.append(key)

        #find number of 'empty' rows
        endrows = 0
        for tr in table.findAll('tr'):
            if tr.findAll('th')[0].get_text() in (''):
                endrows += 1

        #find number of rows in table
        rows = len(table.findAll('tr'))
        rows -= endrows + 1  

        #create lists of row data, create ordered dictionary from header and row data...
        #...create list of dictionaries for data frame
        list_of_dicts = []
        for row in range(rows):
            the_row = []
            try:
                table_row = table.findAll('tr')[row]
                for tr in table_row:
                    value = tr.get_text()
                    the_row.append(value)
                od = OrderedDict(zip(header,the_row))
                list_of_dicts.append(od)
            except AttributeError:
                continue 

        #create df
        df = pd.DataFrame(list_of_dicts)

        #change column names to all uppercase for easy searching 
        df.columns = [col.upper() for col in df.columns]

        #strip all bad characters from NAME column to allow searching by name
        df['NAME'] = df['NAME'].str.replace(r'\(([^()]+)\)', '')
        df['NAME'] = df['NAME'].str.replace('*', '')
        df['NAME'] = df['NAME'].str.replace('#', '')

        #set index so it's easier to search
        df1 = df.set_index(keys='NAME')
        return df1

    def search_table(self,url):
        '''given a scraped baseball_reference html table -- searches table for name and statistic of a specific player.
            error handling prompts user to re-enter data if not found in data frame'''
        self.url = url

        df2= self.scrape_br_html_table(url)
        while True:
            try:
                #prompt user for input
                player = input('Enter a player Name:')
                stat = input('Enter a Statistic:') #MUST BE ALL UPPERCASE LETTERS
                #search df
                df3 = df2.loc[player][stat]
                #format output
                d={'Name': player, stat: df3}
                return d
            except KeyError as e:
                print('Error:', e, 'Check spelling')

                pass
            else: 
                break 
table_scraper = html_table_scraper() #instantiate clas

In [22]:
url = 'http://www.baseball-reference.com/teams/BOS/2017.shtml'
table_scraper.scrape_br_html_table(url)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Unnamed: 0_level_0,RK,POS,AGE,G,PA,AB,R,H,2B,3B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sandy Leon,1,C,28,21,76,74.0,9.0,17.0,3.0,0.0,...,0.25,0.432,0.682,81.0,32.0,2.0,0.0,0.0,0.0,1.0
Mitch Moreland,2,1B,31,35,145,124.0,17.0,33.0,15.0,0.0,...,0.372,0.435,0.808,119.0,54.0,5.0,1.0,0.0,0.0,3.0
Dustin Pedroia,3,2B,33,31,138,119.0,11.0,34.0,6.0,0.0,...,0.365,0.387,0.752,105.0,46.0,3.0,0.0,1.0,2.0,1.0
Xander Bogaerts,4,SS,24,31,131,120.0,18.0,42.0,7.0,2.0,...,0.405,0.442,0.846,130.0,53.0,3.0,2.0,0.0,0.0,0.0
Pablo Sandoval,5,3B,30,17,67,61.0,7.0,13.0,1.0,0.0,...,0.269,0.377,0.646,73.0,23.0,3.0,0.0,0.0,1.0,0.0
Andrew Benintendi,6,LF,22,34,154,136.0,21.0,42.0,7.0,0.0,...,0.377,0.471,0.847,129.0,64.0,6.0,2.0,0.0,2.0,1.0
Jackie Bradley,7,CF,27,20,77,70.0,5.0,12.0,0.0,1.0,...,0.234,0.286,0.519,40.0,20.0,3.0,0.0,0.0,1.0,0.0
Mookie Betts,8,RF,24,32,143,127.0,18.0,38.0,12.0,0.0,...,0.378,0.512,0.889,139.0,65.0,4.0,1.0,0.0,0.0,1.0
Hanley Ramirez,9,DH,33,28,119,105.0,15.0,27.0,3.0,0.0,...,0.345,0.429,0.773,109.0,45.0,5.0,0.0,0.0,0.0,0.0
Chris Young,10,LF,33,26,90,79.0,13.0,21.0,6.0,0.0,...,0.356,0.418,0.773,109.0,33.0,1.0,0.0,0.0,0.0,0.0


# Scrape team abbreviations

In [6]:
#use this function to create a list of team names
def format_team_names():
    
    '''returns a list of team name abbreviations used in baseball reference tables'''
    
    names_link = 'http://www.baseball-reference.com/leagues/MLB/2016.shtml'
    df = pd.read_html(names_link, flavor='html5lib', attrs={'class': 'sortable'})

    df_names = df[0]

    teams= df_names['Tm'].tolist()
    del teams[-2:]
    team_names = ['/' + i  for i in teams]
    
    return team_names

In [23]:
team_names = format_team_names()
team_names

['/ARI',
 '/ATL',
 '/BAL',
 '/BOS',
 '/CHC',
 '/CHW',
 '/CIN',
 '/CLE',
 '/COL',
 '/DET',
 '/HOU',
 '/KCR',
 '/LAA',
 '/LAD',
 '/MIA',
 '/MIL',
 '/MIN',
 '/NYM',
 '/NYY',
 '/OAK',
 '/PHI',
 '/PIT',
 '/SDP',
 '/SEA',
 '/SFG',
 '/STL',
 '/TBR',
 '/TEX',
 '/TOR',
 '/WSN']

# Create links for specific team-year combinations

In [24]:
#use this function to create links
def create_links(stem, team_names, years, extension):
    
    '''this function creates links to be iterated over
    str(steem)
       [team_name]
       [year]
       str(extension)'''

    links_lst = []
    for y in years:
        for n in team_names:
            links_lst.append(stem + n + y + extension)
            
    return links_lst

In [26]:
#create specific team-year combinations
stem = 'http://www.baseball-reference.com/teams'
years_lst = ['/2012', '/2013', '/2014', '/2015', '/2016', '/2017']
years_lst2 = ['/2016']
ext = '.shtml'
team_links = create_links(stem, team_names, years_lst2, ext)
team_links

['http://www.baseball-reference.com/teams/ARI/2016.shtml',
 'http://www.baseball-reference.com/teams/ATL/2016.shtml',
 'http://www.baseball-reference.com/teams/BAL/2016.shtml',
 'http://www.baseball-reference.com/teams/BOS/2016.shtml',
 'http://www.baseball-reference.com/teams/CHC/2016.shtml',
 'http://www.baseball-reference.com/teams/CHW/2016.shtml',
 'http://www.baseball-reference.com/teams/CIN/2016.shtml',
 'http://www.baseball-reference.com/teams/CLE/2016.shtml',
 'http://www.baseball-reference.com/teams/COL/2016.shtml',
 'http://www.baseball-reference.com/teams/DET/2016.shtml',
 'http://www.baseball-reference.com/teams/HOU/2016.shtml',
 'http://www.baseball-reference.com/teams/KCR/2016.shtml',
 'http://www.baseball-reference.com/teams/LAA/2016.shtml',
 'http://www.baseball-reference.com/teams/LAD/2016.shtml',
 'http://www.baseball-reference.com/teams/MIA/2016.shtml',
 'http://www.baseball-reference.com/teams/MIL/2016.shtml',
 'http://www.baseball-reference.com/teams/MIN/2016.shtml

# Using output of create_links() -- save links to csv in pwd

In [31]:
#use this function to create csvs
def html_to_csv(links_list):
    
    
    length_list  =len(links_list)
    count = 0

    while count < length_list:
    
        count += 1
        df = table_scraper.scrape_br_html_table(links_list[count])
        file_name = links_list[count][40:48].replace('/', '_')
        df.to_csv('batting_' + file_name + '.csv')
        print(links_list[count])

In [32]:
html_to_csv(team_links)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


http://www.baseball-reference.com/teams/ATL/2016.shtml
http://www.baseball-reference.com/teams/BAL/2016.shtml


KeyboardInterrupt: 

# before running all_player_names() make sure you downloaded ALL BATTING TABLES for 1 season so we have a comprehensive list of players

In [None]:
def all_player_names(fp):
    
    #combine all csvs into 1 giant dataframe
    p = fp
    df_lst = []
    for r,d,f in os.walk(p):
        for n in f:
            fp = os.path.join(r,n)
            df1 = pd.read_csv(fp)
            df_lst.append(df1)

    stats_all = pd.concat(df_lst)
    
    all_player_names = stats_all['NAME'].tolist()
    sorted_names = sorted(all_player_names, key=lambda x: x.split(" ")[-1])
    return sorted_names

In [56]:
file_path = '/Users/Anthony/Desktop/python_projects/twitter/baseball/2016'
player_names = all_player_names(file_path)
player_names

AttributeError: 'float' object has no attribute 'split'

In [42]:
def get_names_from_master_list(master_list, letter):

    new_lst = []
    for i in master_list:
        split = i.split(" ")
        if split[1].startswith(letter):
            new_lst.append(i)
    return new_lst

In [57]:
a_players = get_names_from_master_list(player_names, 'A')
a_players

['Dario Alvarez',
 'Pedro Alvarez',
 'Dariel Alvarez',
 'Jayson Aquino',
 'Erick Aybar']

# Everything below here is related to salary tables

## scrapes salary table and returns it as data frame

In [44]:
def scrape_salary_table(player_url):
    
    page = requests.get(player_url).text


    table_code = page[page.find('<table class="sortable stats_table" id="br-salaries"'):]
    soup = BeautifulSoup(table_code, 'lxml')

    table_body  = soup.find('tbody')
    
    #this block is for salary
    salary_lst = []
    for i in table_body.findAll('tr'):
        for j in i.findAll('td'):
            if j['data-stat'] == 'salary':
                sal = j.get_text()
                salary_lst.append(sal)
                
    #this block is for years              
    years = table_body.findAll('th')                          
    years_lst = []
    for i in years:
        ls = i.get_text()
        years_lst.append(ls)
    del years_lst[-1]
        
    df = pd.DataFrame(years_lst)
    df['salary'] = salary_lst
    df1 = df.reset_index()
    df2 = df1.rename(columns={0: 'years'})
    df3 = df2.set_index(keys = ['years'])
    return df3

In [45]:
l  = 'http://www.baseball-reference.com/players/a/abadfe01.shtml'
scrape_salary_table(l)

Unnamed: 0_level_0,index,salary
years,Unnamed: 1_level_1,Unnamed: 2_level_1
,0,
2011.0,1,"$418,000"
2012.0,2,"$485,000"
2014.0,3,"$525,900"
2015.0,4,"$1,087,500"
2016.0,5,"$1,250,000"
2017.0,6,"$2,000,000"


In [52]:
def salary_to_csv(lst_player_names, br_url_stem):
    
    stem_players = 'http://www.baseball-reference.com'

    br_url_stem = 'http://www.baseball-reference.com/a/'

    r = urllib.request.urlopen(br_url_stem).read()
    soup = BeautifulSoup(r)

    d = {}
    for i in soup.find_all('a'):
        for name in lst_player_names:
            if name in i.text:
                filtered_links = stem_players + i['href']
                #key = filtered_links[44:53]
                #d[key] = name
                df = scrape_salary_table(filtered_links)
                fnames = name.replace(" ", "_")
                df.to_csv(name + '.csv')
                print(name, 'salary csv copied')

In [53]:
br_url_stem = 'http://www.baseball-reference.com/a/'
salary_to_csv(a_players, br_url_stem )



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Pedro Alvarez
Erick Aybar
Pedro Alvarez
Erick Aybar


KeyboardInterrupt: 

In [None]:
import shutil