In [1]:
from bokeh.charts import Bar, output_notebook, show
from collections import OrderedDict
from bs4 import BeautifulSoup
from glob import glob
import urllib.request
import pandas as pd
import re
import os

In [2]:
def create_soup(url):
    '''creates parsable bs4 object'''
    r = urllib.request.urlopen(url).read()
    return  BeautifulSoup(r, "lxml")

def get_team_names(year_as_str):
    '''returns a list of team name abbreviations used in baseball reference tables '''

    base_url = 'http://www.baseball-reference.com/leagues/MLB/'
    ext = '.shtml'

    full_url = base_url + year_as_str + ext
    print('Scraping team names from >>', full_url)
    
    df = pd.read_html(full_url, flavor='html5lib', attrs={'class': 'sortable'})
    df_team_names = df[0]
    teams= df_team_names['Tm'].tolist()
    del teams[-2:]
    return ['/' + i  for i in teams] 

def set_batting_dtypes_float(df, col):
    '''takes a pd column and converts it to a float returns new df'''
    
    float_col = df[col].tolist()
    new_float_col = ['0' + i for i in float_col]
    df[col] = new_float_col
    df[col]= df[col].astype(float)
    return df

def set_batting_dtypes(df, col):
    '''index must be reset before passing '''

    objects = [i for i in df.columns if i.startswith(('NAME', 'TEAM_NAME', 'POS'))]
    floats = [i for i in df.columns if i.startswith(('OBP', 'SLG', 'OPS', 'BA'))]

    no_ints = objects + floats
    ints = [i for i in df.columns if i not in no_ints]

    df[ints] = df[ints].astype(int)
    df1 = set_batting_dtypes_float(df, col)
    return df1

def scrape_batting_tables(url):
    #'''Scrapes "TEAM BATTING" html table into pandas data frame'''

    soup = create_soup(url)
    print('Scraping from >> ', url)

    team_abbrev = re.split(r'[/.]', url)[6]
    year_url = re.split(r'[/.]', url)[-2]

    table = soup.find('div', attrs={'class': 'overthrow table_container'})
    table_head = table.find('thead')

    header = []    
    for th in table_head.findAll('th'):
        key = th.get_text()
        header.append(key)

    endrows = 0
    for tr in table.findAll('tr'):
        if tr.findAll('th')[0].get_text() in (''):
            endrows += 1

    rows = len(table.findAll('tr'))
    rows -= endrows + 1  

    list_of_dicts = []
    for row in range(rows):
        the_row = []
        try:
            table_row = table.findAll('tr')[row]
            for tr in table_row:
                value = tr.get_text()
                the_row.append(value)
            od = OrderedDict(zip(header,the_row))
            list_of_dicts.append(od)
        except AttributeError:
            continue 

    df = pd.DataFrame(list_of_dicts)
    df.columns = [col.upper() for col in df.columns]

    df['TEAM_NAME'] = team_abbrev
    df['YEAR'] = year_url

    df['NAME'] = df['NAME'].str.replace(r'\(([^()]+)\)', '')
    df['NAME'] = df['NAME'].str.replace('*', '')
    df['NAME'] = df['NAME'].str.replace('#', '')

    df1 = set_batting_dtypes(df, 'BA')
    df2 = set_batting_dtypes(df1, 'OPS')
    df3 = set_batting_dtypes(df2, 'OBP')
    df4 = set_batting_dtypes(df3, 'SLG')
    return df4

def get_batting_tables(teams, years):
    ''' concats list of urls to scrape & saves to csv  '''
    
    base_url = 'http://www.baseball-reference.com/teams'
    ext = '.shtml'

    links_lst = [base_url + n + y + ext for y in years for n in teams]
    print('Saving in ', os.getcwd(), '\n')
    
    count = 0
    df_lst = []
    while count < len(links_lst):
        df = scrape_batting_tables(links_lst[count])
        df_lst.append(df)
        file_name = links_lst[count][40:48].replace('/', '_')
        df.to_csv('batting_' + file_name + '.csv')
        count += 1
        print('Saved csv for team_year - ' + file_name, '\n')
    return pd.concat(df_lst)

# Scrape 1 batting statistics table

In [3]:
url = 'https://www.baseball-reference.com/teams/NYM/2017.shtml'
df = scrape_batting_tables(url)
df.head()

Scraping from >>  https://www.baseball-reference.com/teams/NYM/2017.shtml


Unnamed: 0,RK,POS,NAME,AGE,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,TEAM_NAME,YEAR
0,1,C,Travis d'Arnaud,28,103,351,325,36,78,18,...,0.709,85,137,12,2,0,3,3,NYM,2017
1,2,1B,Lucas Duda,31,75,291,252,30,62,21,...,0.879,129,134,6,2,0,0,4,NYM,2017
2,3,2B,Neil Walker,31,73,299,265,40,70,13,...,0.78,105,117,4,4,1,2,1,NYM,2017
3,4,SS,Jose Reyes,34,135,520,463,71,113,23,...,0.731,92,192,3,2,4,3,1,NYM,2017
4,5,3B,Wilmer Flores,25,110,362,336,42,91,17,...,0.795,106,164,14,3,0,6,1,NYM,2017


# Scrape a list of team names used to concat urls

In [4]:
teams = get_team_names('2017')
teams[0:10]

Scraping team names from >> http://www.baseball-reference.com/leagues/MLB/2017.shtml


['/ARI',
 '/ATL',
 '/BAL',
 '/BOS',
 '/CHC',
 '/CHW',
 '/CIN',
 '/CLE',
 '/COL',
 '/DET']

# Scrape batting statistics table for any team/year combination 

In [5]:
base_url = 'http://www.baseball-reference.com/teams'
years_lst = ['/2012']


get_batting_tables(teams[0:3], years_lst).head()

Saving in  /Users/Anthony/Desktop/python_projects_clean/br_scraping 

Scraping from >>  http://www.baseball-reference.com/teams/ARI/2012.shtml
Saved csv for team_year - ARI_2012 

Scraping from >>  http://www.baseball-reference.com/teams/ATL/2012.shtml
Saved csv for team_year - ATL_2012 

Scraping from >>  http://www.baseball-reference.com/teams/BAL/2012.shtml
Saved csv for team_year - BAL_2012 



Unnamed: 0,RK,POS,NAME,AGE,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,TEAM_NAME,YEAR
0,1,C,Miguel Montero,28,141,573,486,65,139,25,...,0.829,123,213,15,12,0,2,6,ARI,2012
1,2,1B,Paul Goldschmidt,24,145,587,514,82,147,43,...,0.85,126,252,9,4,0,9,4,ARI,2012
2,3,2B,Aaron Hill,30,156,668,609,93,184,44,...,0.882,133,318,15,4,1,2,7,ARI,2012
3,4,SS,Willie Bloomquist,34,80,338,324,47,98,21,...,0.724,94,129,5,0,0,2,0,ARI,2012
4,5,3B,Ryan Roberts,31,83,280,252,28,63,9,...,0.663,78,90,10,0,2,4,1,ARI,2012


In [6]:
def sum_team_stats(df, col_to_sum, team_name):
    ''' team_sums up a specific statistic for every team and appends that value to a dictionary'''

    sum_col =  df.loc[df['TEAM_NAME'] == team_name, col_to_sum].sum()

    d = {}
    d[team_name + '_' + col_to_sum + '_total'] = sum_col
    return d

def sum_stat_by_team(df_path, year_int, stat,
                     get_plot_df=None, get_cols=None):

    '''year = 2016
       stat=2b
       filter_by = AB
       '''
    files_lst = glob(df_path + '/batting*csv')

    # append vertically 
    df_lst = [pd.read_csv(i) for i in files_lst]
    df = pd.concat(df_lst)

    # what columns are can we plot
    if get_cols:
        return df.columns

    # filter by year    
    df_2016 = df[df['YEAR'] == year_int]

    teamz = list(set(df_2016.TEAM_NAME.tolist()))    

    # this function returns a dictionary for the sum of a given stat for each team 
    team_sums = {}
    for i in teamz:
        dic = sum_team_stats(df_2016, stat, i)
        team_sums.update(dic)

    # create new df from new dictionary
    summed_df = pd.DataFrame.from_dict(team_sums, orient='index').reset_index()

    #create a new team names column
    idx_col = summed_df['index'].tolist()
    team_name_col = [i[:3] for i in idx_col]
    summed_df['TEAM_NAME'] = team_name_col
    df_to_plot = summed_df.rename(columns={0: stat})  

    # inspet df before plotting
    if get_plot_df:
        return df_to_plot
    #plot
    p = Bar(df_to_plot, label='TEAM_NAME', values=stat, agg='sum', 
            title=str(year_int) + ' ' + stat + ' Summed By Team', legend=False)
    return show(p)

In [7]:
output_notebook()

# Take sum of a statistic (that's not a %) for all players on a team & plot for a given year 

In [8]:
batting_csvs = '/Users/Anthony/Desktop/python_projects_clean/br_scraping/team_batting_2012-2017'
sum_stat_by_team(batting_csvs, 2016, 'HR',get_cols=True)

Index(['NAME', 'TEAM_NAME', 'YEAR', 'RK', 'POS', 'AGE', 'G', 'PA', 'AB', 'R',
       'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
       'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB'],
      dtype='object')

In [9]:
sum_stat_by_team(batting_csvs, 2016, 'HR')

In [10]:
# compare with http://www.espn.com/mlb/stats/team/_/stat/batting/year/2016/sort/homeRuns/order/true
sum_stat_by_team(batting_csvs, 2016, 'HR', get_plot_df=True).sort_values('HR', ascending=0).head()

Unnamed: 0,index,HR,TEAM_NAME
8,BAL_HR_total,253.0,BAL
0,STL_HR_total,225.0,STL
17,SEA_HR_total,223.0,SEA
24,TOR_HR_total,221.0,TOR
10,NYM_HR_total,218.0,NYM
