
This notebook contains the code used to scrape batting statistics from the "Standard" tab of the Fangraphs.com MLB leaderboards.  
It relies heavily on Selenium to drive a Chrome browser.


In [3]:
import os
import pickle
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
pd.set_option("display.max_columns",999)

In [4]:
#Opening Chromedriver via Selenium
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to one machine; the
# original hard-coded path remains the fallback, so existing setups behave
# exactly as before.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/brian_newborn/Downloads/chromedriver",  # path to the local chromedriver executable
)
# NOTE(review): this environment entry looks like a carry-over from the Java
# bindings; the path handed to webdriver.Chrome below is presumably what
# actually selects the driver - confirm before removing.
os.environ["webdriver.chrome.driver"] = chromedriver
# NOTE(review): the executable path is passed positionally, as in the original;
# newer Selenium releases may require a Service object instead - confirm
# against the installed version.
driver = webdriver.Chrome(chromedriver)

In [5]:
#2010-2017 Batting Stats, Standard
# The query string is split over several adjacent literals purely for
# readability; they concatenate to one unbroken URL.
url = (
    "https://www.fangraphs.com/leaders.aspx"
    "?pos=all&stats=bat&lg=all&qual=100&type=0"
    "&season=2017&month=0&season1=2010&ind=1"
    "&team=0&rost=0&age=0&filter=&players=0"
)
driver.get(url)

# Module-level accumulators filled by the scraping helpers.
# Re-running this cell resets both of them to empty.

#col_names_list: column names to use as headers in the final dataframe
col_names_list = list()

#master_players: player dictionary keyed by "<name> <year>" - rows of the dataframe
master_players = dict()

In [6]:
def one_section_only():
    '''
    Advance the current Fangraphs section (Standard, in this example) by one page.

    Looks up the "Next Page" link on the page held by the module-level
    ``driver`` and sends it the ENTER key. Each call moves forward a single
    page; the caller is expected to call it repeatedly.

    Returns:
        bool: True if the link was found and the key press was sent
        ("Next Page" is printed); False if either step failed, which the
        caller treats as the end of the section ("End of Section" is printed).
    '''
    try:
        next_pg = driver.find_element(By.XPATH, '//a[@title="Next Page"]')
        # Keys.ENTER is the named constant for the raw u'\ue007' key code.
        next_pg.send_keys(Keys.ENTER)
    except Exception:
        # Still best-effort: any lookup/interaction failure means "no more
        # pages". Catching Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate, so interrupting the
        # kernel is no longer mistaken for the end of the section.
        print("End of Section")
        return False
    print("Next Page")
    return True

def col_headers():
    '''
    Collect the stat column names from the header cells of the current page.

    Reads every element with class "rgHeader" from the module-level
    ``driver``, skips the first four, and stores the text of the rest in the
    module-level ``col_names_list``. Each stored name is also printed.

    The list is rebuilt in place on every call, so calling this function more
    than once no longer duplicates the header names.

    Returns:
        list: ``col_names_list`` itself (the same object, not a copy).
    '''
    #Grab Headers
    identifier_header = "rgHeader"
    header_cells = driver.find_elements(By.CLASS_NAME, identifier_header)

    # The first four header cells are skipped - presumably the row number,
    # season, name and team columns, matching the four leading values that are
    # dropped from the dataframe later; verify against the page if it changes.
    # Slice-assignment keeps the identity of the global list while replacing
    # its contents.
    col_names_list[:] = [cell.text for cell in header_cells[4:]]
    for header in col_names_list:
        print(header)
    return col_names_list

def scrape_players():
    '''
    Read every player row on the current page into ``master_players``.

    Rows are the elements with class "rgAltRow" followed by those with class
    "rgRow". For each row the texts of its "grid_line_regular" cells are
    stored under the key "<cell 2 text> <cell 1 text>" (name + year).

    NOTE: if a key is already present (the same player-season is scraped a
    second time), the new values are appended to the existing list rather
    than replacing it, so that row ends up with an extra copy of every field.
    A later cell drops the resulting surplus dataframe columns, so this
    accumulation is kept as is.

    Rows with fewer than three cells cannot supply a name and year and are
    skipped instead of raising IndexError.
    '''
    #Grab Players - 2 Types of HTML rows
    identifier1,identifier2 = "rgRow","rgAltRow"

    rows = (driver.find_elements(By.CLASS_NAME, identifier2)
            + driver.find_elements(By.CLASS_NAME, identifier1))

    for row in rows:
        cells = row.find_elements(By.CLASS_NAME, "grid_line_regular") #td elements-need each element
        if len(cells) < 3:
            # No name/year cells available - not a usable player row.
            continue
        name = cells[2].text + ' ' + cells[1].text #name + year
        values = [td.text for td in cells]
        # One dictionary operation per row instead of a membership test per cell.
        master_players.setdefault(name, []).extend(values)


In [7]:
# Scrape the header names; col_headers() also prints each name it collects
columns = col_headers()

G
AB
PA
H
1B
2B
3B
HR
R
RBI
BB
IBB
SO
HBP
SF
SH
GDP
SB
CS
AVG


In [8]:
# Show the list returned by col_headers()
columns

['G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG']

In [9]:
#Scrape the page that is already loaded
scrape_players()

#Then page forward, scraping after every attempt, until one_section_only returns False
keep_running = True #flag to keep loop going
while keep_running:
    keep_running = one_section_only()
    # NOTE(review): this call also runs after the final, failed paging
    # attempt, so the last page is read a second time.
    scrape_players()

Next Page
Next Page


KeyboardInterrupt: 

In [69]:
# Inspect the scraped rows: "<name> <year>" -> list of cell texts
master_players

{'Adrian Gonzalez 2011': ['15',
  '2011',
  'Adrian Gonzalez',
  'Red Sox',
  '159',
  '630',
  '715',
  '213',
  '138',
  '45',
  '3',
  '27',
  '108',
  '117',
  '74',
  '20',
  '119',
  '6',
  '5',
  '0',
  '28',
  '1',
  '0',
  '.338'],
 'Buster Posey 2012': ['22',
  '2012',
  'Buster Posey',
  'Giants',
  '148',
  '530',
  '610',
  '178',
  '114',
  '39',
  '1',
  '24',
  '78',
  '103',
  '69',
  '7',
  '96',
  '2',
  '9',
  '0',
  '19',
  '1',
  '1',
  '.336'],
 'Carlos Gonzalez 2010': ['23',
  '2010',
  'Carlos Gonzalez',
  'Rockies',
  '145',
  '587',
  '636',
  '197',
  '120',
  '34',
  '9',
  '34',
  '111',
  '117',
  '40',
  '8',
  '135',
  '2',
  '7',
  '0',
  '9',
  '26',
  '8',
  '.336'],
 'Corey Seager 2015': ['21',
  '2015',
  'Corey Seager',
  'Dodgers',
  '27',
  '98',
  '113',
  '33',
  '20',
  '8',
  '1',
  '4',
  '17',
  '17',
  '14',
  '1',
  '19',
  '1',
  '0',
  '0',
  '2',
  '2',
  '0',
  '.337'],
 'DJ LeMahieu 2016': ['2',
  '2016',
  'DJ LeMahieu',
  'Rockies

In [70]:
#Build the dataframe from master_players: one row per "<name> <year>" key,
#one column per position in that key's list of scraped values
df = pd.DataFrame.from_dict(master_players, orient="index")


In [91]:
# needed as later data brought in duplicate points. Easy to just drop columns 24-47
# errors="ignore" keeps this cell from raising KeyError when those columns are
# absent - e.g. a scrape that produced no duplicate values, or a re-run of
# this cell after the columns were already removed.
df = df.drop(labels=range(24, 48), axis=1, errors="ignore")

In [79]:
#Drop 0-3 as well - unneeded data
# errors="ignore" makes the cell safe to re-run: once columns 0-3 are gone
# (or the columns have been renamed) the drop becomes a no-op instead of
# raising KeyError.
df = df.drop(labels=[0, 1, 2, 3], axis=1, errors="ignore")

In [82]:
# Preview five randomly chosen rows
df.sample(5)

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
Michael Cuddyer 2013,130,489,540,162,108,31,3,20,74,84,46,5,100,2,3,0,13,10,3,0.331
Jose Altuve 2014,158,660,707,225,168,47,3,7,85,59,36,7,53,5,5,1,20,56,9,0.341
Melky Cabrera 2012,113,459,501,159,113,25,10,11,84,60,36,4,63,0,5,1,8,13,5,0.346
Daniel Murphy 2016,142,531,582,184,107,47,5,25,88,104,35,10,57,8,8,0,4,5,3,0.347
Buster Posey 2012,148,530,610,178,114,39,1,24,78,103,69,7,96,2,9,0,19,1,1,0.336


In [83]:
#Set column names to names scraped
# The number of scraped header names has to match the number of remaining
# columns (20 of each in the recorded run).
df.columns = col_names_list

In [92]:
# Look up a single player-season by its "<name> <year>" index label
df.loc["Miguel Cabrera 2013"]

G       148
AB      555
PA      652
H       193
1B      122
2B       26
3B        1
HR       44
R       103
RBI     137
BB       90
IBB      19
SO       94
HBP       5
SF        2
SH        0
GDP      19
SB        3
CS        0
AVG    .348
Name: Miguel Cabrera 2013, dtype: object

In [90]:
#Save as pickle
# pickle is imported with the other modules in the import cell at the top of
# the notebook rather than here, so all dependencies are visible in one place.
with open('df_standard.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)