In [1]:
#Step 1 - Grab Free Agent Data from ESPN
#Imports
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [6]:
# Master frame that holds the free-agent rows of every scraped year.
# It starts out empty; the year loop further down fills it.
free_agents_master = pd.DataFrame()

## ESPN Free Agents - Script to grab multiple years of data

In [7]:
def espn_MLB_free_agents(year, timeout=30):
    """Scrape one year of ESPN's MLB free-agent table into a DataFrame.

    Requests the ``type/dollars`` view of ESPN's free-agent page for ``year``,
    parses it with BeautifulSoup (``html5lib`` parser) and collects every table
    row whose class matches ``...row player...`` (both 'oddrow' and 'evenrow').

    Parameters
    ----------
    year : int or str
        Year to request; interpolated into the URL and into ``PLAYER_YEAR``.
    timeout : float, optional
        Seconds to wait for ESPN before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per player row on the page, values kept as raw page text, with
        columns PLAYER, POS, AGE, STATUS, OLD_TEAM, NEW_TEAM, YRS, RK, DOLLARS
        and PLAYER_YEAR (``"<PLAYER> <year>"``). When the page has no player
        rows the frame is empty but still carries these columns.
        Players sharing a name within one year each keep their own row, so
        PLAYER_YEAR is not guaranteed to be unique.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or ESPN answers with an HTTP error
        status.
    """
    #URL
    url = (f"http://www.espn.com/mlb/freeagents/_/year/{year}/type/dollars")
    print(f"Checking url: {url}") #Checkpoint - letting us know we're progressing through

    #Beautiful Soup for web scraping - table is pure HTML
    response = requests.get(url, timeout=timeout)
    response.raise_for_status() #fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, "html5lib")

    #Players are both in class 'oddrow' and 'evenrow'
    #Using regex to grab both at once (raw string so \s is a regex escape)
    rowplayer_regex = re.compile(r'.*row\splayer.*')

    #One list per table row: [player name, remaining cell texts...]
    #A list (not a dict keyed by name) so namesakes don't overwrite each other
    records = []
    for row in soup.find_all(class_=rowplayer_regex): #go through all players on site
        cells = row.find_all('td') #find table data elements
        if not cells:
            continue
        #name is the first column; prefer the link text, fall back to the cell text
        link = cells[0].find('a')
        player = link.text if link is not None else cells[0].text
        records.append([player] + [cell.text for cell in cells[1:]])

    #Column names to match ESPN's, in table order
    column_names = ["PLAYER", "POS", "AGE", "STATUS", "OLD_TEAM",
                    "NEW_TEAM", "YRS", "RK", "DOLLARS"]
    if records:
        df = pd.DataFrame(records).rename(columns=dict(enumerate(column_names)))
    else:
        #No player rows found: still hand back the expected columns
        df = pd.DataFrame(columns=column_names)

    #Add PLAYER+YEAR as a column
    #This allows a player to be a free agent in multiple years
    df["PLAYER_YEAR"] = df["PLAYER"] + f" {year}"

    #End of Function - return df
    return df

    

In [8]:
# Scrape every year from 2010 through 2017 and stack the results.
# The per-year frames are collected first and concatenated once, so re-running
# this cell rebuilds free_agents_master instead of appending the same years
# onto whatever it already held.
yearly_frames = []
for inp_year in range(2010,2018,1): #2010,2011,2012...2017
    #call espn_MLB_free_agents - store in df
    df = espn_MLB_free_agents(inp_year)

    #queue df for free_agents_master
    yearly_frames.append(df)
    print(f"{inp_year} has been added to master")
    print("**********************************")

# Single concat; each year's own row labels are kept (no ignore_index),
# so labels repeat across years.
free_agents_master = pd.concat(yearly_frames)
    

Checking url: http://www.espn.com/mlb/freeagents/_/year/2010/type/dollars
2010 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2011/type/dollars
2011 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2012/type/dollars
2012 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2013/type/dollars
2013 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2014/type/dollars
2014 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2015/type/dollars
2015 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2016/type/dollars
2016 has been added to master
**********************************
Checking url: http://www.es

In [70]:
# Spot-check up to 15 random rows. random_state pins the draw so the displayed
# output is the same on every run; the min() keeps sample() from raising when
# the frame holds fewer than 15 rows.
free_agents_master.sample(min(15, len(free_agents_master)), random_state=42)

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,PLAYER_YEAR
84,Scott Baker,SP,36,Signed,Rangers,Yankees,,NR,Minor Lg,Scott Baker 2014
52,David Hernandez,RP,32,Signed,Diamondbacks,Reds,2.0,28,"$5,000,000",David Hernandez 2017
100,Randy Choate,RP,42,Signed,Cardinals,Blue Jays,,NR,Minor Lg,Randy Choate 2015
27,Matt Holliday,DH,38,Signed,Cardinals,Yankees,1.0,NR,"$13,000,000",Matt Holliday 2016
170,Danny Valencia,3B,33,Signed,Mariners,Orioles,,NR,Minor Lg,Danny Valencia 2017
74,Alexi Ogando,RP,34,Signed,Rangers,Red Sox,1.0,NR,"$1,500,000",Alexi Ogando 2014
165,Hector Santiago,RP,30,Signed,Twins,White Sox,,NR,Minor Lg,Hector Santiago 2017
41,Alex Gonzalez,SS,41,Signed (B),Braves,Brewers,1.0,NR,"$4,250,000",Alex Gonzalez 2011
62,Neil Walker,2B,32,Signed,Brewers,Yankees,1.0,24,"$4,000,000",Neil Walker 2017
142,Seth Maness,RP,29,Signed,Cardinals,Royals,,NR,Minor Lg,Seth Maness 2016


In [9]:
# Persist free_agents_master to disk as a pickle file
import pickle

with open('free_agents_master.pkl', 'wb') as pkl_out:
    # Serialise to bytes, then write them out in one go
    pkl_out.write(pickle.dumps(free_agents_master))