In [2]:
#Step 1 - Grab Free Agent Data from ESPN
#Imports
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [3]:
#URL-
url = "http://www.espn.com/mlb/freeagents/_/year/2017/type/dollars"
response = requests.get(url)
page = response.text

In [4]:
soup = BeautifulSoup(page,"html5lib")

In [5]:
#Players are both in class oddrow and even row
#Using regex to grab both at once
rowplayer_regex = re.compile('.*row\splayer.*')

In [6]:
col_name_all = soup.find_all(class_="colhead")[1]
col_names_list=[]
for name in col_name_all.find_all("td"):
    col_names_list.append(name.text)

## Now Lets Build a List of 2017 MLB Free Agents

In [7]:
free_agents_list = soup.find_all(class_=rowplayer_regex)

In [8]:
free_agents_17 = {} #empty dict to add player and his data to
for row in free_agents_list:
    items = row.find_all('td')
    player = items[0].find('a').text
    free_agents_17[player] = [i.text for i in items[1:]]

In [9]:
free_agents_17df = pd.DataFrame.from_dict\
(data=free_agents_17,orient='index')
free_agents_17df.reset_index(inplace=True)
#reset index because player name shouldn't be index
#We'll need it to join on

In [10]:
free_agents_17df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,"$144,000,000"
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,"$126,000,000"
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,"$110,000,000"
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,"$80,000,000"
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,"$75,000,000"


In [11]:
#Rename Columns to match ESPN's
col_names_dict = {"index":"PLAYER",0:"POS",1:"AGE",2:"STATUS",\
                 3:"OLD_TEAM",4:"NEW_TEAM",5:"YRS",6:"RK"\
                  ,7:"DOLLARS"}

In [12]:
free_agents_17df.rename(columns=col_names_dict,inplace=True)

In [13]:
#Last Step - Add Free Agent Year as Column
free_agents_17df["PLAYER_YEAR"]=free_agents_17df["PLAYER"] + " 2017"

In [14]:
free_agents_17df.head()

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,PLAYER_YEAR
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,"$144,000,000",Eric Hosmer 2017
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,"$126,000,000",Yu Darvish 2017
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,"$110,000,000",J.D. Martinez 2017
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,"$80,000,000",Lorenzo Cain 2017
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,"$75,000,000",Jake Arrieta 2017


In [52]:
#Clean Up DOLLARS column from dashes, commas and $ signs

free_agents_17df["DOLLARS"] = free_agents_17df["DOLLARS"].str.replace(r'\D','0')

free_agents_17df.head()

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,FA_YEAR
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,14400000000,2017
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,12600000000,2017
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,11000000000,2017
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,8000000000,2017
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,7500000000,2017


In [54]:
#Convert Numeric columns to numbers

num_cols = ["DOLLARS","AGE","YRS","FA_YEAR"]
for col in num_cols:
    free_agents_17df[col]=\
    pd.to_numeric(free_agents_17df[col])

In [55]:
free_agents_17df.dtypes

PLAYER       object
POS          object
AGE           int64
STATUS       object
OLD_TEAM     object
NEW_TEAM     object
YRS         float64
RK           object
DOLLARS       int64
FA_YEAR       int64
dtype: object

## ESPN FREE AGENT DATA HAS BEEN GRABBED + CLEANED
next step with this is to build script to grab multiple years of this data


In [66]:
def espn_MLB_free_agents(year):
    #URL
    url = (f"http://www.espn.com/mlb/freeagents/_/year/{year}/type/dollars")
    print(f"Checking url: {url}")
    response = requests.get(url)
    page = response.text
    
    #BS for web scraping - table is pure HTML
    soup = BeautifulSoup(page,"html5lib")
    
    #Col Names - just to make sure all tables are the same for safety
    col_name_all = soup.find_all(class_="colhead")[1]
    col_names_list=[]
    for name in col_name_all.find_all("td"):
        col_names_list.append(name.text)
        
    #Players are both in class oddrow and even row
    #Using regex to grab both at once
    rowplayer_regex = re.compile('.*row\splayer.*')
        
    #grab players, using regex
    free_agents_list = soup.find_all(class_=rowplayer_regex)
    
    free_agents_one_year = {} #empty dict to add player and his data to
    for row in free_agents_list: #go through all players on site
        items = row.find_all('td') #find table data elements
        player = items[0].find('a').text #name will be first column, take text from a tag
        free_agents_one_year[player] = [i.text for i in items[1:]] #rest of data for table
    
    #placeholder dataframe, we will add to master after cleansing
    df = pd.DataFrame.from_dict(data=free_agents_one_year,orient='index')
    df.reset_index(inplace=True)
    #reset index because player name shouldn't be index
    #We'll need it to join on our MLB hitting data
    
    #Data Cleaning Time
    #Rename Columns to match ESPN's
    col_names_dict = {"index":"PLAYER",0:"POS",1:"AGE",2:"STATUS",\
                     3:"OLD_TEAM",4:"NEW_TEAM",5:"YRS",6:"RK"\
                      ,7:"DOLLARS"}
    df.rename(columns=col_names_dict,inplace=True)
    #Next Step - Add Free Agent Year as Column
    df["PLAYER_YEAR"]=df["PLAYER"] + f" {year}"
#     print(df.head())
    return df #return df, we will append outside function

    

In [67]:
import copy
free_agents_backup = copy.copy(free_agents_17df)
free_agents_master = free_agents_backup

In [69]:
for inp_year in range(2010,2018,1):
    df = pd.DataFrame()
    df = espn_MLB_free_agents(inp_year)
    free_agents_master = pd.concat([free_agents_master,df])
    print(f"{inp_year} has been added to master")
#     print(free_agents_master.head())
    print("**********************************")
    

Checking url: http://www.espn.com/mlb/freeagents/_/year/2010/type/dollars
2010 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2011/type/dollars
2011 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2012/type/dollars
2012 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2013/type/dollars
2013 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2014/type/dollars
2014 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2015/type/dollars
2015 has been added to master
**********************************
Checking url: http://www.espn.com/mlb/freeagents/_/year/2016/type/dollars
2016 has been added to master
**********************************
Checking url: http://www.es

In [70]:
free_agents_master.sample(15)

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,PLAYER_YEAR
84,Scott Baker,SP,36,Signed,Rangers,Yankees,,NR,Minor Lg,Scott Baker 2014
52,David Hernandez,RP,32,Signed,Diamondbacks,Reds,2.0,28,"$5,000,000",David Hernandez 2017
100,Randy Choate,RP,42,Signed,Cardinals,Blue Jays,,NR,Minor Lg,Randy Choate 2015
27,Matt Holliday,DH,38,Signed,Cardinals,Yankees,1.0,NR,"$13,000,000",Matt Holliday 2016
170,Danny Valencia,3B,33,Signed,Mariners,Orioles,,NR,Minor Lg,Danny Valencia 2017
74,Alexi Ogando,RP,34,Signed,Rangers,Red Sox,1.0,NR,"$1,500,000",Alexi Ogando 2014
165,Hector Santiago,RP,30,Signed,Twins,White Sox,,NR,Minor Lg,Hector Santiago 2017
41,Alex Gonzalez,SS,41,Signed (B),Braves,Brewers,1.0,NR,"$4,250,000",Alex Gonzalez 2011
62,Neil Walker,2B,32,Signed,Brewers,Yankees,1.0,24,"$4,000,000",Neil Walker 2017
142,Seth Maness,RP,29,Signed,Cardinals,Royals,,NR,Minor Lg,Seth Maness 2016


In [71]:
import pickle

with open('free_agents_master.pkl', 'wb') as picklefile:
    pickle.dump(free_agents_master, picklefile)