In [38]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [39]:
url = "http://www.espn.com/mlb/freeagents/_/year/2017/type/dollars"
response = requests.get(url)
page = response.text

In [40]:
soup = BeautifulSoup(page,"html5lib")

In [41]:
#Players are both in class oddrow and even row
#Using regex to grab both at once
rowplayer_regex = re.compile('.*row\splayer.*')

In [42]:
col_name_all = soup.find_all(class_="colhead")[1]
col_names_list=[]
for name in col_name_all.find_all("td"):
    col_names_list.append(name.text)

## Now Lets Build a List of 2017 MLB Free Agents

In [43]:
free_agents_list = soup.find_all(class_=rowplayer_regex)

In [44]:
free_agents_17 = {} #empty dict to add player and his data to
for row in free_agents_list:
    items = row.find_all('td')
    player = items[0].find('a').text
    free_agents_17[player] = [i.text for i in items[1:]]

In [45]:
free_agents_17df = pd.DataFrame.from_dict\
(data=free_agents_17,orient='index')
free_agents_17df.reset_index(inplace=True)
#reset index because player name shouldn't be index
#We'll need it to join on

In [46]:
free_agents_17df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,"$144,000,000"
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,"$126,000,000"
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,"$110,000,000"
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,"$80,000,000"
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,"$75,000,000"


In [47]:
#Rename Columns to match ESPN's
col_names_dict = {"index":"PLAYER",0:"POS",1:"AGE",2:"STATUS",\
                 3:"OLD_TEAM",4:"NEW_TEAM",5:"YRS",6:"RK"\
                  ,7:"DOLLARS"}

In [48]:
free_agents_17df.rename(columns=col_names_dict,inplace=True)

In [49]:
#Last Step - Add Free Agent Year as Column
free_agents_17df["FA_YEAR"]="2017"

In [51]:
free_agents_17df.head()

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,FA_YEAR
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,"$144,000,000",2017
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,"$126,000,000",2017
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,"$110,000,000",2017
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,"$80,000,000",2017
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,"$75,000,000",2017


In [52]:
#Clean Up DOLLARS column from dashes, commas and $ signs

free_agents_17df["DOLLARS"] = free_agents_17df["DOLLARS"].str.replace(r'\D','0')

free_agents_17df.head()

Unnamed: 0,PLAYER,POS,AGE,STATUS,OLD_TEAM,NEW_TEAM,YRS,RK,DOLLARS,FA_YEAR
0,Eric Hosmer,1B,28,Signed,Royals,Padres,8,7,14400000000,2017
1,Yu Darvish,SP,31,Signed,Dodgers,Cubs,6,1,12600000000,2017
2,J.D. Martinez,RF,30,Signed,Diamondbacks,Red Sox,5,6,11000000000,2017
3,Lorenzo Cain,CF,32,Signed,Royals,Brewers,5,2,8000000000,2017
4,Jake Arrieta,SP,32,Signed,Cubs,Phillies,3,4,7500000000,2017


In [54]:
#Convert Numeric columns to numbers

num_cols = ["DOLLARS","AGE","YRS","FA_YEAR"]
for col in num_cols:
    free_agents_17df[col]=\
    pd.to_numeric(free_agents_17df[col])

In [55]:
free_agents_17df.dtypes

PLAYER       object
POS          object
AGE           int64
STATUS       object
OLD_TEAM     object
NEW_TEAM     object
YRS         float64
RK           object
DOLLARS       int64
FA_YEAR       int64
dtype: object

## ESPN FREE AGENT DATA HAS BEEN GRABBED + CLEANED
next step with this is to build script to grab multiple years of this data
