In [2]:
# Required Packages
try:
    from bs4 import BeautifulSoup
    import requests
    import csv
except ImportError:
    %%capture
    !pip install bs4
    !pip install requests
    from bs4 import BeautifulSoup
    import requests
    import csv

In [3]:
url = "https://www.spotrac.com/mlb/rankings/2021/salary/"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")
# To inspect the parsed document, uncomment: print(soup.prettify())

### Find the table of interest on the webpage

In [4]:
# Locate the rankings table by its class attribute.
mlb_table = soup.find("table", attrs={"class": "datatable noborder"})

# soup.find() returns None when nothing matches; fail with a clear message
# instead of an opaque AttributeError on .thead / .tbody below.
if mlb_table is None or mlb_table.thead is None or mlb_table.tbody is None:
    raise ValueError(
        'Could not find a <table class="datatable noborder"> with a thead and '
        "tbody in the fetched page; the page layout may have changed."
    )

mlb_table_header = mlb_table.thead.find_all("tr") # Headers
mlb_table_data = mlb_table.tbody.find_all("tr")  # Rows

### Extract Information from the Table

In [37]:
# Collect the column headings from the first header row, turning embedded
# newlines into spaces and trimming whitespace on both sides.
header_cells = mlb_table_header[0].find_all("th")
headings = [cell.text.replace('\n', ' ').strip() for cell in header_cells]

print(headings)

['', 'Player', 'POS', 'Age', 'salary']


In [38]:
# Headings that correspond to real <td> cells. 'code' is a derived column that
# this cell adds to `headings` at the end, so it is excluded here; otherwise
# re-running the cell would pair the cells with a shifted list of headings.
table_columns = [heading for heading in headings if heading != 'code']

data = []
for tr in mlb_table_data:  # every <tr> found in the table's tbody
    # Each row is stored as a dict keyed by column heading, e.g.
    # row = {'': '1', 'Player': '...', 'code': '...', 'POS': '...', ...}
    row = {}

    # Pair each <td> in the row with its heading
    for td, th in zip(tr.find_all("td"), table_columns):
        # .get() tolerates cells that carry attributes but no class attribute
        if "rank-name" in td.get("class", []):
            # Getting the player name only (the cell also holds other markup)
            row[th] = td.find('h3').text.replace('\n', '').strip()
            # Creating custom column for team code
            row['code'] = td.find('div', "rank-position").text.replace('\n', '').strip()
            continue
        row[th] = td.text.replace('\n', '').strip()
    data.append(row)

# Adding custom column to headings so the export includes it; guarded so that
# re-running this cell does not insert 'code' a second time.
if 'code' not in headings:
    headings.insert(3, 'code')
print(data)

[{'': '1', 'Player': 'Mike Trout', 'code': 'LAA', 'POS': 'CF', 'Age': '29', 'salary': '$37,166,667'}, {'': '2', 'Player': 'Gerrit Cole', 'code': 'NYY', 'POS': 'SP', 'Age': '30', 'salary': '$36,000,000'}, {'': '', 'Player': 'Jacob deGrom', 'code': 'NYM', 'POS': 'SP', 'Age': '33', 'salary': '$36,000,000'}, {'': '4', 'Player': 'Nolan Arenado', 'code': 'STL', 'POS': '3B', 'Age': '30', 'salary': '$35,025,000'}, {'': '5', 'Player': 'Stephen Strasburg', 'code': 'WSH', 'POS': 'SP', 'Age': '32', 'salary': '$35,000,000'}, {'': '', 'Player': 'Zack Greinke', 'code': 'HOU', 'POS': 'SP', 'Age': '37', 'salary': '$35,000,000'}, {'': '7', 'Player': 'Max Scherzer', 'code': 'LAD', 'POS': 'SP', 'Age': '36', 'salary': '$34,603,480'}, {'': '8', 'Player': 'Justin Verlander', 'code': 'HOU', 'POS': 'SP', 'Age': '38', 'salary': '$33,000,000'}, {'': '9', 'Player': 'Manny Machado', 'code': 'SD', 'POS': '3B', 'Age': '28', 'salary': '$32,000,000'}, {'': '', 'Player': 'David Price', 'code': 'LAD', 'POS': 'SP', 'Age'

### Exporting Data to CSV

In [39]:
# Write the scraped rows to a CSV file, one column per entry in `headings`.
# newline='' leaves line-ending handling to the csv module, and the explicit
# UTF-8 encoding makes non-ASCII player names come out the same on every
# platform instead of depending on the locale's default encoding.
with open("mlb_top_players.csv", 'w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=headings)
    writer.writeheader()
    writer.writerows(data)