In [2]:
# Required packages.
# `csv` is part of the standard library, so it is imported unconditionally.
import csv

try:
    # `lxml` is imported only to verify that the parser backend requested by
    # BeautifulSoup(..., "lxml") in the cells below is actually installed.
    import lxml  # noqa: F401
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    # Fallback: install the missing third-party packages, then retry.
    # The previous fallback used `%%capture` here, but a cell magic is only
    # valid as the first line of a cell, so the fallback itself would fail.
    # Running pip through `sys.executable` guarantees the packages land in the
    # environment of the interpreter that is running this kernel.
    import importlib
    import subprocess
    import sys

    subprocess.check_call(
        [
            sys.executable, "-m", "pip", "install", "--quiet",
            "beautifulsoup4", "requests", "lxml",
        ]
    )
    # Make the freshly installed packages visible to the import system.
    importlib.invalidate_caches()

    import lxml  # noqa: F401
    import requests
    from bs4 import BeautifulSoup

In [3]:
# Download and parse the league-wide 2021 salary rankings page. The resulting
# `soup` is what the team-list cell reads the team dropdown from.
url = "https://www.spotrac.com/mlb/rankings/2021/salary/"

# Make a GET request to fetch the raw HTML content.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")
# print(soup.prettify()) # print the parsed data of html

### List of Team URL Slugs

In [21]:
# Collect the `value` of every <option> in the "teamUrl1" dropdown. These
# values are later appended to the rankings URL, one request per entry.
mlb_teams = soup.find("select", attrs={"name": "teamUrl1"})
if mlb_teams is None:
    # find() returns None when the element is absent; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not find <select name="teamUrl1"> in the parsed page')

mlb_teams_data = mlb_teams.find_all('option')

# Options without a `value` attribute are skipped rather than raising KeyError.
# An option whose value is the empty string is kept, exactly as before.
teams = [
    option.attrs['value']
    for option in mlb_teams_data
    if 'value' in option.attrs
]

teams

['',
 'arizona-diamondbacks',
 'atlanta-braves',
 'baltimore-orioles',
 'boston-red-sox',
 'chicago-cubs',
 'chicago-white-sox',
 'cincinnati-reds',
 'cleveland-indians',
 'colorado-rockies',
 'detroit-tigers',
 'houston-astros',
 'kansas-city-royals',
 'los-angeles-angels',
 'los-angeles-dodgers',
 'miami-marlins',
 'milwaukee-brewers',
 'minnesota-twins',
 'new-york-mets',
 'new-york-yankees',
 'oakland-athletics',
 'philadelphia-phillies',
 'pittsburgh-pirates',
 'san-diego-padres',
 'san-francisco-giants',
 'seattle-mariners',
 'st-louis-cardinals',
 'tampa-bay-rays',
 'texas-rangers',
 'toronto-blue-jays',
 'washington-nationals']

In [24]:
# For every entry in `teams`: download that entry's 2021 salary rankings page,
# read the "datatable noborder" table into a list of dicts (one per row), and
# write it to "mlb_player_salary_<team>.csv" in the working directory.
#
# NOTE(review): if `teams` contains '' (an <option> with an empty value), that
# iteration requests the base rankings URL and writes "mlb_player_salary_.csv".
# Confirm whether that file is wanted or the empty entry should be skipped.
for team in teams:
    url = f"https://www.spotrac.com/mlb/rankings/2021/salary/{team}"

    # Make a GET request to fetch the raw HTML content.
    # The timeout prevents one stalled request from hanging the whole loop;
    # raise_for_status() aborts on an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_content = response.text

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")

    # Find the table of interest on the webpage
    mlb_table = soup.find("table", attrs={"class": "datatable noborder"})
    if mlb_table is None or mlb_table.thead is None or mlb_table.tbody is None:
        # Name the URL in the error rather than failing with an AttributeError
        # on None a few lines further down.
        raise ValueError(f"Salary table not found at {url}")

    mlb_table_header = mlb_table.thead.find_all("tr")  # Headers
    mlb_table_data = mlb_table.tbody.find_all("tr")  # Rows
    if not mlb_table_header:
        raise ValueError(f"Salary table at {url} has no header row")

    # Column headings come from the first header row; newlines become spaces
    # and surrounding whitespace is removed.
    headings = [
        th.text.replace('\n', ' ').strip()
        for th in mlb_table_header[0].find_all("th")
    ]

    data = []
    for tr in mlb_table_data:
        # Each row is stored as {heading: cell text, ...}, plus a 'code' entry
        # taken from the name cell.
        row = {}

        # Pair each <td> with its heading; zip() stops at the shorter of the two.
        for td, th in zip(tr.find_all("td"), headings):
            # .get() avoids a KeyError for cells that carry other attributes
            # but no `class`.
            classes = td.get("class") or []
            if classes and classes[0] == "rank-name":
                # The name cell holds the player name in an <h3> ...
                row[th] = td.find('h3').text.replace('\n', '').strip()
                # ... and extra text in a "rank-position" div, stored under the
                # custom 'code' column (originally described as a team code;
                # TODO confirm what the site puts in this div).
                row['code'] = td.find('div', "rank-position").text.replace('\n', '').strip()
                continue
            row[th] = td.text.replace('\n', '').strip()
        data.append(row)

    # Add the custom column to the headings so DictWriter emits it.
    headings.insert(3, 'code')

    # Export the table to a CSV file. The explicit UTF-8 encoding keeps
    # non-ASCII player names intact regardless of the platform's default.
    with open(f"mlb_player_salary_{team}.csv", 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, headings)
        writer.writeheader()
        writer.writerows(data)