In [3]:
import os
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

from scraping_utils import make_http_req
from scraping_utils import years

In [2]:
def extract_yearly_mvp_table(page):
    """
    Extract the MVP voting table from an awards page.

    Parses the page, locates the element with id 'mvp' and removes every
    'over_header' row found inside it, leaving the rest of the table intact.
    Pages whose MVP table has no 'over_header' row are handled without error.

    :param page: HTML content of the web page.
    :return: BeautifulSoup element for the MVP table, or None when the page
        contains no element with id 'mvp'.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Locate the table first so the clean-up below only ever touches rows
    # that belong to the MVP table, not the first match anywhere on the page.
    mvp_table = soup.find(id='mvp')
    if mvp_table is None:
        return None
    # The over-header rows contain unnecessary info for our data. find_all
    # returns an empty list when there are none, so nothing is dereferenced
    # on a missing row.
    for over_header_row in mvp_table.find_all('tr', class_='over_header'):
        over_header_row.decompose()
    return mvp_table

In [3]:
# Download the MVP voting table for each year and cache it on disk.
# NOTE(review): this assignment rebinds `years`, which is also imported from
# scraping_utils in the imports cell - confirm which definition is intended.
years = list(range(1991, 2023))
url_template: str = 'https://www.basketball-reference.com/awards/awards_{}.html'
file_path_template: str = '../data/yearly_mvp_data/{}.html'

# Make sure the output folder exists so the first write does not fail.
os.makedirs(os.path.dirname(file_path_template), exist_ok=True)

for year in years:
    file_path = file_path_template.format(year)
    # A non-empty file means this year has already been scraped, so skip the
    # request. Empty files (left behind when an earlier run failed between
    # creating the file and writing to it) are fetched again.
    if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
        continue
    url = url_template.format(year)
    # Make a GET request to the URL
    response = make_http_req(url)
    mvp_table_html = extract_yearly_mvp_table(response.text)
    if mvp_table_html is None:
        # Fail loudly instead of caching the literal text "None".
        raise ValueError(f"No MVP table found on page for year {year}: {url}")
    # Only create the file once there is real content to save, so a failed
    # request or parse never leaves a placeholder that blocks future runs.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(str(mvp_table_html))

In [4]:
def load_yearly_mvp_data(year):
    """
    Load MVP data from an HTML file for a specific year.

    Reads '../data/yearly_mvp_data/<year>.html' and returns the first table
    pandas finds in it.

    :param year: Year for data loading.
    :return a DataFrame formed by reading the scraped and downloaded HTML table
    corresponding to the given year
    :rtype pd.DataFrame
    :raises FileNotFoundError: If HTML file is not found.
    :raises ValueError: If no tables are in the HTML file.
    """
    file_path_template = '../data/yearly_mvp_data/{}.html'
    file_path = file_path_template.format(year)

    # Reading and parsing are guarded separately so that a ValueError raised
    # while reading (e.g. UnicodeDecodeError, a ValueError subclass) is not
    # misreported as "No table found".
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            player_table_html = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File for year {year} not found.") from err

    try:
        # Hand pandas a file-like object rather than the raw HTML string.
        return pd.read_html(StringIO(player_table_html))[0]
    except ValueError as err:
        raise ValueError(f"No table found in file for year {year}.") from err

In [5]:
"""
Process and combine yearly MVP data from HTML files into a single DataFrame.
Create a dataframe formed by concatenating each yearly MVP dataframe.
Each yearly MVP dataframe is formed by reading the html table for the year
Save the dataframe as a CSV
"""
dataframe_list = []  # One DataFrame per year, in the order of `years`
for year in years:
    curr_df = load_yearly_mvp_data(year)
    # Tag every row with the year it came from
    curr_df['Year'] = year
    dataframe_list.append(curr_df)
# Combine all yearly DataFrames into a single DataFrame. ignore_index=True
# gives the result one continuous 0..n-1 index instead of repeating each
# yearly frame's own index labels.
mvp_df: pd.DataFrame = pd.concat(dataframe_list, ignore_index=True)
# Now we have one data frame with all mvp voting from 1991 to 2022
# Store the data in csv format (the index is not written to the file)
mvp_df.to_csv('../data/mvp_voting_1991-2022.csv', index=False)