In [3]:
import requests
from bs4 import BeautifulSoup
from scraping_utils import make_http_req
import pandas as pd
from scraping_utils import years

In [4]:
def extract_yearly_mvp_table(page):
    """
    Extracts the MVP table from the given HTML page. Strips the web page of
    unnecessary information, return the relevant mvp table for the page.
    :param page: HTML content of the web page.
    :return: BeautifulSoup object containing the MVP table.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Remove the 0th table row - it contains unnecessary info for our data
    soup.find('tr', class_='over_header').decompose()
    # Extract the specific table containing MVP voting data
    mvp_table = soup.find(id='mvp')
    return mvp_table

In [5]:
# Download the MVP Voting Data for each player for each year  
years = [year for year in range(1991, 2023)]
url_template : str  = 'https://www.basketball-reference.com/awards/awards_{}.html'

for year in years: 
    url = url_template.format(year)
    file_path = '../data/yearly_mvp_data/{}.html'.format(year)
    try: 
        with open(file_path, 'x', encoding='utf-8') as f:
            url = url_template.format(year)
            # Make  a GET request to the URL
            response = make_http_req(url)
            # Save the html table into a file in our yearly_mvp_data folder
            mvp_table_html = extract_yearly_mvp_table(response.text)
            # Write a string repr. of the page's HTML for the mvp table
            f.write(str(mvp_table_html))
    except FileExistsError: # Indicates we have alrady scraped the data for this year
            # If the file already exists, we can simply continue
            continue

In [8]:
def load_yearly_mvp_data(year):
    """
    Load MVP data from an HTML file for a specific year.

    :param year: Year for data loading.
    :return a DataFrame formed by reading the scraped and downloaded HTML table
    corresponding to the given year
    :rtype pd.DataFrame
    :raises FileNotFoundError: If HTML file is not found.
    :raises ValueError: If no tables are in the HTML file.
    """
    file_path_template = '../data/yearly_mvp_data/{}.html'
    file_path = file_path_template.format(year)
    try:
        # Load the data from the html for each year into a pandas DataFrame
        # The mvp table is the 0th table in the html
        return pd.read_html(file_path)[0]
    except FileNotFoundError:
        raise FileNotFoundError(f"File for year {year} not found.")
    except ValueError:
        raise ValueError(f"No table found in file for year {year}.")

In [9]:
"""
Process and combine yearly MVP data from HTML files into a single DataFrame.
Create a dataframe formed by concatenating each yearly MVP dataframe.
Each yearly MVP dataframe is formed by reading the html table for the year
Save the dataframe as a CSV
"""
dataframe_list = []  # Store a list to hold all of our dataframes
for year in years:
    curr_df = load_yearly_mvp_data(year)
    # Add a year column to the current df
    curr_df['Year'] = year
    dataframe_list.append(curr_df)
# Combine all yearly DataFrames into a single DataFrame
mvp_df: pd.DataFrame = pd.concat(dataframe_list)
# Now we have one data frame with all mvp voting from 1991 to 2022
# Store the data in csv format
mvp_df.to_csv('../data/mvp_voting_1991-2022.csv')
mvp_df.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
7,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,34.5,...,5.2,6.3,1.3,0.4,0.437,0.38,0.923,8.0,0.173,2022
8,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,32.9,...,4.4,10.8,1.9,0.3,0.493,0.317,0.837,9.4,0.21,2022
9,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,5.2,4.9,0.9,0.3,0.504,0.352,0.877,8.8,0.154,2022
10,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,7.4,6.4,0.9,0.9,0.518,0.383,0.91,8.4,0.198,2022
11,10T,LeBron James,37,LAL,0.0,1.0,1000,0.001,56,37.2,...,8.2,6.2,1.3,1.1,0.524,0.359,0.756,7.5,0.172,2022
