In [10]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from scraping_utils import years

In [11]:
def parse_html(html):
    """
    Pull the per-game player stats table out of a rendered page.

    Every ``<tr class="thead">`` row is removed from the parsed document
    first, then the element whose id is ``per_game_stats`` is looked up.
    :param html: HTML content of the web page, as a string.
    :return: the BeautifulSoup element with id ``per_game_stats``, or None
        when the page contains no such element.
    """
    document = BeautifulSoup(html, 'html.parser')
    # The column-header row is repeated throughout the table body; drop those
    # repeats so that only data rows remain
    repeated_header_rows = document.find_all('tr', class_='thead')
    for header_row in repeated_header_rows:
        header_row.decompose()
    # What is left under this id is the per-game stats table for the year
    return document.find(id='per_game_stats')

In [12]:
def extract_yearly_player_table(
    url,
    chrome_driver_path="C:/Users/bmurr/Downloads/PersonalProjects/NBA_MVP_Predictor/venv/Lib/site-packages/seleniumbase/drivers/chromedriver.exe",
):
    """
    Starts a Chrome instance with selenium to obtain the rendered HTML of the
    page, parses this HTML with BeautifulSoup and returns the cleaned table.

    The browser is always shut down before returning, even when loading or
    scripting the page raises, so repeated calls do not pile up Chrome
    processes.
    :param url: the url we want to scrape data from
    :param chrome_driver_path: filesystem path of the chromedriver executable.
        Defaults to the path this notebook was originally written against;
        pass your own path when running on another machine.
    :return: whatever parse_html returns for the rendered page source
    """
    # Create a service to start and manage the chrome driver server
    service = Service(chrome_driver_path)
    # Create a driver to control our browser
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url=url)  # Load our URL in the chrome browser

        # Scroll far down the page (y=10000); the intent is to make the
        # browser render the full table before we read the page source
        driver.execute_script("window.scroll(1,10000)")

        time.sleep(2)  # Give the browser time to finish executing the script

        html = driver.page_source  # Get the rendered HTML
    finally:
        # Close the browser and stop the driver service no matter what
        # happened above; otherwise every call leaks a Chrome instance
        driver.quit()
    return parse_html(html)

In [13]:
"""
For each year we are scraping data for, grab the player statistics table from basketball reference
- Skip years whose HTML file already exists in our data directory
- Use selenium to load the player statistics page and extract the player-per-game stats table
- Write the table to a file in our data directory

The file is only created after the scrape has succeeded, so a failed scrape
never leaves behind an empty file that later runs would mistake for cached data.
"""
base_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
for year in years:
    file_path = '../data/yearly_player_data/{}.html'.format(year)
    if Path(file_path).exists():
        # Already downloaded on a previous run; no need to scrape again
        continue
    url = base_url.format(year)
    player_table_html = extract_yearly_player_table(url)
    if player_table_html is None:
        # Writing str(None) would poison the cache with the text "None"
        raise RuntimeError(
            'No per-game stats table found at {}'.format(url)
        )
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(str(player_table_html))

In [14]:
"""
For each of our downloaded yearly player stats table HTML files:
- Read the table HTML into a pandas dataframe
- Add a year column to the given dataframe
- Collect the dataframe alongside all of our other yearly player stat dataframes
Finally concatenate them into a single dataframe, `combined_df`, with a fresh
0..n-1 index so that row labels are unique across years.
"""
dfs = []
for year in years:
    file_path = '../data/yearly_player_data/{}.html'.format(year)
    with open(file_path, 'r', encoding='utf-8') as f:
        player_table_html = f.read()
    # Hand pandas a file-like object rather than the raw HTML string itself
    player_table_dataframe = pd.read_html(StringIO(player_table_html))[0]
    player_table_dataframe['Year'] = year
    dfs.append(player_table_dataframe)

# ignore_index=True: without it every year's rows keep their own 0..n labels,
# leaving duplicate index values in the combined dataframe
combined_df: pd.DataFrame = pd.concat(dfs, ignore_index=True)

    

In [15]:
# Write the combined stats to CSV first, then end the cell on the spot-check:
# Jupyter only renders a cell's final expression, so the filter must come last
# for its result to be displayed.
combined_df.to_csv('../data/player_stats_1991-2022.csv', index=False)
combined_df[combined_df['Player'] == 'James Harden']