In [14]:
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from scraping_utils import years

In [None]:
def parse_html(html):
    """
    Extracts the player stats table from the given HTML page. Strips the
    repeated in-table header rows and returns only the per-game stats table.
    :param html: HTML content of the web page.
    :return: BeautifulSoup element for the table with id 'per_game_stats',
             or None if the page contains no such table.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Extract the specific table containing the per game stats for each
    # player for the given year
    player_table = soup.find(id='per_game_stats')
    if player_table is None:
        # Nothing to clean; let the caller decide how to handle a missing table
        return None
    # Remove ALL redundant rows that repeat row 0 (the list of cols), not just
    # the first one; find_all returns an empty list when there are none, so
    # this is also safe for tables without repeated header rows
    for repeated_header_row in player_table.find_all('tr', class_='thead'):
        repeated_header_row.decompose()
    return player_table

In [4]:
def extract_yearly_player_table(url, chrome_driver_path=None):
    """
    Initiates a chrome instance with selenium to obtain the full HTML player stats table,
    parses this HTML with BeautifulSoup and returns a cleaned version
    :param url: the url we want to scrape data from
    :param chrome_driver_path: optional path to the chromedriver executable;
        defaults to the project's original hard-coded location when omitted
    :return: the result of parse_html for the rendered page (the specific table we'd like to retrieve)
    """
    if chrome_driver_path is None:
        chrome_driver_path = "C:/Users/bmurr/Downloads/PersonalProjects/NBA_MVP_Predictor/venv/Lib/site-packages/seleniumbase/drivers/chromedriver.exe"
    # Create a service to start and manage the chrome driver server
    service = Service(chrome_driver_path)
    # Create a driver to control our browser
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url=url)  # Render our URL in our chrome browser

        # Run JS in the browser to render full table
        # JS code means: "scroll all the way down so that the full table renders"
        driver.execute_script("window.scroll(1,10000)")

        time.sleep(2)  # Give the browser appropriate time to execute the script

        html = driver.page_source  # Get the fully rendered HTML
    finally:
        # Always shut the browser and driver process down, even if loading
        # fails; otherwise every call leaves a Chrome instance running
        driver.quit()
    return parse_html(html)

In [5]:
"""
For each year we are scraping data for, grab the player statistics table from basketball reference
- Use selenium to run JS to load the full player statistics page's HTML
- Parse the fully-rendered HTML to extract the player-per-game stats table
- Download the table to files in our data directory
Years whose file already exists (and is non-empty) are skipped, so the cell can be re-run safely.
"""
base_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
for year in years:
    file_path = '../data/yearly_player_data/{}.html'.format(year)
    # Skip years that were already downloaded. An empty file is treated as
    # "not downloaded" so a previously interrupted run gets retried.
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        continue
    url = base_url.format(year)
    # Scrape BEFORE creating the file: if scraping raises, no empty file is
    # left behind to be mistaken for a finished download on the next run
    player_table_html = extract_yearly_player_table(url)
    if player_table_html is None:
        # Writing str(None) would store the text "None" and break parsing later
        raise RuntimeError('No player stats table found at {}'.format(url))
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(str(player_table_html))

In [18]:
"""
Build a single dataframe out of the downloaded yearly player stats tables.
For every year:
- Load the saved table HTML from the data directory
- Parse it into a pandas dataframe
- Tag each row with the year it belongs to
Finally, stack all yearly dataframes into one combined dataframe.
"""
dfs = []
for year in years:
    file_path = '../data/yearly_player_data/{}.html'.format(year)
    # Read the whole file first; parsing does not need the handle kept open
    with open(file_path, 'r', encoding='utf-8') as f:
        player_table_html = f.read()
    # pandas wants a file-like object here, so wrap the HTML text in StringIO
    html_buffer = StringIO(player_table_html)
    # The file holds a single table, so take the first parsed result
    player_table_dataframe = pd.read_html(html_buffer)[0]
    player_table_dataframe['Year'] = year
    dfs.append(player_table_dataframe)

combined_df: pd.DataFrame = pd.concat(dfs)

    

      Rk              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  ORB  \
0      1      Alaa Abdelnaby  PF  22  POR  43   0   6.7  1.3   2.7  ...  0.6   
1      2  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  22.5  6.2  15.1  ...  0.5   
2      3          Mark Acres   C  28  ORL  68   0  19.3  1.6   3.1  ...  2.1   
3      4       Michael Adams  PG  28  DEN  66  66  35.5  8.5  21.5  ...  0.9   
4      5        Mark Aguirre  SF  31  DET  78  13  25.7  5.4  11.7  ...  1.7   
..   ...                 ...  ..  ..  ...  ..  ..   ...  ...   ...  ...  ...   
725  536        Delon Wright  PG  28  SAC  27   8  25.8  3.9   8.3  ...  1.0   
726  537      Thaddeus Young  PF  32  CHI  68  23  24.3  5.4   9.7  ...  2.5   
727  538          Trae Young  PG  22  ATL  63  63  33.7  7.7  17.7  ...  0.6   
728  539         Cody Zeller   C  28  CHO  48  21  20.9  3.8   6.8  ...  2.5   
729  540         Ivica Zubac   C  23  LAC  72  33  22.3  3.6   5.5  ...  2.6   

     DRB  TRB   AST  STL  BLK  TOV   PF

In [23]:
# Show the first few rows as a quick sanity check of the combined data
preview_rows = combined_df.head()
print(preview_rows)
# Persist the combined player stats (index included) to the data directory
combined_df.to_csv('../data/player_stats_1991-2022.csv')

  Rk              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  ORB  DRB  \
0  1      Alaa Abdelnaby  PF  22  POR  43   0   6.7  1.3   2.7  ...  0.6  1.4   
1  2  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  22.5  6.2  15.1  ...  0.5  1.3   
2  3          Mark Acres   C  28  ORL  68   0  19.3  1.6   3.1  ...  2.1  3.2   
3  4       Michael Adams  PG  28  DEN  66  66  35.5  8.5  21.5  ...  0.9  3.0   
4  5        Mark Aguirre  SF  31  DET  78  13  25.7  5.4  11.7  ...  1.7  3.1   

   TRB   AST  STL  BLK  TOV   PF   PTS  Year  
0  2.1   0.3  0.1  0.3  0.5  0.9   3.1  1991  
1  1.8   3.1  0.8  0.1  1.6  2.2  14.1  1991  
2  5.3   0.4  0.4  0.4  0.6  3.2   4.2  1991  
3  3.9  10.5  2.2  0.1  3.6  2.5  26.5  1991  
4  4.8   1.8  0.6  0.3  1.6  2.7  14.2  1991  

[5 rows x 31 columns]
