In [1]:
#

import asyncio
import os
import time
from typing import List, Optional

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
# Install the Playwright browser binaries before running this notebook: run `playwright install` in a shell, or `!playwright install` from a Jupyter cell.

In [2]:
# Seasons to scrape: every year from 2016 through 2022 (the range end is exclusive).
SEASONS = [year for year in range(2016, 2023)]

SEASONS

[2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [3]:
#
# BLOCK ONE
#

# Root folder for everything the scraper saves
DATA_DIR = "data"

# Sub-folders of DATA_DIR: one for the schedule ("standings") pages, one for box scores
STANDINGS_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, subfolder) for subfolder in ("standings", "scores")
)

#

In [4]:
#
# BLOCK TWO
#


# function to obtain html
async def get_html(url: str, selector: str, sleep: int = 5, retries: int = 3) -> Optional[str]:
    """
    Fetch a page with a headless Chromium browser and return the inner HTML of one element.

    Each attempt is preceded by a pause of `sleep * attempt_number` seconds (5, 10, 15 with
    the defaults) to throttle requests. Only Playwright timeouts are retried; any other
    exception propagates to the caller.

    Args:
        url (str): The URL of the webpage to be scraped.
        selector (str): The CSS selector of the element whose inner HTML is extracted.
        sleep (int, optional): Base pause in seconds; attempt N waits `sleep * N`. Defaults to 5.
        retries (int, optional): Maximum number of attempts. Defaults to 3.

    Returns:
        Optional[str]: The inner HTML of the selected element, or None if every attempt
        timed out (or `retries` is less than 1).
    """
    html = None
    for attempt in range(1, retries + 1):
        # Use asyncio.sleep rather than time.sleep: a blocking sleep inside a coroutine
        # stalls the whole event loop (in Jupyter, the kernel's own loop) for the duration.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())  # progress/debug output: title of the loaded page
                    html = await page.inner_html(selector)
                finally:
                    # Close the browser explicitly on success and on failure instead of
                    # relying on the Playwright context teardown to clean it up.
                    await browser.close()
        except PlaywrightTimeout:
            print(f'Timeout error on {url}')
            continue
        else:
            break
    return html


In [6]:
#
# BLOCK THREE
#

# Define an asynchronous function to scrape the standings for a given season
async def scrape_season(season: str) -> None:
    """
    Download every schedule page linked from one season's index page into STANDINGS_DIR.

    The season's schedule index is fetched first. Each link found inside its
    "#content .filter" element is then fetched, and the inner HTML of that page's
    "#all_schedule" element is written to STANDINGS_DIR under the last path segment of
    the link. Pages whose file already exists are skipped, so an interrupted run can be
    resumed by calling the function again.

    Args:
        season (str): Season identifier interpolated into the index URL (e.g. 2016).
            The notebook passes the integers from SEASONS, which format the same way.

    Returns:
        None
    """
    index_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    index_html = await get_html(index_url, "#content .filter")
    if not index_html:
        # get_html returns None once its retries are exhausted; BeautifulSoup(None) would raise.
        print(f"Could not fetch the schedule index for season {season}; skipping")
        return

    # Name the parser explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(index_html, "html.parser")
    # href=True keeps only anchors that carry an href, so the lookup below cannot KeyError.
    standings_pages = [
        f"https://www.basketball-reference.com{link['href']}"
        for link in soup.find_all("a", href=True)
    ]

    # Nothing else in the notebook creates the output folder; open() would fail without it.
    os.makedirs(STANDINGS_DIR, exist_ok=True)

    for page_url in standings_pages:
        save_path = os.path.join(STANDINGS_DIR, page_url.split("/")[-1])
        if os.path.exists(save_path):   # already downloaded on an earlier run
            continue

        page_html = await get_html(page_url, "#all_schedule")
        if not page_html:
            # Fetch failed: leave no file behind so a later run retries this page.
            continue
        with open(save_path, "w+") as f:
            f.write(page_html)


In [8]:
# Spot check: build the schedule-index URL for a single season and display it
season = 2016
url = 'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'.format(season=season)
url

'https://www.basketball-reference.com/leagues/NBA_2016_games.html'

In [9]:
# html = await get_html(url, '#content.filter')

In [10]:
#
# BLOCK FOUR
#

# Scrape the seasons one at a time: each call is awaited to completion before the next starts.
# Top-level `await` is valid here because the notebook cell runs inside IPython's event loop.
# Note that `season` is a notebook-level name, so it overwrites any earlier `season` value.
for season in SEASONS:
    await scrape_season(season) # pages whose file already exists in STANDINGS_DIR are skipped
    
#

2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com


In [11]:
#
# BLOCK FIVE
#

# List the file names in the standings directory.
# os.listdir returns entries in arbitrary order, so sort them to make the order in which
# the files are processed later identical from run to run.
standings_files = sorted(os.listdir(STANDINGS_DIR))

#

In [12]:
#
# BLOCK SIX
#

# function to scrape individual game box scores
async def scrape_game(standings_file: str) -> None:
    """
    Download the box-score page of every game linked from one saved schedule file.

    The file is parsed for anchors whose href contains both "boxscore" and ".html".
    Each such page is fetched and the inner HTML of its "#content" element is written
    to SCORES_DIR under the last path segment of the link. Box scores whose file already
    exists are skipped, as are pages for which no HTML could be fetched.

    Args:
        standings_file (str): Path to a schedule HTML file saved by `scrape_season`.

    Returns:
        None
    """
    with open(standings_file, 'r') as f:
        html = f.read()

    # Name the parser explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(html, "html.parser")
    hrefs = [link.get('href') for link in soup.find_all("a")]   # None for anchors without href
    box_scores = [
        f"https://www.basketball-reference.com{href}"
        for href in hrefs
        if href and "boxscore" in href and '.html' in href
    ]

    # Nothing else in the notebook creates the output folder; open() would fail without it.
    os.makedirs(SCORES_DIR, exist_ok=True)

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):   # already downloaded on an earlier run
            continue

        html = await get_html(url, "#content")
        if not html:
            # Fetch failed: leave no file behind so a later run retries this game.
            continue
        with open(save_path, "w+") as f:
            f.write(html)

#

In [None]:
#
# BLOCK SEVEN
#

# Walk the saved schedule files season by season and download the box scores they link to.
# `season`, `files`, `f` and `filepath` are notebook-level names and stay set after the loop.
for season in SEASONS:  # the seasons defined at the top of the notebook
    files = [s for s in standings_files if str(season) in s]  # substring match: keep file names that contain the season year
    
    for f in files:  # one saved schedule file at a time
        filepath = os.path.join(STANDINGS_DIR, f)  # listdir gives bare names; prepend the folder
        
        await scrape_game(filepath)  # box scores already present in SCORES_DIR are skipped
        
#

76ers vs Pelicans, February 19, 2016 | Basketball-Reference.com
Pacers vs Thunder, February 19, 2016 | Basketball-Reference.com
Rockets vs Suns, February 19, 2016 | Basketball-Reference.com
Warriors vs Trail Blazers, February 19, 2016 | Basketball-Reference.com
Nuggets vs Kings, February 19, 2016 | Basketball-Reference.com
Spurs vs Lakers, February 19, 2016 | Basketball-Reference.com
Celtics vs Jazz, February 19, 2016 | Basketball-Reference.com
Bucks vs Hawks, February 20, 2016 | Basketball-Reference.com
Wizards vs Heat, February 20, 2016 | Basketball-Reference.com
Knicks vs Timberwolves, February 20, 2016 | Basketball-Reference.com
Warriors vs Clippers, February 20, 2016 | Basketball-Reference.com
Pelicans vs Pistons, February 21, 2016 | Basketball-Reference.com
Cavaliers vs Thunder, February 21, 2016 | Basketball-Reference.com
Celtics vs Nuggets, February 21, 2016 | Basketball-Reference.com
Spurs vs Suns, February 21, 2016 | Basketball-Reference.com
Hornets vs Nets, February 21, 2016