In [2]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as pwt
import time
import asyncio

In [3]:
# Seasons to scrape, each identified by the year used in the schedule URL
# (per the run output, 2016 is the "2015-16" season). Both bounds are inclusive.
first_season, last_season = 2016, 2023
seasons = list(range(first_season, last_season + 1))

In [4]:
# Directories where the scraped HTML is saved.
data_dir = 'data'
standings_dir = os.path.join(data_dir, 'standings')  # schedule pages, written by scrape_season
scores_dir = os.path.join(data_dir, 'scores')  # one box score page per game, written by scrape_game

# Create the folders up front: open(path, "w+") does not create missing parent
# directories, so a fresh checkout would otherwise fail on the first save.
os.makedirs(standings_dir, exist_ok=True)
os.makedirs(scores_dir, exist_ok=True)

In [5]:
# Fetch the inner HTML of one element of a web page with a headless Chromium browser.
# 'async def' makes this a coroutine: while it is sleeping or waiting on the page,
# other tasks can run in the meantime.
async def get_html(url, selector, sleep=5, retries=1):
    """Return the inner HTML of ``selector`` on ``url``, or ``None`` on failure.

    Args:
        url: Address of the page to load.
        selector: Selector of the element whose inner HTML is returned.
        sleep: Base delay in seconds. Attempt ``i`` (1-based) waits ``sleep * i``
            seconds before loading the page, so even the first attempt is delayed.
        retries: Total number of attempts, not the number of extra ones: with
            the default of 1 the page is tried once and never retried.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        raised. Errors are printed, never propagated to the caller.
    """
    for attempt in range(1, retries + 1):
        # The delay grows with each attempt (sleep, 2*sleep, 3*sleep, ...).
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    # Playwright timeouts are in milliseconds: 100000 ms = 100 s.
                    await page.goto(url, timeout=100000)
                    print(await page.title())
                    return await page.inner_html(selector, timeout=100000)
                finally:
                    # Always shut the browser down, whether the page loaded or not.
                    await browser.close()

        # Best-effort scraping: report any failure and move on to the next attempt.
        except Exception as e:
            print(f"Error on {url}: {e}")

    return None


In [6]:
# Download the schedule pages of one NBA season.
# The season's main schedule page links to further schedule pages inside its
# "#content .filter" element; the "#all_schedule" section of each linked page is
# saved as an HTML file in standings_dir.
async def scrape_season(season):
    """Save the schedule HTML of every page linked from a season's schedule filter.

    Args:
        season: Season identifier inserted into the schedule URL (e.g. 2016).

    Pages already present in ``standings_dir`` are skipped, so the function can
    be re-run to resume an interrupted scrape. Pages that fail to load are
    skipped without writing a file, so they are retried on the next run.
    """
    season_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    filter_html = await get_html(season_url, "#content .filter")
    if not filter_html:
        # get_html returns None when the page could not be loaded; parsing None would raise.
        print(f"Skipping season {season}: could not load {season_url}")
        return

    soup = BeautifulSoup(filter_html)
    # .get() tolerates anchors without an href attribute (l['href'] would raise KeyError).
    hrefs = [link.get("href") for link in soup.find_all("a")]
    standings_pages = [f"https://www.basketball-reference.com{href}" for href in hrefs if href]

    for page_url in standings_pages:
        save_path = os.path.join(standings_dir, page_url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(page_url, "#all_schedule")
        if not html:
            # Do not open the file: an empty file would count as "already
            # downloaded" on the next run and the page would never be fetched.
            continue
        with open(save_path, "w+") as f:
            f.write(html)

In [7]:
# Scrape every season listed in 'seasons', one after another.
# 'scrape_season' is an asynchronous function, so each call has to be awaited;
# the next season only starts once the previous one has finished.
for season in seasons:
    await scrape_season(season)

2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2022-23 NBA Schedule | Basketball-Reference.com


In [8]:
# List the names of all entries in 'standings_dir' (where scrape_season saves its pages).
# os.listdir returns bare file names, not full paths, and in arbitrary order.
standings_files = os.listdir(standings_dir)

In [9]:
# Download the box score page of every game linked from one saved schedule file.
# Links whose href contains both "boxscore" and ".html" are followed, and the
# "#content" section of each linked page is written to scores_dir.
async def scrape_game(standings_file):
    """Save the box score HTML for each game referenced in ``standings_file``.

    Args:
        standings_file: Path of a saved schedule HTML file to read links from.

    Box scores already present in ``scores_dir`` are skipped, and pages for
    which no HTML could be fetched are skipped without writing a file.
    """
    with open(standings_file, 'r') as source:
        schedule_html = source.read()

    soup = BeautifulSoup(schedule_html)

    # Collect the absolute URL of every box score link found in the file.
    box_scores = []
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href and "boxscore" in href and '.html' in href:
            box_scores.append(f"https://www.basketball-reference.com{href}")

    for game_url in box_scores:
        target = os.path.join(scores_dir, game_url.split("/")[-1])
        if os.path.exists(target):
            continue

        page_html = await get_html(game_url, "#content")
        if page_html:  # nothing is written when no HTML came back
            with open(target, "w+") as out:
                out.write(page_html)

In [11]:
# For each season, select the saved schedule files whose name contains the season
# year, build the full path of each file, and scrape the box scores it links to.
# NOTE(review): pandas is imported here but not used in any cell shown; confirm it
# is needed, and if so move the import to the first cell with the other imports.
import pandas as pd

for season in seasons:
    files = [s for s in standings_files if str(season) in s]  # substring match on the file name
    
    for f in files:
        filepath = os.path.join(standings_dir, f)  # standings_files holds bare names only
        
        await scrape_game(filepath)

Grizzlies vs Jazz, April 5, 2022 | Basketball-Reference.com
