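# In [None]:  (notebook cell marker; the code below uses top-level `await`, which requires a Jupyter/IPython session)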
# Import Libraries
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

# Seasons to scrape: 2016 through 2025 inclusive (range() excludes 2026).
SEASONS = list(range(2016, 2026))

# Bare expression with no effect in a script; presumably left over from a
# notebook cell that displayed the list.
SEASONS

# Directories that store the scraped standings and box-score HTML.
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings") # data -> standings
SCORES_DIR = os.path.join(DATA_DIR, "scores") # data -> scores

# Asynchronous helper built on the Playwright library.
# Fetches the inner HTML of the element matching `selector` on the page at `url`.
async def get_html(url, selector, sleep=5, retries=3):
    """Return the inner HTML of ``selector`` on ``url``, or ``None`` on failure.

    Every attempt starts Playwright, launches a Chromium browser, opens a new
    page, navigates to ``url``, prints the page title and reads the inner HTML
    of the first element matching ``selector``.

    Args:
        url: Address of the page to load.
        selector: Playwright selector (e.g. the CSS selector ``"#content"``)
            identifying the element whose inner HTML is wanted.
        sleep: Base pause in seconds. Attempt number ``i`` (1-based) waits
            ``sleep * i`` seconds before loading the page.
        retries: Maximum number of attempts.

    Returns:
        The HTML as a string, or ``None`` if every attempt timed out or
        ``retries`` is less than 1.
    """
    html = None
    for attempt in range(1, retries + 1):
        # Pause before every attempt (longer each time) to avoid a server ban.
        # asyncio.sleep hands control back to the event loop while waiting;
        # time.sleep would block the whole loop for the duration.
        await asyncio.sleep(sleep * attempt)

        try:
            async with async_playwright() as p:
                # If issues occur, .chromium. can be replaced with .firefox.
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Always release the browser, even when the load timed out.
                    await browser.close()

        # On a Playwright timeout, report the URL and move on to the next attempt.
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue

        # The scrape succeeded, so no further attempts are needed.
        else:
            break

    return html

# Scrapes the monthly schedule pages of a single season and saves each one as an HTML file.
async def scrape_season(season):
    """Download every monthly schedule page of ``season`` into ``STANDINGS_DIR``.

    The season overview page is loaded first to collect the per-month links;
    each linked page's ``#all_schedule`` element is then saved under the last
    path segment of its URL. Pages that already exist on disk are skipped, and
    pages that could not be downloaded are not written at all so that a later
    run can retry them.

    Args:
        season: Season identifier substituted into the overview page URL.

    Returns:
        None.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"

    # "#content .filter": inside the element with id "content", the element
    # with class "filter". It holds the anchor tags linking to each month.
    html = await get_html(url, "#content .filter")
    if not html:
        # get_html returns None when every attempt failed; nothing to parse.
        print(f"No month links retrieved for {url}")
        return

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(html, "html.parser")
    # href=True skips anchors that carry no href attribute.
    hrefs = [a["href"] for a in soup.find_all("a", href=True)]
    standings_pages = [f"https://www.basketball-reference.com{h}" for h in hrefs]

    # Make sure the target directory exists before the first write.
    os.makedirs(STANDINGS_DIR, exist_ok=True)

    for page_url in standings_pages:
        # File name is the last path segment of the URL.
        save_path = os.path.join(STANDINGS_DIR, page_url.split("/")[-1])

        # Already scraped on a previous run: do not download it again.
        if os.path.exists(save_path):
            continue

        # Schedule table for the given month.
        html = await get_html(page_url, "#all_schedule")

        # Download failed: skip without creating the file, otherwise an empty
        # file would make future runs treat this month as already scraped.
        if not html:
            continue

        with open(save_path, "w+") as f:
            f.write(html)

# Scrape the monthly schedule pages of every season, one season at a time.
# NOTE: top-level `await` only works where an event loop is already running
# (e.g. a Jupyter/IPython cell); a plain script would need asyncio.run().
for season in SEASONS:
    # Downloads the season's monthly pages that are not already on disk.
    await scrape_season(season)

# List every entry in the standings directory (filtered to .html files further down).
standings_files = os.listdir(STANDINGS_DIR)

# Scrapes the box score page of every game linked from one saved monthly schedule file.
async def scrape_game(standings_file):
    """Download the box scores referenced by ``standings_file`` into ``SCORES_DIR``.

    The saved schedule HTML is parsed for links containing both ``"boxscore"``
    and ``".html"``. For each such link the ``#content`` element of the page is
    saved under the last path segment of its URL. Box scores already on disk
    and box scores that fail to download are skipped.

    Args:
        standings_file: Path to a saved monthly schedule HTML file.

    Returns:
        None.
    """
    with open(standings_file, 'r') as f:
        html = f.read()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(html, "html.parser")
    hrefs = [a.get("href") for a in soup.find_all("a")]
    # Keep only the box score links and turn them into absolute URLs.
    box_scores = [
        f"https://www.basketball-reference.com{h}"
        for h in hrefs
        if h and "boxscore" in h and ".html" in h
    ]

    # Make sure the target directory exists before the first write.
    os.makedirs(SCORES_DIR, exist_ok=True)

    for url in box_scores:
        # File name is the last path segment of the URL.
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])

        # Already scraped on a previous run: do not download it again.
        if os.path.exists(save_path):
            continue

        # The "#content" element of the box score page.
        html = await get_html(url, "#content")

        # If the download failed, move on to the next game.
        if not html:
            continue

        with open(save_path, "w+") as f:
            f.write(html)

#Filters Out Any Files That Are Not The Monthly Standings 
standings_files = [s for s in standings_files if ".html" in s]

# Loops Through Each Standings File
for f in standings_files:
    filepath = os.path.join(STANDINGS_DIR, f)

    # Opens Function To Scrape The Given Games
    await scrape_game(filepath)