## Scraping Using Selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By
import time
import random
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Start a Chrome session driven by the chromedriver binary at this machine-specific path.
service = Service(executable_path="/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/chromedriver")
# NOTE(review): the next scraping cell rebinds `driver` to a new session without
# calling quit() on this one — confirm whether this cell is still needed.
driver = webdriver.Chrome(service=service)

In [None]:
# Getting the URLs for every game from each month.
# Visits each monthly schedule page of the 2025 season and collects the href of
# every "Box Score" link into `season_25_links`.

nba_months = ["october", "november", "december", "january",
               "february", "march", "april", "may", "june"]

season_25_links = []

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(), options=options)

# try/finally guarantees the browser is shut down even when a page load or
# element lookup raises part-way through the months.
try:
    for month in nba_months:
        url = f"https://www.basketball-reference.com/leagues/NBA_2025_games-{month}.html"
        print(f"Scraping {url}")
        driver.get(url)
        # Fixed pause after each page load (presumably to let the page settle
        # and to pace requests).
        time.sleep(5)

        links = driver.find_elements(By.XPATH, '//a[contains(@href, "/boxscores/") and contains(text(), "Box Score")]')
        for link in links:
            href = link.get_attribute('href')
            # Anchors without an href are skipped.
            if href:
                season_25_links.append(href)
finally:
    driver.quit()

# Check results
print(season_25_links[:10])

In [None]:
# Number of box-score links collected by the scraping cell.
len(season_25_links)

In [None]:
# Keep only the links from position 616 onwards.
# NOTE(review): 616 looks like a resume point from an earlier partial download —
# confirm (or reset to 0) before re-running on a fresh folder.
remaining_games = season_25_links[616:]
len(remaining_games)

In [None]:
# Going through the list of links and saving the page as an HTML file.
# Each page in `remaining_games` is written to `game_folder` under the last
# path segment of its URL.

game_folder = "/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25"
os.makedirs(game_folder, exist_ok=True)

# Chrome content settings; the value 2 blocks the respective content type so
# pages load with less overhead.
options = webdriver.ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.plugins": 2,
    "profile.managed_default_content_settings.popups": 2,
    "profile.managed_default_content_settings.geolocation": 2,
    "profile.managed_default_content_settings.notifications": 2
}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(service=Service(), options=options)

# try/finally closes the browser even if a wait times out or a write fails;
# the exception still propagates so the failing URL is visible.
try:
    for game in remaining_games:
        print(f"Saving {game}")
        driver.get(game)

        # Wait (up to 10 s) until the element with id "content" is in the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.ID, "content"))
        )

        html = driver.page_source
        filename = os.path.join(game_folder, game.split("/")[-1])

        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

        # Random pause between requests.
        time.sleep(random.uniform(1, 4))
finally:
    driver.quit()

In [None]:
# Getting the season data for individual teams.
# Saves the 2025 season page of every team abbreviation listed below into
# `team_folder` as "<ABBR>-2025.html".

nba_team_abb_east = ["CLE", "BOS", "NYK", "IND", "MIL", "DET", "ORL", "ATL", "CHI", "MIA", "TOR", "BRK", "PHI", "CHO", "WAS"]
nba_team_abb_west = ["OKC", "HOU", "LAL", "DEN", "LAC", "GSW", "MIN", "MEM", "SAC", "DAL", "PHO", "POR", "SAS", "NOP", "UTA"]

team_folder = "/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425"
os.makedirs(team_folder, exist_ok=True)

# Chrome content settings; the value 2 blocks the respective content type.
options = webdriver.ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.plugins": 2,
    "profile.managed_default_content_settings.popups": 2,
    "profile.managed_default_content_settings.geolocation": 2,
    "profile.managed_default_content_settings.notifications": 2
}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(service=Service(), options=options)

# Previously only the western list was iterated and the eastern list was never
# used; both conferences are scraped so the folder holds every team.
nba_team_abbs = nba_team_abb_east + nba_team_abb_west

# try/finally closes the browser even if a wait times out or a write fails.
try:
    for team in nba_team_abbs:
        print(f"Saving {team}")
        driver.get(f"https://www.basketball-reference.com/teams/{team}/2025.html")

        # Wait (up to 10 s) until the element with id "content" is in the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.ID, "content"))
        )

        html = driver.page_source
        filename = os.path.join(team_folder, f"{team}-2025.html")

        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

        # Random pause between requests.
        time.sleep(random.uniform(1, 5))
finally:
    driver.quit()

## Parsing HTML Files for Data

In [None]:
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup, Comment
import re
import os
from pathlib import Path

In [None]:
# Folder holding the saved box-score HTML pages (same location the scraping cell writes to).
games_path = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25")
# NOTE(review): `game_data` is not referenced anywhere else in the visible file — confirm it is still needed.
game_data = []

In [None]:
# Parse the per-team "basic" box-score tables (ids matching
# box-<...>-game-basic) out of every saved game page and stack them into
# `basic_tables_combined`, tagging rows with their table id and source file.
basic_dfs = []

for game in os.listdir(games_path):
    # Only saved pages are parsed; other directory entries would fail to
    # decode or contain no tables.
    if not game.endswith(".html"):
        continue

    with open(games_path / game, encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    basic_tables = soup.find_all(id=re.compile(r"^box-.*-game-basic$"))

    table_dfs = []
    table_ids = []
    for table in basic_tables:
        table_id = table.get("id")
        table_html = str(table)
        # header=1: the second header row holds the column names.
        df = pd.read_html(StringIO(table_html), header=1)[0]
        df["team_table_id"] = table_id
        df["game_id"] = game
        table_dfs.append(df)
        table_ids.append(table_id)

    # Add the opposing team table id.
    # Assign the scalar id rather than the other frame's column: assigning a
    # Series aligns on the row index, which leaves NaN in the longer frame
    # whenever the two tables have a different number of rows.
    if len(table_dfs) == 2:
        table_dfs[0]["opp_team_table_id"] = table_ids[1]
        table_dfs[1]["opp_team_table_id"] = table_ids[0]

    basic_dfs.extend(table_dfs)

basic_tables_combined = pd.concat(basic_dfs, ignore_index=True)

In [None]:
# Display the combined basic box-score frame.
basic_tables_combined

In [None]:
# Parse the per-team "advanced" box-score tables (ids matching
# box-<...>-game-advanced) out of every saved game page and stack them into
# `advanced_tables_combined`, tagging rows with their table id and source file.
advanced_dfs = []

for game in os.listdir(games_path):
    # Only saved pages are parsed; other directory entries would fail to
    # decode or contain no tables.
    if not game.endswith(".html"):
        continue

    with open(games_path / game, encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    advanced_tables = soup.find_all(id=re.compile(r"^box-.*-game-advanced$"))

    table_dfs = []
    table_ids = []
    for table in advanced_tables:
        table_id = table.get("id")
        table_html = str(table)
        # header=1: the second header row holds the column names.
        df = pd.read_html(StringIO(table_html), header=1)[0]
        df["team_table_id"] = table_id
        df["game_id"] = game
        table_dfs.append(df)
        table_ids.append(table_id)

    # Add the opposing team table id.
    # Assign the scalar id rather than the other frame's column: assigning a
    # Series aligns on the row index, which leaves NaN in the longer frame
    # whenever the two tables have a different number of rows.
    if len(table_dfs) == 2:
        table_dfs[0]["opp_team_table_id"] = table_ids[1]
        table_dfs[1]["opp_team_table_id"] = table_ids[0]

    advanced_dfs.extend(table_dfs)

advanced_tables_combined = pd.concat(advanced_dfs, ignore_index=True)

In [None]:
# Display the combined advanced box-score frame.
advanced_tables_combined

In [None]:
# Write both combined box-score frames to CSV with pandas' default options.
# NOTE(review): the row index is written here, while the team exports in this
# file pass index=False — confirm which form downstream readers expect.
basic_box_csv = "/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/BasicBox2425.csv"
basic_tables_combined.to_csv(basic_box_csv)

advanced_box_csv = "/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/AdvancedBox2425.csv"
advanced_tables_combined.to_csv(advanced_box_csv)

In [None]:
# Folders with the saved team pages that the parsing cells below read.
# NOTE(review): the scraping cell saves into the parent Teams_2425 folder; the
# split into All_Teams / Playoff_Teams presumably happens by hand — confirm.
all_teams = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams")
playoff_teams = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams")

In [None]:
# Build one frame from the "team_and_opponent" table of every saved
# regular-season team page, tagging each row with its source file name.
# NOTE(review): this frame is not part of the CSV export dict later in this
# file — confirm whether it should be written out as well.
all_general_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # The table is embedded in an HTML comment: parse each comment mentioning
    # the id until one actually yields the element.
    table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "team_and_opponent" not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find(id="team_and_opponent")
        if table is not None:
            break
    if table is None:
        # Name the offending file instead of failing with a bare StopIteration.
        raise ValueError(f"No team_and_opponent table found in {team}")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    all_general_dfs.append(df)

all_general_combined = pd.concat(all_general_dfs, ignore_index=True)
# The original cell rebound the list name to the combined frame; keep that
# binding so existing references to `all_general_dfs` still see a DataFrame.
all_general_dfs = all_general_combined

In [None]:
# Gather the "team_misc" table from each saved regular-season team page.
def _misc_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


all_misc_dfs = []

for team in os.listdir(all_teams):
    # Entries that are not saved .html pages are ignored.
    if not team.endswith(".html"):
        continue

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    misc_comments = (c for c in soup.find_all(string=_misc_is_comment) if "team_misc" in c)
    comment = next(misc_comments)
    table = BeautifulSoup(comment, "html.parser").find(id="team_misc")

    # header=1: column names are taken from the second header row.
    parsed_tables = pd.read_html(StringIO(str(table)), header=1)
    df = parsed_tables[0]
    df["team_id"] = team
    all_misc_dfs.append(df)

all_misc_combined = pd.concat(all_misc_dfs, ignore_index=True)

In [None]:
# Collect the "per_game_stats" table from every saved regular-season team
# page, tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
all_pgs_dfs = []

for team in os.listdir(all_teams):
    # Skip anything that is not a saved page (sub-folders, hidden files), in
    # line with the other team-parsing cells.
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    tables = soup.find_all(id="per_game_stats")

    for table in tables:
        table_html = str(table)
        df = pd.read_html(StringIO(table_html))[0]
        df["team_id"] = team
        all_pgs_dfs.append(df)
all_pgs_combined = pd.concat(all_pgs_dfs, ignore_index=True)

all_pgs_combined

In [None]:
# Collect the "per_game_stats_post" table from every saved playoff-team page,
# tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
playoff_pgs_dfs = []

for team in os.listdir(playoff_teams):
    # Skip anything that is not a saved page (sub-folders, hidden files), in
    # line with the other team-parsing cells.
    if not team.endswith(".html"):
        continue

    with open(playoff_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    tables = soup.find_all(id="per_game_stats_post")

    for table in tables:
        table_html = str(table)
        df = pd.read_html(StringIO(table_html))[0]
        df["team_id"] = team
        playoff_pgs_dfs.append(df)
playoff_pgs_combined = pd.concat(playoff_pgs_dfs, ignore_index=True)

playoff_pgs_combined

In [None]:
# Build one frame from the "totals_stats" table of every saved regular-season
# team page, tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
all_totals_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # The table is embedded in an HTML comment. The substring "totals_stats"
    # also occurs in "totals_stats_post", so keep scanning until a comment
    # really contains the element with the exact id.
    table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "totals_stats" not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find(id="totals_stats")
        if table is not None:
            break
    if table is None:
        # Name the offending file instead of failing with a bare StopIteration.
        raise ValueError(f"No totals_stats table found in {team}")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    all_totals_dfs.append(df)

all_totals_combined = pd.concat(all_totals_dfs, ignore_index=True)

In [None]:
# Collect the "totals_stats_post" table from each saved playoff-team page.
def _totals_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_totals_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    post_comments = (c for c in soup.find_all(string=_totals_post_is_comment) if "totals_stats_post" in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="totals_stats_post")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    playoff_totals_dfs.append(df)

playoff_totals_combined = pd.concat(playoff_totals_dfs, ignore_index=True)

In [None]:
# Stack the "per_minute_stats" table of every saved regular-season team page.
all_per36_dfs = []

for team in os.listdir(all_teams):
    # Guard clause: ignore entries that are not saved .html pages.
    if not team.endswith(".html"):
        continue

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Every element carrying the id is parsed; the file name marks its rows.
    for table in soup.find_all(id="per_minute_stats"):
        parsed_tables = pd.read_html(StringIO(str(table)))
        df = parsed_tables[0]
        df["team_id"] = team
        all_per36_dfs.append(df)

all_per36_combined = pd.concat(all_per36_dfs, ignore_index=True)

all_per36_combined

In [None]:
# Stack the "per_minute_stats_post" table of every saved playoff-team page.
playoff_per36_dfs = []

for team in os.listdir(playoff_teams):
    # Guard clause: ignore entries that are not saved .html pages.
    if not team.endswith(".html"):
        continue

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Every element carrying the id is parsed; the file name marks its rows.
    for table in soup.find_all(id="per_minute_stats_post"):
        parsed_tables = pd.read_html(StringIO(str(table)))
        df = parsed_tables[0]
        df["team_id"] = team
        playoff_per36_dfs.append(df)

playoff_per36_combined = pd.concat(playoff_per36_dfs, ignore_index=True)

playoff_per36_combined

In [None]:
# Build one frame from the "per_poss" table of every saved regular-season
# team page, tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
all_per100_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # The table is embedded in an HTML comment. The substring "per_poss" also
    # occurs in "per_poss_post", so keep scanning until a comment really
    # contains the element with the exact id.
    table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "per_poss" not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find(id="per_poss")
        if table is not None:
            break
    if table is None:
        # Name the offending file instead of failing with a bare StopIteration.
        raise ValueError(f"No per_poss table found in {team}")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    all_per100_dfs.append(df)

all_per100_combined = pd.concat(all_per100_dfs, ignore_index=True)

all_per100_combined

In [None]:
# Collect the "per_poss_post" table from each saved playoff-team page.
def _per100_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_per100_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    post_comments = (c for c in soup.find_all(string=_per100_post_is_comment) if "per_poss_post" in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="per_poss_post")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    playoff_per100_dfs.append(df)

playoff_per100_combined = pd.concat(playoff_per100_dfs, ignore_index=True)

playoff_per100_combined

In [None]:
# Collect the "advanced" table from each saved regular-season team page.
def _advanced_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


all_advanced_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Match on the full id attribute text; next() raises StopIteration if no
    # comment contains it.
    advanced_comments = (c for c in soup.find_all(string=_advanced_is_comment) if 'id="advanced"' in c)
    comment = next(advanced_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="advanced")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    all_advanced_dfs.append(df)

all_advanced_combined = pd.concat(all_advanced_dfs, ignore_index=True)

all_advanced_combined

In [None]:
# Collect the "advanced_post" table from each saved playoff-team page.
def _advanced_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_advanced_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    post_comments = (c for c in soup.find_all(string=_advanced_post_is_comment) if "advanced_post" in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="advanced_post")

    df = pd.read_html(StringIO(str(table)))[0]
    df["team_id"] = team
    playoff_advanced_dfs.append(df)

playoff_advanced_combined = pd.concat(playoff_advanced_dfs, ignore_index=True)

playoff_advanced_combined

In [None]:
# Build one frame from the "adj_shooting" table of every saved regular-season
# team page, tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
all_adjshooting_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # The table is embedded in an HTML comment. The substring "adj_shooting"
    # also occurs in "adj_shooting_post", so keep scanning until a comment
    # really contains the element with the exact id.
    table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "adj_shooting" not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find(id="adj_shooting")
        if table is not None:
            break
    if table is None:
        # Name the offending file instead of failing with a bare StopIteration.
        raise ValueError(f"No adj_shooting table found in {team}")

    # header=1: column names are taken from the second header row.
    df = pd.read_html(StringIO(str(table)), header=1)[0]
    df["team_id"] = team
    all_adjshooting_dfs.append(df)

all_adjshooting_combined = pd.concat(all_adjshooting_dfs, ignore_index=True)

all_adjshooting_combined

In [None]:
# Collect the "adj_shooting_post" table from each saved playoff-team page.
def _adjshooting_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_adjshooting_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    post_comments = (c for c in soup.find_all(string=_adjshooting_post_is_comment) if "adj_shooting_post" in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="adj_shooting_post")

    # header=1: column names are taken from the second header row.
    parsed_tables = pd.read_html(StringIO(str(table)), header=1)
    df = parsed_tables[0]
    df["team_id"] = team
    playoff_adjshooting_dfs.append(df)

playoff_adjshooting_combined = pd.concat(playoff_adjshooting_dfs, ignore_index=True)

playoff_adjshooting_combined

In [None]:
# Collect the "shooting" table from each saved regular-season team page.
def _shooting_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


all_shooting_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Match on the full id attribute text; next() raises StopIteration if no
    # comment contains it.
    shooting_comments = (c for c in soup.find_all(string=_shooting_is_comment) if 'id="shooting"' in c)
    comment = next(shooting_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="shooting")

    # header=1: column names are taken from the second header row.
    parsed_tables = pd.read_html(StringIO(str(table)), header=1)
    df = parsed_tables[0]
    df["team_id"] = team
    all_shooting_dfs.append(df)

all_shooting_combined = pd.concat(all_shooting_dfs, ignore_index=True)

all_shooting_combined

In [None]:
# Collect the "shooting_post" table from each saved playoff-team page.
def _shooting_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_shooting_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Match on the full id attribute text; next() raises StopIteration if no
    # comment contains it.
    post_comments = (c for c in soup.find_all(string=_shooting_post_is_comment) if 'id="shooting_post"' in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="shooting_post")

    # header=1: column names are taken from the second header row.
    parsed_tables = pd.read_html(StringIO(str(table)), header=1)
    df = parsed_tables[0]
    df["team_id"] = team
    playoff_shooting_dfs.append(df)

playoff_shooting_combined = pd.concat(playoff_shooting_dfs, ignore_index=True)

playoff_shooting_combined

In [None]:
# Build one frame from the "pbp_stats" table of every saved regular-season
# team page, tagging each row with its source file name.
# NOTE(review): the scraping cell writes team pages as UTF-8 while this reads
# ISO-8859-1 — confirm the encoding, since accented names would be garbled.
all_pbp_dfs = []

for team in os.listdir(all_teams):
    if not team.endswith(".html"):
        continue

    with open(all_teams / team, encoding="ISO-8859-1") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # The table is embedded in an HTML comment. The substring "pbp_stats" also
    # occurs in "pbp_stats_post", so keep scanning until a comment really
    # contains the element with the exact id.
    table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "pbp_stats" not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find(id="pbp_stats")
        if table is not None:
            break
    if table is None:
        # Name the offending file instead of failing with a bare StopIteration.
        raise ValueError(f"No pbp_stats table found in {team}")

    # header=1: column names are taken from the second header row.
    df = pd.read_html(StringIO(str(table)), header=1)[0]
    df["team_id"] = team
    all_pbp_dfs.append(df)

all_pbp_combined = pd.concat(all_pbp_dfs, ignore_index=True)

all_pbp_combined

In [None]:
# Collect the "pbp_stats_post" table from each saved playoff-team page.
def _pbp_post_is_comment(node):
    """Tell whether a parsed text node is an HTML comment."""
    return isinstance(node, Comment)


playoff_pbp_dfs = []

for team in os.listdir(playoff_teams):
    if not team.endswith(".html"):
        continue  # only saved .html pages are parsed

    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # First comment mentioning the table id; next() raises StopIteration if none does.
    post_comments = (c for c in soup.find_all(string=_pbp_post_is_comment) if "pbp_stats_post" in c)
    comment = next(post_comments)
    comment_soup = BeautifulSoup(comment, "html.parser")
    table = comment_soup.find(id="pbp_stats_post")

    # header=1: column names are taken from the second header row.
    parsed_tables = pd.read_html(StringIO(str(table)), header=1)
    df = parsed_tables[0]
    df["team_id"] = team
    playoff_pbp_dfs.append(df)

playoff_pbp_combined = pd.concat(playoff_pbp_dfs, ignore_index=True)

playoff_pbp_combined

In [None]:
# Map each CSV file stem to its combined regular-season frame, then write one
# CSV per entry (without the row index).
# NOTE(review): the "team_and_opponent" frame built earlier in this file is
# not listed here — confirm whether it should be exported too.
all_team_dfs_dict = dict(
    pgs=all_pgs_combined,
    misc=all_misc_combined,
    totals=all_totals_combined,
    per36=all_per36_combined,
    per100=all_per100_combined,
    advanced=all_advanced_combined,
    adjshooting=all_adjshooting_combined,
    shooting=all_shooting_combined,
    pbp=all_pbp_combined,
)

for name in all_team_dfs_dict:
    df = all_team_dfs_dict[name]
    df.to_csv(f"/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/{name}-2425.csv", index=False)

In [None]:
# Map each CSV file stem to its combined playoff frame, then write one CSV per
# entry (without the row index).
playoff_team_dfs_dict = dict(
    playoff_pgs=playoff_pgs_combined,
    playoff_totals=playoff_totals_combined,
    playoff_per36=playoff_per36_combined,
    playoff_per100=playoff_per100_combined,
    playoff_advanced=playoff_advanced_combined,
    playoff_adjshooting=playoff_adjshooting_combined,
    playoff_shooting=playoff_shooting_combined,
    playoff_pbp=playoff_pbp_combined,
)

for name in playoff_team_dfs_dict:
    df = playoff_team_dfs_dict[name]
    df.to_csv(f"/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/{name}-2425.csv", index=False)