## Scraping Using Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By
import time
import random
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
service = Service(executable_path="/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/chromedriver")
driver = webdriver.Chrome(service=service)

In [None]:
# Getting the URLs for every game from each month

nba_months = ["october", "november", "december", "january",
               "february", "march", "april", "may", "june"]

season_25_links = []

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(), options=options)


for month in nba_months:
    url = f"https://www.basketball-reference.com/leagues/NBA_2025_games-{month}.html"
    print(f"Scraping {url}")
    driver.get(url)
    time.sleep(5)


    links = driver.find_elements(By.XPATH, '//a[contains(@href, "/boxscores/") and contains(text(), "Box Score")]')
    for link in links:
        href = link.get_attribute('href')
        if href:
            season_25_links.append(href)


driver.quit()

# Check results
print(season_25_links[:10])

Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-october.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-november.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-december.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-january.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-february.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-march.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-april.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-may.html
Scraping https://www.basketball-reference.com/leagues/NBA_2025_games-june.html
['https://www.basketball-reference.com/boxscores/202410220BOS.html', 'https://www.basketball-reference.com/boxscores/202410220LAL.html', 'https://www.basketball-reference.com/boxscores/202410230DET.html', 'https://www.basketball-reference.com/boxscores/202410230ATL.html

In [3]:
len(season_25_links)

1321

In [9]:
remaining_games = season_25_links[616:]
len(remaining_games)

705

In [None]:
# Going through the list of links and saving the page as an HTML file

game_folder = "/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25"
os.makedirs(game_folder, exist_ok=True)

options = webdriver.ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.plugins": 2,
    "profile.managed_default_content_settings.popups": 2,
    "profile.managed_default_content_settings.geolocation": 2,
    "profile.managed_default_content_settings.notifications": 2
}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(service=Service(), options=options)

for game in remaining_games:
    print(f"Saving {game}")
    driver.get(game)

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "content"))
    )

    html = driver.page_source
    filename = os.path.join(game_folder, game.split("/")[-1])

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)

    time.sleep(random.uniform(1, 4))

driver.quit()

In [None]:
# Getting the season data for individual teams.

nba_team_abb_east = ["CLE", "BOS", "NYK", "IND", "MIL", "DET", "ORL", "ATL", "CHI", "MIA", "TOR", "BRK", "PHI", "CHO", "WAS"]
nba_team_abb_west = ["OKC", "HOU", "LAL", "DEN", "LAC", "GSW", "MIN", "MEM", "SAC", "DAL", "PHO", "POR", "SAS", "NOP", "UTA"]

team_folder = "/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425"
os.makedirs(team_folder, exist_ok=True)

options = webdriver.ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.plugins": 2,
    "profile.managed_default_content_settings.popups": 2,
    "profile.managed_default_content_settings.geolocation": 2,
    "profile.managed_default_content_settings.notifications": 2
}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(service=Service(), options=options)

for team in nba_team_abb_west:
    print(f"Saving {team}")
    driver.get(f"https://www.basketball-reference.com/teams/{team}/2025.html")

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "content"))
    )

    html = driver.page_source
    filename = os.path.join(team_folder, f"{team}-2025.html")

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    
    time.sleep(random.uniform(1, 5))

driver.quit()

Saving OKC
Saving HOU
Saving LAL
Saving DEN
Saving LAC
Saving GSW
Saving MIN
Saving MEM
Saving SAC
Saving DAL
Saving PHO
Saving POR
Saving SAS
Saving NOP
Saving UTA


## Parsing HTML Files for Data

In [2]:
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup, Comment
import re
import os
from pathlib import Path

In [3]:
games_path = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25")
game_data = []

In [4]:
basic_dfs = []

for game in os.listdir(games_path):
    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25/{game}", encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    basic_tables = soup.find_all(id=re.compile(r"^box-.*-game-basic$"))

    table_dfs = []
    for table in basic_tables:
        table_id = table.get("id")
        table_html = str(table)
        df = pd.read_html(StringIO(table_html), header=1)[0]
        df["team_table_id"] = table_id
        df["game_id"] = game
        table_dfs.append(df)

    # Add the opposing team table id
    if len(table_dfs) == 2:
        table_dfs[0]["opp_team_table_id"] = table_dfs[1]["team_table_id"]
        table_dfs[1]["opp_team_table_id"] = table_dfs[0]["team_table_id"]

    basic_dfs.extend(table_dfs)

basic_tables_combined = pd.concat(basic_dfs, ignore_index=True)

In [5]:
basic_tables_combined

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,STL,BLK,TOV,PF,PTS,GmSc,+/-,team_table_id,game_id,opp_team_table_id
0,Shai Gilgeous-Alexander,39:06,16,29,.552,2,6,.333,18,21,...,1,0,3,4,52,37.0,+1,box-OKC-game-basic,202501290GSW.html,box-GSW-game-basic
1,Luguentz Dort,35:46,2,8,.250,0,4,.000,0,0,...,1,0,1,4,4,2.8,-9,box-OKC-game-basic,202501290GSW.html,box-GSW-game-basic
2,Jalen Williams,34:18,12,26,.462,2,6,.333,0,2,...,2,1,1,4,26,16.2,-18,box-OKC-game-basic,202501290GSW.html,box-GSW-game-basic
3,Isaiah Hartenstein,30:58,2,10,.200,0,1,.000,0,0,...,0,1,1,2,4,8.7,-8,box-OKC-game-basic,202501290GSW.html,box-GSW-game-basic
4,Cason Wallace,25:55,1,5,.200,0,3,.000,0,0,...,3,1,2,2,2,2.5,-8,box-OKC-game-basic,202501290GSW.html,box-GSW-game-basic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40378,Kyle Filipowski,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-basic,202410230UTA.html,box-MEM-game-basic
40379,Svi Mykhailiuk,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-basic,202410230UTA.html,box-MEM-game-basic
40380,Micah Potter,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-basic,202410230UTA.html,
40381,Oscar Tshiebwe,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-basic,202410230UTA.html,


In [6]:
advanced_dfs = []

for game in os.listdir(games_path):
    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Games/Games_2024-25/{game}", encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    advanced_tables = soup.find_all(id=re.compile(r"^box-.*-game-advanced$"))

    table_dfs = []
    for table in advanced_tables:
        table_id = table.get("id")
        table_html = str(table)
        df = pd.read_html(StringIO(table_html), header=1)[0]
        df["team_table_id"] = table_id
        df["game_id"] = game
        table_dfs.append(df)

    # Add the opposing team table id
    if len(table_dfs) == 2:
        table_dfs[0]["opp_team_table_id"] = table_dfs[1]["team_table_id"]
        table_dfs[1]["opp_team_table_id"] = table_dfs[0]["team_table_id"]

    advanced_dfs.extend(table_dfs)

advanced_tables_combined = pd.concat(advanced_dfs, ignore_index=True)

In [7]:
advanced_tables_combined

Unnamed: 0,Starters,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,...,BLK%,TOV%,USG%,ORtg,DRtg,BPM,team_table_id,game_id,opp_team_table_id,Unnamed: 16
0,Shai Gilgeous-Alexander,39:06,.680,.586,.207,.724,0.0,9.2,4.0,24.1,...,0.0,7.3,40.8,138,118,17.8,box-OKC-game-advanced,202501290GSW.html,box-GSW-game-advanced,
1,Luguentz Dort,35:46,.250,.250,.500,.000,12.9,3.4,8.8,7.2,...,0.0,11.1,9.7,85,119,-7.2,box-OKC-game-advanced,202501290GSW.html,box-GSW-game-advanced,
2,Jalen Williams,34:18,.484,.500,.231,.077,5.4,17.5,10.6,12.1,...,3.3,3.6,31.5,105,111,4.8,box-OKC-game-advanced,202501290GSW.html,box-GSW-game-advanced,
3,Isaiah Hartenstein,30:58,.200,.200,.100,.000,17.9,46.5,30.3,25.2,...,3.6,9.1,13.8,90,113,-0.9,box-OKC-game-advanced,202501290GSW.html,box-GSW-game-advanced,
4,Cason Wallace,25:55,.200,.200,.600,.000,3.6,9.3,6.0,9.7,...,4.3,28.6,10.5,52,105,-3.2,box-OKC-game-advanced,202501290GSW.html,box-GSW-game-advanced,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40378,Kyle Filipowski,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-advanced,202410230UTA.html,box-MEM-game-advanced,
40379,Svi Mykhailiuk,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-advanced,202410230UTA.html,box-MEM-game-advanced,
40380,Micah Potter,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-advanced,202410230UTA.html,,
40381,Oscar Tshiebwe,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,box-UTA-game-advanced,202410230UTA.html,,


In [10]:
basic_tables_combined.to_csv("/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/BasicBox2425.csv")
advanced_tables_combined.to_csv("/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/AdvancedBox2425.csv")

In [4]:
all_teams = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams")
playoff_teams = Path("/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams")

In [5]:
all_general_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "team_and_opponent" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="team_and_opponent")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        all_general_dfs.append(df)

all_general_dfs = pd.concat(all_general_dfs, ignore_index=True)

In [6]:
all_misc_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "team_misc" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="team_misc")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        all_misc_dfs.append(df)

all_misc_combined = pd.concat(all_misc_dfs, ignore_index=True)

In [8]:
all_pgs_dfs = []

for team in os.listdir(all_teams):
    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    
    soup = BeautifulSoup(page, "html.parser")

    tables = soup.find_all(id="per_game_stats")

    for table in tables:
        table_html = str(table)
        df = pd.read_html(StringIO(table_html))[0]
        df["team_id"] = team
        all_pgs_dfs.append(df)
all_pgs_combined = pd.concat(all_pgs_dfs, ignore_index=True)

all_pgs_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,team_id
0,1.0,Tyrese Maxey,24.0,PG,52,52,37.7,9.2,21.0,0.437,...,3.1,3.3,6.1,1.8,0.4,2.4,2.2,26.3,CPOY-10,PHI-2025.html
1,2.0,Kelly Oubre Jr.,29.0,SF,60,57,34.6,5.8,12.4,0.470,...,4.5,6.1,1.8,1.5,0.5,1.3,3.0,15.1,,PHI-2025.html
2,3.0,Quentin Grimes,24.0,SG,28,25,33.7,7.7,16.4,0.469,...,4.1,5.2,4.5,1.5,0.4,2.9,2.1,21.9,,PHI-2025.html
3,4.0,Paul George,34.0,PF,41,41,32.5,6.0,13.9,0.430,...,4.8,5.3,4.3,1.8,0.5,2.6,2.5,16.2,,PHI-2025.html
4,5.0,Caleb Martin,29.0,SF,31,24,30.4,3.3,7.5,0.435,...,3.0,4.4,2.2,1.1,0.6,1.4,2.0,9.1,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,18.0,Tristan Thompson,33.0,C,40,0,8.2,0.8,1.8,0.437,...,2.6,3.4,0.6,0.1,0.3,0.4,0.9,1.7,,CLE-2025.html
680,19.0,Emoni Bates,21.0,SF,10,0,7.5,1.3,3.8,0.342,...,0.7,0.7,0.8,0.1,0.1,0.2,0.4,3.7,,CLE-2025.html
681,20.0,Luke Travers,23.0,SG,12,0,7.3,0.4,1.7,0.250,...,1.0,1.7,0.7,0.1,0.1,0.7,0.5,1.0,,CLE-2025.html
682,21.0,JT Thor,22.0,PF,9,0,4.7,1.0,1.7,0.600,...,0.3,0.7,0.1,0.2,0.3,0.7,0.7,3.1,,CLE-2025.html


In [9]:
playoff_pgs_dfs = []

for team in os.listdir(playoff_teams):
    with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
        page = f.read()
    
    soup = BeautifulSoup(page, "html.parser")

    tables = soup.find_all(id="per_game_stats_post")

    for table in tables:
        table_html = str(table)
        df = pd.read_html(StringIO(table_html))[0]
        df["team_id"] = team
        playoff_pgs_dfs.append(df)
playoff_pgs_combined = pd.concat(playoff_pgs_dfs, ignore_index=True)

playoff_pgs_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,team_id
0,1.0,Anthony Edwards,23.0,SG,15,15,39.0,9.0,19.9,0.453,...,6.6,7.8,5.5,1.1,0.7,2.6,2.1,25.3,,MIN-2025.html
1,2.0,Julius Randle,30.0,PF,15,15,35.5,7.7,15.3,0.502,...,4.1,5.9,4.9,0.8,0.1,3.3,3.0,21.7,,MIN-2025.html
2,3.0,Jaden McDaniels,24.0,PF,15,15,33.1,5.8,11.3,0.515,...,3.8,5.6,1.5,1.3,0.9,1.2,3.3,14.7,,MIN-2025.html
3,4.0,Rudy Gobert,32.0,C,15,15,27.4,3.1,5.3,0.582,...,5.5,8.6,0.7,0.5,1.2,0.9,2.5,7.9,,MIN-2025.html
4,5.0,Donte DiVincenzo,28.0,SG,15,0,25.1,3.1,8.4,0.365,...,2.3,3.1,3.3,1.4,0.3,1.9,3.0,8.7,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,12.0,Jaylon Tyson,22.0,SG,4,0,7.8,1.8,3.8,0.467,...,1.0,1.8,1.8,0.5,0.3,0.5,1.0,6.0,,CLE-2025.html
231,13.0,Javonte Green,31.0,SG,6,0,6.5,0.7,1.7,0.400,...,1.3,1.5,0.3,0.7,0.0,0.3,0.3,2.5,,CLE-2025.html
232,14.0,Craig Porter Jr.,24.0,PG,6,0,5.8,1.0,2.0,0.500,...,0.5,1.0,1.5,0.5,0.3,0.3,0.3,2.0,,CLE-2025.html
233,15.0,Chuma Okeke,26.0,PF,3,0,4.3,0.3,1.3,0.250,...,0.0,0.0,0.3,0.3,0.0,0.0,0.0,1.0,,CLE-2025.html


In [10]:
all_totals_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "totals_stats" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="totals_stats")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        all_totals_dfs.append(df)

all_totals_combined = pd.concat(all_totals_dfs, ignore_index=True)

In [11]:
playoff_totals_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "totals_stats_post" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="totals_stats_post")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        playoff_totals_dfs.append(df)

playoff_totals_combined = pd.concat(playoff_totals_dfs, ignore_index=True)

In [12]:
all_per36_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        tables = soup.find_all(id="per_minute_stats")

        for table in tables:
            table_html = str(table)
            df = pd.read_html(StringIO(table_html))[0]
            df["team_id"] = team
            all_per36_dfs.append(df)

all_per36_combined = pd.concat(all_per36_dfs, ignore_index=True)

all_per36_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,team_id
0,1,Kelly Oubre Jr.,29,SF,60,57,2078,6.0,12.8,0.470,...,4.7,6.4,1.9,1.6,0.5,1.4,3.1,15.7,,PHI-2025.html
1,2,Tyrese Maxey,24,PG,52,52,1960,8.8,20.0,0.437,...,2.9,3.2,5.8,1.7,0.4,2.3,2.1,25.1,CPOY-10,PHI-2025.html
2,3,Guerschon Yabusele,29,C,70,43,1895,5.4,10.7,0.501,...,5.0,7.5,2.8,1.1,0.5,1.7,3.1,14.6,,PHI-2025.html
3,4,Paul George,34,PF,41,41,1334,6.6,15.4,0.430,...,5.3,5.9,4.8,2.0,0.5,2.9,2.7,17.9,,PHI-2025.html
4,5,Ricky Council IV,23,SF,73,12,1250,5.1,13.3,0.382,...,4.7,6.2,2.8,0.9,0.4,1.5,1.9,15.3,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,17,Luke Travers,23,SG,12,0,88,2.0,8.2,0.250,...,4.9,8.2,3.3,0.4,0.4,3.3,2.5,4.9,,CLE-2025.html
650,18,Emoni Bates,21,SF,10,0,75,6.2,18.2,0.342,...,3.4,3.4,3.8,0.5,0.5,1.0,1.9,17.8,,CLE-2025.html
651,19,Nae'Qwan Tomlin,24,PF,5,1,63,7.4,18.3,0.406,...,6.9,12.0,1.1,0.0,0.6,1.1,5.1,20.6,,CLE-2025.html
652,20,JT Thor,22,PF,9,0,42,7.7,12.9,0.600,...,2.6,5.1,0.9,1.7,2.6,5.1,5.1,24.0,,CLE-2025.html


In [13]:
playoff_per36_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        tables = soup.find_all(id="per_minute_stats_post")

        for table in tables:
            table_html = str(table)
            df = pd.read_html(StringIO(table_html))[0]
            df["team_id"] = team
            playoff_per36_dfs.append(df)

playoff_per36_combined = pd.concat(playoff_per36_dfs, ignore_index=True)

playoff_per36_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,team_id
0,1,Anthony Edwards,23,SG,15,15,585,8.3,18.3,0.453,...,6.1,7.2,5.0,1.0,0.7,2.4,2.0,23.4,,MIN-2025.html
1,2,Julius Randle,30,PF,15,15,533,7.8,15.5,0.502,...,4.1,5.9,5.0,0.8,0.1,3.3,3.0,22.0,,MIN-2025.html
2,3,Jaden McDaniels,24,PF,15,15,497,6.3,12.2,0.515,...,4.1,6.1,1.7,1.4,0.9,1.3,3.6,15.9,,MIN-2025.html
3,4,Rudy Gobert,32,C,15,15,411,4.0,6.9,0.582,...,7.2,11.3,1.0,0.7,1.6,1.1,3.3,10.3,,MIN-2025.html
4,5,Donte DiVincenzo,28,SG,15,0,377,4.4,12.0,0.365,...,3.2,4.4,4.8,2.0,0.4,2.7,4.3,12.4,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,11,Javonte Green,31,SG,6,0,39,3.7,9.2,0.400,...,7.4,8.3,1.8,3.7,0.0,1.8,1.8,13.8,,CLE-2025.html
215,12,Craig Porter Jr.,24,PG,6,0,35,6.2,12.3,0.500,...,3.1,6.2,9.3,3.1,2.1,2.1,2.1,12.3,,CLE-2025.html
216,13,Jaylon Tyson,22,SG,4,0,31,8.1,17.4,0.467,...,4.6,8.1,8.1,2.3,1.2,2.3,4.6,27.9,,CLE-2025.html
217,14,Tristan Thompson,33,C,3,0,29,3.7,7.4,0.500,...,16.1,18.6,1.2,1.2,2.5,2.5,2.5,8.7,,CLE-2025.html


In [14]:
all_per100_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "per_poss" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="per_poss")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        all_per100_dfs.append(df)

all_per100_combined = pd.concat(all_per100_dfs, ignore_index=True)

all_per100_combined

Unnamed: 0,Rk,Player,Age,G,GS,MP,FG,FGA,FG%,3P,...,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,Awards,team_id
0,1,Kelly Oubre Jr.,29,60,57,2078,8.3,17.6,0.470,1.7,...,2.6,2.2,0.7,1.9,4.3,21.5,111,118,,PHI-2025.html
1,2,Tyrese Maxey,24,52,52,1960,12.0,27.4,0.437,4.0,...,8.0,2.3,0.5,3.1,2.9,34.4,114,119,CPOY-10,PHI-2025.html
2,3,Guerschon Yabusele,29,70,43,1895,7.3,14.6,0.501,2.7,...,3.9,1.5,0.6,2.3,4.3,20.0,122,119,,PHI-2025.html
3,4,Paul George,34,41,41,1334,9.1,21.1,0.430,3.5,...,6.6,2.8,0.7,3.9,3.7,24.6,104,116,,PHI-2025.html
4,5,Ricky Council IV,23,73,12,1250,7.0,18.3,0.382,2.2,...,3.8,1.3,0.5,2.1,2.6,21.0,106,120,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,17,Luke Travers,23,12,0,88,2.7,10.9,0.250,0.0,...,4.4,0.5,0.5,4.4,3.3,6.6,76,115,,CLE-2025.html
650,18,Emoni Bates,21,10,0,75,8.3,24.4,0.342,7.1,...,5.1,0.6,0.6,1.3,2.6,23.7,105,117,,CLE-2025.html
651,19,Nae'Qwan Tomlin,24,5,1,63,9.9,24.4,0.406,1.5,...,1.5,0.0,0.8,1.5,6.9,27.5,112,115,,CLE-2025.html
652,20,JT Thor,22,9,0,42,10.3,17.2,0.600,3.4,...,1.1,2.3,3.4,6.9,6.9,32.1,114,110,,CLE-2025.html


In [15]:
playoff_per100_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "per_poss_post" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="per_poss_post")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        playoff_per100_dfs.append(df)

playoff_per100_combined = pd.concat(playoff_per100_dfs, ignore_index=True)

playoff_per100_combined

Unnamed: 0,Rk,Player,Age,G,GS,MP,FG,FGA,FG%,3P,...,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,Awards,team_id
0,1,Anthony Edwards,23,15,15,585,11.6,25.7,0.453,4.0,...,7.1,1.5,0.9,3.4,2.8,32.8,116.0,111,,MIN-2025.html
1,2,Julius Randle,30,15,15,533,10.9,21.7,0.502,2.8,...,7.0,1.1,0.2,4.6,4.3,30.9,120.0,115,,MIN-2025.html
2,3,Jaden McDaniels,24,15,15,497,8.8,17.2,0.515,2.1,...,2.3,1.9,1.3,1.8,5.1,22.3,122.0,111,,MIN-2025.html
3,4,Rudy Gobert,32,15,15,411,5.6,9.7,0.582,0.0,...,1.4,1.0,2.2,1.6,4.7,14.5,124.0,110,,MIN-2025.html
4,5,Donte DiVincenzo,28,15,0,377,6.2,16.9,0.365,3.7,...,6.7,2.8,0.5,3.7,6.0,17.4,101.0,111,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,11,Javonte Green,31,6,0,39,5.1,12.7,0.400,1.3,...,2.5,5.1,0.0,2.5,2.5,19.1,118.0,103,,CLE-2025.html
215,12,Craig Porter Jr.,24,6,0,35,8.5,17.0,0.500,0.0,...,12.8,4.3,2.8,2.8,2.8,17.0,123.0,104,,CLE-2025.html
216,13,Jaylon Tyson,22,4,0,31,11.2,24.0,0.467,8.0,...,11.2,3.2,1.6,3.2,6.4,38.4,145.0,107,,CLE-2025.html
217,14,Tristan Thompson,33,3,0,29,5.1,10.3,0.500,0.0,...,1.7,1.7,3.4,3.4,3.4,12.0,91.0,100,,CLE-2025.html


In [16]:
all_advanced_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if 'id="advanced"' in c)
        table = BeautifulSoup(comment, "html.parser").find(id="advanced")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        all_advanced_dfs.append(df)

all_advanced_combined = pd.concat(all_advanced_dfs, ignore_index=True)

all_advanced_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Awards,team_id
0,1.0,Kelly Oubre Jr.,29.0,SF,60,57,2078,13.5,0.551,0.327,...,1.2,1.4,2.6,0.061,-1.7,-0.6,-2.3,-0.1,,PHI-2025.html
1,2.0,Tyrese Maxey,24.0,PG,52,52,1960,19.5,0.562,0.438,...,2.9,1.0,3.8,0.094,3.2,-1.1,2.0,2.0,CPOY-10,PHI-2025.html
2,3.0,Guerschon Yabusele,29.0,C,70,43,1895,14.7,0.616,0.487,...,2.9,1.0,3.9,0.099,0.1,-1.1,-1.0,0.5,,PHI-2025.html
3,4.0,Paul George,34.0,PF,41,41,1334,14.5,0.543,0.470,...,-0.2,1.2,1.0,0.036,-0.5,0.1,-0.4,0.5,,PHI-2025.html
4,5.0,Ricky Council IV,23.0,SF,73,12,1250,11.6,0.502,0.460,...,0.1,0.5,0.6,0.023,-2.0,-2.3,-4.3,-0.7,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,18.0,Emoni Bates,21.0,SF,10,0,75,9.8,0.487,0.789,...,0.0,0.1,0.1,0.038,-0.5,-1.0,-1.5,0.0,,CLE-2025.html
680,19.0,Nae'Qwan Tomlin,24.0,PF,5,1,63,14.4,0.489,0.313,...,0.1,0.1,0.1,0.093,-3.0,-2.4,-5.5,-0.1,,CLE-2025.html
681,20.0,JT Thor,22.0,PF,9,0,42,19.1,0.756,0.400,...,0.0,0.1,0.1,0.132,-1.2,1.3,0.0,0.0,,CLE-2025.html
682,21.0,Chuma Okeke,26.0,PF,2,0,25,6.8,0.357,0.857,...,0.0,0.0,0.0,0.028,-3.6,-0.5,-4.1,0.0,,CLE-2025.html


In [17]:
playoff_advanced_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "advanced_post" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="advanced_post")
        df = pd.read_html(StringIO(str(table)))[0]
        df["team_id"] = team
        playoff_advanced_dfs.append(df)

playoff_advanced_combined = pd.concat(playoff_advanced_dfs, ignore_index=True)

playoff_advanced_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Awards,team_id
0,1.0,Anthony Edwards,23.0,SG,15,15,585,20.2,0.564,0.436,...,1.1,0.7,1.9,0.154,4.8,1.2,6.0,1.2,,MIN-2025.html
1,2.0,Julius Randle,30.0,PF,15,15,533,18.9,0.622,0.341,...,1.2,0.5,1.7,0.152,3.4,0.8,4.2,0.8,,MIN-2025.html
2,3.0,Jaden McDaniels,24.0,PF,15,15,497,15.6,0.607,0.325,...,0.9,0.6,1.5,0.143,0.6,2.0,2.6,0.6,,MIN-2025.html
3,4.0,Rudy Gobert,32.0,C,15,15,411,14.4,0.584,0.000,...,0.6,0.6,1.2,0.138,-0.8,0.4,-0.3,0.2,,MIN-2025.html
4,5.0,Donte DiVincenzo,28.0,SG,15,0,377,9.8,0.493,0.698,...,-0.1,0.5,0.4,0.046,-2.1,1.5,-0.5,0.1,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,12.0,Craig Porter Jr.,24.0,PG,6,0,35,21.8,0.500,0.250,...,0.1,0.1,0.1,0.202,3.0,5.4,8.4,0.1,,CLE-2025.html
231,13.0,Jaylon Tyson,22.0,SG,4,0,31,32.8,0.680,0.600,...,0.2,0.1,0.3,0.406,10.3,5.4,15.6,0.1,,CLE-2025.html
232,14.0,Tristan Thompson,33.0,C,3,0,29,13.5,0.478,0.000,...,0.0,0.1,0.0,0.067,-3.3,2.5,-0.9,0.0,,CLE-2025.html
233,15.0,Chuma Okeke,26.0,PF,3,0,13,7.1,0.375,1.000,...,0.0,0.0,0.0,0.035,-2.1,0.3,-1.8,0.0,,CLE-2025.html


In [18]:
all_adjshooting_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "adj_shooting" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="adj_shooting")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        all_adjshooting_dfs.append(df)

all_adjshooting_combined = pd.concat(all_adjshooting_dfs, ignore_index=True)

all_adjshooting_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG%,2P%,3P%,...,3P+,eFG+,FT+,TS+,FTr+,3PAr+,FG Add,TS Add,Awards,team_id
0,1.0,Marcus Bagley,23.0,SF,10,4,253,0.391,0.595,0.156,...,43.0,79.0,103.0,79.0,60.0,110.0,-15.9,-17.6,,PHI-2025.html
1,2.0,Adem Bona,21.0,C,58,11,905,0.703,0.707,0.000,...,0.0,129.0,86.0,124.0,214.0,1.0,61.5,65.0,,PHI-2025.html
2,3.0,Oshae Brissett,26.0,SF,6,2,142,0.487,0.619,0.333,...,93.0,104.0,73.0,100.0,148.0,110.0,1.6,0.0,,PHI-2025.html
3,4.0,Jared Butler,24.0,SG,28,17,682,0.426,0.490,0.352,...,98.0,93.0,112.0,94.0,68.0,110.0,-19.9,-21.5,,PHI-2025.html
4,5.0,Colin Castleton,24.0,C,5,0,98,0.500,0.600,0.000,...,0.0,92.0,85.0,93.0,154.0,40.0,-2.1,-2.2,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,18.0,Nae'Qwan Tomlin,24.0,PF,5,1,63,0.406,0.500,0.200,...,56.0,81.0,93.0,85.0,142.0,74.0,-6.8,-6.5,,CLE-2025.html
680,19.0,Luke Travers,23.0,SG,12,0,88,0.250,0.455,0.000,...,0.0,46.0,128.0,50.0,41.0,107.0,-11.7,-12.1,,CLE-2025.html
681,20.0,Jaylon Tyson,22.0,SG,47,3,453,0.430,0.484,0.345,...,96.0,91.0,102.0,91.0,66.0,92.0,-13.8,-16.9,,CLE-2025.html
682,21.0,Dean Wade,28.0,PF,59,30,1252,0.413,0.653,0.360,...,100.0,103.0,68.0,98.0,46.0,195.0,9.7,-7.5,,CLE-2025.html


In [19]:
playoff_adjshooting_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "adj_shooting_post" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="adj_shooting_post")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        playoff_adjshooting_dfs.append(df)

playoff_adjshooting_combined = pd.concat(playoff_adjshooting_dfs, ignore_index=True)

playoff_adjshooting_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG%,2P%,3P%,...,FG+,2P+,3P+,eFG+,FT+,TS+,FTr+,3PAr+,Awards,team_id
0,1.0,Nickeil Alexander-Walker,26.0,SG,15,0,310,0.389,0.440,0.349,...,83.0,81.0,97.0,90.0,113.0,90.0,62.0,132.0,,MIN-2025.html
1,2.0,Jaylen Clark,23.0,SG,5,0,28,0.800,1.000,0.000,...,171.0,183.0,0.0,147.0,96.0,141.0,329.0,47.0,,MIN-2025.html
2,3.0,Mike Conley,37.0,PG,15,15,356,0.302,0.250,0.333,...,65.0,46.0,93.0,75.0,118.0,77.0,56.0,148.0,,MIN-2025.html
3,4.0,Rob Dillingham,20.0,PG,3,0,16,0.375,0.333,0.500,...,80.0,61.0,139.0,81.0,64.0,78.0,103.0,59.0,,MIN-2025.html
4,5.0,Donte DiVincenzo,28.0,SG,15,0,377,0.365,0.474,0.318,...,78.0,87.0,88.0,88.0,99.0,86.0,42.0,166.0,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,12.0,Max Strus,28.0,SF,9,9,253,0.416,0.500,0.388,...,89.0,92.0,108.0,103.0,107.0,99.0,28.0,179.0,,CLE-2025.html
231,13.0,Tristan Thompson,33.0,C,3,0,29,0.500,0.500,,...,107.0,92.0,,92.0,43.0,83.0,206.0,0.0,,CLE-2025.html
232,14.0,Jaylon Tyson,22.0,SG,4,0,31,0.467,0.333,0.556,...,100.0,61.0,154.0,117.0,107.0,118.0,165.0,142.0,,CLE-2025.html
233,15.0,Dean Wade,28.0,PF,9,1,142,0.333,0.750,0.214,...,71.0,138.0,59.0,77.0,,72.0,0.0,185.0,,CLE-2025.html


In [20]:
all_shooting_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if 'id="shooting"' in c)
        table = BeautifulSoup(comment, "html.parser").find(id="shooting")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        all_shooting_dfs.append(df)

all_shooting_combined = pd.concat(all_shooting_dfs, ignore_index=True)

all_shooting_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG%,Dist.,2P,...,2P.2,3P.2,%FGA,#,%3PA,3P%,Att.,Md.,Awards,team_id
0,1.0,Kelly Oubre Jr.,29.0,SF,60,57,2078,0.470,11.3,0.673,...,0.534,0.972,0.097,63,0.318,0.351,0,0,,PHI-2025.html
1,2.0,Tyrese Maxey,24.0,PG,52,52,1960,0.437,15.1,0.562,...,0.307,0.491,0.014,15,0.084,0.275,3,0,CPOY-10,PHI-2025.html
2,3.0,Guerschon Yabusele,29.0,C,70,43,1895,0.501,14.4,0.513,...,0.725,1.000,0.089,44,0.241,0.364,3,0,,PHI-2025.html
3,4.0,Paul George,34.0,PF,41,41,1334,0.430,18.0,0.530,...,0.282,0.656,0.021,8,0.175,0.277,1,0,,PHI-2025.html
4,5.0,Ricky Council IV,23.0,SF,73,12,1250,0.382,15.2,0.540,...,0.516,0.927,0.071,31,0.183,0.410,4,0,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,18.0,Emoni Bates,21.0,SF,10,0,75,0.342,23.4,0.211,...,0.000,0.909,0.000,0,0.100,0.667,0,0,,CLE-2025.html
680,19.0,Nae'Qwan Tomlin,24.0,PF,5,1,63,0.406,10.2,0.688,...,0.455,1.000,0.094,2,0.000,,0,0,,CLE-2025.html
681,20.0,JT Thor,22.0,PF,9,0,42,0.600,15.8,0.600,...,0.833,1.000,0.067,1,0.500,0.333,0,0,,CLE-2025.html
682,21.0,Chuma Okeke,26.0,PF,2,0,25,0.286,21.5,0.143,...,1.000,1.000,0.143,1,0.500,0.333,0,0,,CLE-2025.html


In [21]:
playoff_shooting_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if 'id="shooting_post"' in c)
        table = BeautifulSoup(comment, "html.parser").find(id="shooting_post")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        playoff_shooting_dfs.append(df)

playoff_shooting_combined = pd.concat(playoff_shooting_dfs, ignore_index=True)

playoff_shooting_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,FG%,Dist.,2P,...,2P.2,3P.2,%FGA,#,%3PA,3P%,Att.,Md.,Awards,team_id
0,1.0,Anthony Edwards,23.0,SG,15,15,585,0.453,15.2,0.564,...,0.382,0.609,0.034,9,0.100,0.462,3,0,,MIN-2025.html
1,2.0,Julius Randle,30.0,PF,15,15,533,0.502,12.8,0.659,...,0.447,0.700,0.013,3,0.205,0.250,0,0,,MIN-2025.html
2,3.0,Jaden McDaniels,24.0,PF,15,15,497,0.515,12.0,0.675,...,0.606,1.000,0.095,14,0.600,0.364,0,0,,MIN-2025.html
3,4.0,Rudy Gobert,32.0,C,15,15,411,0.582,2.1,1.000,...,0.630,,0.392,29,,,0,0,,MIN-2025.html
4,5.0,Donte DiVincenzo,28.0,SG,15,0,377,0.365,19.6,0.302,...,0.500,0.929,0.016,2,0.284,0.480,2,0,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,12.0,Craig Porter Jr.,24.0,PG,6,0,35,0.500,10.0,0.750,...,0.167,,0.083,1,0.000,,0,0,,CLE-2025.html
231,13.0,Jaylon Tyson,22.0,SG,4,0,31,0.467,16.3,0.400,...,1.000,1.000,0.133,1,0.444,1.000,0,0,,CLE-2025.html
232,14.0,Tristan Thompson,33.0,C,3,0,29,0.500,4.9,1.000,...,0.667,,0.000,0,,,0,0,,CLE-2025.html
233,15.0,Chuma Okeke,26.0,PF,3,0,13,0.250,28.0,0.000,...,,1.000,0.000,0,0.000,,0,0,,CLE-2025.html


In [22]:
all_pbp_dfs = []

for team in os.listdir(all_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/All_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "pbp_stats" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="pbp_stats")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        all_pbp_dfs.append(df)

all_pbp_combined = pd.concat(all_pbp_dfs, ignore_index=True)

all_pbp_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,PG%,SG%,SF%,...,LostBall,Shoot,Off.,Shoot.1,Off..1,PGA,And1,Blkd,Awards,team_id
0,1.0,Kelly Oubre Jr.,29.0,SF,60,57,2078,0,8,52,...,37,84,9,96,22,267,30,66,,PHI-2025.html
1,2.0,Tyrese Maxey,24.0,PG,52,52,1960,83,17,0,...,41,56,7,122,8,791,38,61,CPOY-10,PHI-2025.html
2,3.0,Guerschon Yabusele,29.0,C,70,43,1895,0,0,0,...,33,76,17,62,13,357,13,31,,PHI-2025.html
3,4.0,Paul George,34.0,PF,41,41,1334,0,0,9,...,29,42,12,44,7,435,11,20,,PHI-2025.html
4,5.0,Ricky Council IV,23.0,SF,73,12,1250,0,11,64,...,16,31,8,77,5,239,16,25,,PHI-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,18.0,Emoni Bates,21.0,SF,10,0,75,0,3,62,...,1,1,0,0,0,18,0,0,,CLE-2025.html
680,19.0,Nae'Qwan Tomlin,24.0,PF,5,1,63,0,0,0,...,0,6,0,7,0,6,3,4,,CLE-2025.html
681,20.0,JT Thor,22.0,PF,9,0,42,0,0,0,...,2,2,1,4,0,3,0,0,,CLE-2025.html
682,21.0,Chuma Okeke,26.0,PF,2,0,25,0,0,8,...,0,0,0,0,0,6,0,0,,CLE-2025.html


In [23]:
playoff_pbp_dfs = []

for team in os.listdir(playoff_teams):
    if team.endswith(".html"):
        with open(f"/Users/camsmithers/Desktop/Camalytics/CamalyticsEnv/Projects/Sports/NBA/Teams/Teams_2425/Playoff_Teams/{team}", encoding="ISO-8859-1") as f:
            page = f.read()
        
        soup = BeautifulSoup(page, "html.parser")

        comment = next(c for c in soup.find_all(string=lambda text: isinstance(text, Comment)) if "pbp_stats_post" in c)
        table = BeautifulSoup(comment, "html.parser").find(id="pbp_stats_post")
        df = pd.read_html(StringIO(str(table)), header=1)[0]
        df["team_id"] = team
        playoff_pbp_dfs.append(df)

playoff_pbp_combined = pd.concat(playoff_pbp_dfs, ignore_index=True)

playoff_pbp_combined

Unnamed: 0,Rk,Player,Age,Pos,G,GS,MP,PG%,SG%,SF%,...,LostBall,Shoot,Off.,Shoot.1,Off..1,PGA,And1,Blkd,Awards,team_id
0,1.0,Anthony Edwards,23.0,SG,15,15,585,1,65,33,...,18,14,3,39,3,205,9,10,,MIN-2025.html
1,2.0,Julius Randle,30.0,PF,15,15,533,0,0,1,...,18,14,11,35,2,186,9,14,,MIN-2025.html
2,3.0,Jaden McDaniels,24.0,PF,15,15,497,0,1,87,...,4,24,3,13,2,59,4,8,,MIN-2025.html
3,4.0,Rudy Gobert,32.0,C,15,15,411,0,0,0,...,4,23,4,21,1,28,1,10,,MIN-2025.html
4,5.0,Donte DiVincenzo,28.0,SG,15,0,377,80,20,0,...,6,17,8,2,2,113,1,3,,MIN-2025.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,12.0,Craig Porter Jr.,24.0,PG,6,0,35,100,0,0,...,1,2,0,0,0,24,0,0,,CLE-2025.html
231,13.0,Jaylon Tyson,22.0,SG,4,0,31,0,0,42,...,0,1,0,3,0,15,0,1,,CLE-2025.html
232,14.0,Tristan Thompson,33.0,C,3,0,29,0,0,0,...,0,0,2,1,0,3,1,0,,CLE-2025.html
233,15.0,Chuma Okeke,26.0,PF,3,0,13,0,0,0,...,0,0,0,0,0,3,0,0,,CLE-2025.html


In [24]:
all_team_dfs_dict = {
    "pgs": all_pgs_combined,
    "misc": all_misc_combined,
    "totals": all_totals_combined,
    "per36": all_per36_combined,
    "per100": all_per100_combined,
    "advanced": all_advanced_combined,
    "adjshooting": all_adjshooting_combined,
    "shooting": all_shooting_combined,
    "pbp": all_pbp_combined
}

for name, df in all_team_dfs_dict.items():
    df.to_csv(f"/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/{name}-2425.csv", index=False)

In [25]:
playoff_team_dfs_dict = {
    "playoff_pgs": playoff_pgs_combined,
    "playoff_totals": playoff_totals_combined,
    "playoff_per36": playoff_per36_combined,
    "playoff_per100": playoff_per100_combined,
    "playoff_advanced": playoff_advanced_combined,
    "playoff_adjshooting": playoff_adjshooting_combined,
    "playoff_shooting": playoff_shooting_combined,
    "playoff_pbp": playoff_pbp_combined
}

for name, df in playoff_team_dfs_dict.items():
    df.to_csv(f"/Users/camsmithers/Desktop/Camalytics/NBA/Data-NBA/{name}-2425.csv", index=False)