In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from random import randint

# Step 1: Send an HTTP request to the schedule page that contains the table to scrape
# (replace the URL to scrape a different month or season).
url = 'https://www.basketball-reference.com/leagues/NBA_2024_games-january.html'

# A timeout keeps the cell from hanging forever if the server never responds.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    print('Successfully fetched the webpage')
else:
    # Raise instead of calling exit(): exit() ends the interpreter/kernel session
    # rather than just stopping this cell, and a raised error also prevents the
    # following cells from silently running against a bad response on "Run All".
    raise RuntimeError(f'Failed to fetch the webpage: {response.status_code}')

Successfully fetched the webpage


In [6]:

# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Step 3: Find the schedule table in the HTML by its id
table = soup.find('table', {'id': 'schedule'})
if table is None:
    # Fail with an explicit message instead of an AttributeError on table.find(...) below.
    raise ValueError("No <table id='schedule'> found in the fetched page")

# Step 4: Extract the table headers (stripped text of every <th> in <thead>)
headers = [th.text.strip() for th in table.find('thead').find_all('th')]

# Step 5: Extract the table rows
rows = []
for row in table.find('tbody').find_all('tr'):
    # A row without any <td> has no data cells (e.g. a header/divider row inside
    # <tbody>); keeping it would produce a short, misaligned record.
    if row.find('td') is None:
        continue

    row_data = [cell.text.strip() for cell in row.find_all(['th', 'td'])]

    # Step 6: Find the specific <td> tag with data-stat="box_score_text"
    boxscore_td = row.find('td', {'data-stat': 'box_score_text'})
    boxscore_link = boxscore_td.find('a', href=True) if boxscore_td is not None else None

    # Append the absolute box score URL, or None when the cell/link is missing
    # or the anchor text is not exactly 'Box Score'.
    if boxscore_link is not None and boxscore_link.text == 'Box Score':
        row_data.append('https://www.basketball-reference.com' + boxscore_link['href'])
    else:
        row_data.append(None)

    rows.append(row_data)

# Step 7: Convert the scraped data into a DataFrame
headers.append('Boxscore Link')  # Add the boxscore link as a new column
df = pd.DataFrame(rows, columns=headers)

# Step 8: Save the DataFrame to a CSV file
df.to_csv('basketball_games_with_boxscores.csv', index=False)

print('Scraping completed and data saved to CSV!')

Scraping completed and data saved to CSV!


In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to scrape advanced box score data from a given boxscore link
def scrape_advanced_boxscore_data(boxscore_url, data_stats, timeout=30):
    """Scrape the footer-row advanced stats for both teams from one box score page.

    Args:
        boxscore_url: URL of the box score page to fetch.
        data_stats: Collection of ``data-stat`` attribute values to keep.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict mapping ``team1_<data_stat>`` / ``team2_<data_stat>`` to the
        stripped text of the matching ``<td>`` in each table's ``<tfoot>``.
        The dict may be empty if no requested stat is present. Returns ``None``
        when the page cannot be fetched, fewer than two "Advanced Box Score
        Stats" tables are found, or parsing hits an ``AttributeError``.
    """
    # Network failures (timeout, DNS, connection reset) are reported and turned
    # into None so one bad page does not abort a long scraping run.
    try:
        response = requests.get(boxscore_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch boxscore at {boxscore_url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch boxscore at {boxscore_url}")
        return None

    # Parse the HTML content of the boxscore page
    soup = BeautifulSoup(response.content, 'html.parser')

    boxscore_data = {}
    try:
        # Step 1: Find the over-header cells that label the advanced tables
        th_tags = soup.find_all('th', class_='over_header center', string='Advanced Box Score Stats')
        if not th_tags:
            print(f"No <th> tags found with class 'over_header center' and text 'Advanced Box Score Stats' for {boxscore_url}")
            return None

        # Step 2: Resolve each header cell to its enclosing <table>
        advanced_tables = []
        for th in th_tags:
            parent_table = th.find_parent('table')
            if parent_table is not None:
                advanced_tables.append(parent_table)

        if len(advanced_tables) < 2:
            print(f"Expected 2 advanced box stats tables but found {len(advanced_tables)} for {boxscore_url}")
            return None

        # Step 3: The first table is keyed as team1, the second as team2
        for team_number, team_table in enumerate(advanced_tables[:2], start=1):
            boxscore_data.update(
                _extract_team_totals(team_table, data_stats, f'team{team_number}')
            )

    except AttributeError as e:
        print(f"Error extracting advanced box score data: {e}")
        return None

    # Combined dictionary for both teams
    return boxscore_data


def _extract_team_totals(table, data_stats, prefix):
    """Return {'<prefix>_<data-stat>': text} for requested <td> cells in the table's <tfoot>."""
    totals = {}
    tfoot = table.find('tfoot')
    if tfoot is None:
        return totals
    for td in tfoot.find_all('td'):
        data_stat = td.get('data-stat')
        if data_stat in data_stats:
            totals[f'{prefix}_{data_stat}'] = td.text.strip()
    return totals

# Main function to load CSV, scrape advanced boxscore data, and update the table
def main(input_csv='basketball_games_with_boxscores.csv',
         output_csv='2024january.csv',
         delay_seconds=5):
    """Load the schedule CSV, scrape advanced stats per game, and save an enriched CSV.

    For every row with a non-null 'Boxscore Link', calls
    ``scrape_advanced_boxscore_data`` and stores the returned values in
    'Team 1 <label>' / 'Team 2 <label>' columns. Rows without a link, or whose
    scrape returned nothing, get ``None`` in every added column so the new
    columns stay aligned with the original rows.

    Args:
        input_csv: Path of the CSV to read; must contain a 'Boxscore Link' column.
        output_csv: Path the enriched CSV is written to.
        delay_seconds: Pause after each scrape attempt, to avoid overloading the server.
    """
    # Step 1: Load the CSV file generated from the previous program
    df = pd.read_csv(input_csv)

    # Step 2: Single source of truth: data-stat attribute -> output column label.
    # To scrape another stat, add one entry here.
    stat_labels = {
        'ts_pct': 'TS PCT',            # True shooting percentage
        'efg_pct': 'EFG PCT',          # Effective field goal percentage
        'fg3a_per_fga_pct': '3P AR',   # 3-point attempt rate
        'fta_per_fga_pct': 'FT AR',    # Free throw attempt rate
        'trb_pct': 'TRB PCT',          # Total rebound percentage
        'ast_pct': 'AST PCT',          # Assist percentage
        'stl_pct': 'STL PCT',          # Steal percentage
        'blk_pct': 'BLK PCT',          # Block percentage
        'tov_pct': 'TO PCT',           # Turnover percentage
        'off_rtg': 'OFF RT',           # Offensive rating
        'def_rtg': 'DEF RT',           # Defensive rating
    }
    data_stats = list(stat_labels)

    # (key in the scraped dict, output column name), in output order:
    # Team 1 then Team 2 for each stat.
    column_map = [
        (f'team{team}_{stat}', f'Team {team} {label}')
        for stat, label in stat_labels.items()
        for team in (1, 2)
    ]

    # Step 3: Loop over each row and scrape advanced box score data for each game
    total_games = len(df)
    scraped_rows = []
    for index, row in df.iterrows():
        boxscore_url = row['Boxscore Link']
        advanced_boxscore_data = None

        if pd.notna(boxscore_url):  # Only scrape if the boxscore link is present
            print(f"Scraping advanced boxscore for game {index + 1} of {total_games}")
            advanced_boxscore_data = scrape_advanced_boxscore_data(boxscore_url, data_stats)

            # Add a delay to avoid overloading the server (polite scraping)
            if delay_seconds > 0:
                time.sleep(delay_seconds)

        # A missing link or a failed/empty scrape yields None for every stat.
        stats = advanced_boxscore_data or {}
        scraped_rows.append([stats.get(key) for key, _ in column_map])

    # Step 4: Add the scraped data as new columns in the DataFrame
    for position, (_, column_name) in enumerate(column_map):
        df[column_name] = [values[position] for values in scraped_rows]

    # Step 5: Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print('Updated data with advanced stats saved to CSV!')

# Run the scrape when this code is executed directly (as a script or as a
# notebook cell); importing it as a module only defines the functions.
if __name__ == "__main__":
    main()

Scraping advanced boxscore for game 1 of 208
Scraping advanced boxscore for game 2 of 208
Scraping advanced boxscore for game 3 of 208
Scraping advanced boxscore for game 4 of 208
Scraping advanced boxscore for game 5 of 208
Scraping advanced boxscore for game 6 of 208
Scraping advanced boxscore for game 7 of 208
Scraping advanced boxscore for game 8 of 208
Scraping advanced boxscore for game 9 of 208
Scraping advanced boxscore for game 10 of 208
Scraping advanced boxscore for game 11 of 208
Scraping advanced boxscore for game 12 of 208
Scraping advanced boxscore for game 13 of 208
Scraping advanced boxscore for game 14 of 208
Scraping advanced boxscore for game 15 of 208
Scraping advanced boxscore for game 16 of 208
Scraping advanced boxscore for game 17 of 208
Scraping advanced boxscore for game 18 of 208
Scraping advanced boxscore for game 19 of 208
Scraping advanced boxscore for game 20 of 208
Scraping advanced boxscore for game 21 of 208
Scraping advanced boxscore for game 22 of 2