In [1]:
#########################################################################################################################
# COLLECT DATA FROM THE PREVIOUS GAME DAY AND ADD IT TO THE GENERAL DATASET #

# Script 1 of 4
# This script collects and processes the data for the previous game day.
# Make sure to run this script before executing `_2. 03012025_get_data_next_game_day.ipynb`.
#########################################################################################################################


In [2]:
# Define the current season.
# The value is used to build the basketball-reference URLs and the per-season
# folder/file names below (e.g. `NBA_2025_games-october.html`).
# NOTE(review): the saved October page for 2025 starts with a game dated
# Oct 22, 2024, so the label appears to be the year the season ends in — confirm.
current_season = 2025

In [3]:
import os
import pandas as pd
import shutil
from io import StringIO
import numpy as np

import re
from bs4 import BeautifulSoup

import logging

import time
import calendar
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.common.by import By  # Corrected import
from selenium.webdriver.chrome.service import Service  # Corrected import
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException

from webdriver_manager.chrome import ChromeDriverManager


In [4]:
# Reference date for the whole notebook. Change `days=0` to re-run the
# notebook as if it were an earlier day; every other date below is derived
# from `today`, so the offset is applied consistently.
today = datetime.now() - timedelta(days=0)

# Human-readable form without a zero-padded day, e.g. "Tue, Jan 21, 2025".
today_str = f"{today:%a, %b} {today.day}, {today:%Y}"

# Midnight of `today`. Truncating the time directly avoids formatting the
# date to text and parsing it back.
today_date = today.replace(hour=0, minute=0, second=0, microsecond=0)
today_str_format = today_date.strftime("%Y-%m-%d")

# Exactly one day before `today` (same clock reading, so the two can never
# disagree about which calendar day is "yesterday").
yesterday = today - timedelta(days=1)

print(f"Today's date: {today_str}")

Today's date: Tue, Jan 21, 2025


In [5]:
# Directory layout (absolute paths on drive D:).
DATA_DIR = os.path.join("D:\\", "1. Python", "1. NBA Script", "2025", "Gathering_Data")

# Folder holding the cumulative `nba_games_<date>.csv` statistics files.
STAT_DIR = os.path.join(DATA_DIR, "Whole_Statistic")

# Saved HTML pages live under DATA_DIR/data, one folder per season and kind.
_HTML_ROOT = os.path.join(DATA_DIR, "data")
STANDINGS_DIR = os.path.join(_HTML_ROOT, f"{current_season}_standings")
SCORES_DIR = os.path.join(_HTML_ROOT, f"{current_season}_scores")

# Second location that receives copies of the statistics files.
DST_DIR = os.path.join(
    "D:\\", "_Laufwerk C", "11. Sorare", "NBA", "2025", "Gathering_Data", "Whole_Statistic"
)



In [6]:
# Path to the manually downloaded ChromeDriver binary (machine-specific).
chromedriver_path = r"D:\1. Python\3. chromedriver-win64\chromedriver-win64\chromedriver.exe"  # Replace with your actual path
# Module-level Chrome options.
# NOTE(review): `get_html` creates its own `Options()` instance, so flags set
# on this object do not reach the browser it launches — confirm whether this
# object is still needed.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (optional)

In [7]:
# Decide which calendar month the schedule scrape should treat as "current".
#
# On the 1st of a month the "current" month/year are rolled back by one month
# (January wraps to December of the previous year); on any other day they are
# simply today's month and year.
#
# `last_month` / `last_month_name` are only meaningful on the 1st. They are
# set to None otherwise, so later cells never hit an undefined name or pick up
# a stale value left in the kernel by an earlier run.
if today.day == 1:
    if today.month == 1:
        current_month, current_year = 12, today.year - 1
    else:
        current_month, current_year = today.month - 1, today.year
    last_month = current_month
    last_month_name = calendar.month_name[last_month].lower()
else:
    current_month, current_year = today.month, today.year
    last_month = None
    last_month_name = None

In [8]:
# NOTE(review): this flag is added to the module-level `chrome_options`
# object. `get_html` builds a separate `Options()` instance, so the flag does
# not appear to reach the browsers it launches — confirm the intent.
chrome_options.add_argument("--disable-ipv6")

# Configure logging: INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_current_year_and_month():
    """
    Return ``(year, month)`` of the module-level ``today`` value.

    The result follows whatever ``today`` was set to in the date cell; no
    fresh clock reading is taken here.
    """
    reference = today
    return (reference.year, reference.month)

def get_html(url, selector, sleep=5, retries=3, headless=True):
    """
    Load a page in Chrome through Selenium and return one element's inner HTML.

    A new browser is started for every call and is always shut down again,
    whether the call succeeds or not.

    Args:
        url (str): Address of the page to load.
        selector (str): CSS selector of the element whose ``innerHTML`` is returned.
        sleep (int | float): Base wait in seconds after each page load. The
            wait doubles with every attempt (``sleep``, ``2*sleep``, ``4*sleep``, ...).
        retries (int): Maximum number of load attempts.
        headless (bool): When True, Chrome is started with ``--headless``.

    Returns:
        str | None: The element's inner HTML, or None when it could not be
        retrieved (a log entry is written in that case).
    """
    # Imported here because the notebook's import cell only brings in
    # TimeoutException and WebDriverException.
    from selenium.common.exceptions import NoSuchElementException

    html = None
    driver = None

    try:
        # WebDriver options (local to this call)
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")

        # Start Chrome with the manually installed driver binary.
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        for attempt in range(retries):
            try:
                driver.get(url)
                time.sleep(sleep * (2 ** attempt))  # Exponential backoff
                element = driver.find_element(By.CSS_SELECTOR, selector)
                html = element.get_attribute("innerHTML")
                break
            except TimeoutException:
                logging.warning(f"Attempt {attempt + 1}: Timeout error on {url}. Retrying...")
            except NoSuchElementException:
                # The element may simply not be rendered yet. Without this
                # handler the error would reach the generic branch below and
                # abort, so the longer waits of later attempts were never used.
                logging.warning(
                    f"Attempt {attempt + 1}: Element '{selector}' not found on {url}. Retrying..."
                )
            except WebDriverException as e:
                # Any other driver failure is treated as fatal for this call.
                logging.error(f"Webdriver error: {e}")
                break
    finally:
        if driver is not None:
            driver.quit()

    if html is None:
        logging.error(f"Failed to retrieve HTML content from {url} after {retries} attempts.")

    return html

# Links to the per-month schedule pages, e.g. "/leagues/NBA_2025_games-january.html".
# Written as a raw string so that `\.` is a regex escape rather than an
# (invalid) Python string escape.
SCHEDULE_MONTH_HREF = re.compile(r"/leagues/NBA_[0-9]{4}_games-[a-z]+\.html")


def scrape_season_for_month(season, month, month_name, standings_dir, get_html_function):
    """
    Download the schedule page of one month of a season from basketball-reference.com.

    The season index page is fetched first to discover the per-month pages.
    Pages that already exist in `standings_dir` are left untouched; of the
    remaining ones only the page whose file name contains `month_name` is
    downloaded and saved.

    Args:
        season (int): The NBA season year used in the URLs.
        month (int): The month number (1-12); only used for the validity check.
        month_name (str): The name of the month as it appears in the page file name.
        standings_dir (str): Directory path to save the scraped data.
        get_html_function (function): Callable ``(url, css_selector) -> html or None``.

    Returns:
        None. Failures are logged, not raised.
    """
    # Reads the module-level `current_year` / `current_month`.
    # NOTE(review): this compares a season label with a calendar year — confirm
    # that this is the intended rule for seasons spanning two calendar years.
    if season < current_year or (season == current_year and month < current_month):
        logging.warning("Invalid year or month already passed.")
        return

    logging.info(f"Scraping games for: {season}, {month_name.title()}")
    index_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    index_html = get_html_function(index_url, "#content .filter")

    if not index_html:
        logging.error(f"Failed to retrieve data from {index_url}.")
        return

    soup = BeautifulSoup(index_html, 'html.parser')
    month_links = soup.find_all("a", href=SCHEDULE_MONTH_HREF)
    month_pages = [f"https://www.basketball-reference.com{link['href']}" for link in month_links]

    wanted = month_name.lower()
    for page_url in month_pages:
        file_name = page_url.split("/")[-1]
        save_path = os.path.join(standings_dir, file_name)
        if os.path.exists(save_path):
            logging.info(f"Path already exists: {save_path}")
            continue

        # Match against the file name only; matching the full path could
        # produce false hits when a directory name contains a month name.
        if wanted not in file_name.lower():
            continue

        html = get_html_function(page_url, "#all_schedule")
        if not html:
            logging.error(f"Failed to retrieve data for {month_name.title()} from {page_url}.")
            continue

        try:
            with open(save_path, "w", encoding='utf-8') as f:
                f.write(html)
            logging.info(f"Data for {month_name.title()} saved.")
        except Exception as e:
            logging.error(f"Failed to save data for {month_name.title()}: {e}")



In [9]:
def scrape_game(standings_file, scores_dir, get_html_function):
    """
    Download yesterday's box-score pages that are linked from a saved schedule page.

    Args:
        standings_file (str): Path to a saved schedule HTML file.
        scores_dir (str): Directory in which the box-score pages are stored.
        get_html_function (function): Callable ``(url, css_selector) -> html or None``.

    Box scores that already exist in `scores_dir`, or whose download returns
    nothing, are skipped.
    """
    # Box-score links are selected by the YYYYMMDD stamp of the module-level `yesterday`.
    date_token = yesterday.strftime("%Y%m%d")

    with open(standings_file, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    targets = []
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if not href or "boxscore" not in href or '.html' not in href:
            continue
        game_url = f"https://www.basketball-reference.com{href}"
        if date_token in game_url:
            targets.append(game_url)

    for game_url in targets:
        save_path = os.path.join(scores_dir, game_url.split("/")[-1])

        if os.path.exists(save_path):
            continue

        page = get_html_function(game_url, "#content")
        if not page:
            continue

        try:
            with open(save_path, "wb+") as out:
                out.write(page.encode("utf-8"))
            print(f"Box score saved: {save_path}")
        except Exception as e:
            print(f"Failed to save box score: {e}")


In [10]:
# Month following `current_month`; December wraps around to January of the next year.
next_month = current_month % 12 + 1
next_year = current_year + 1 if next_month == 1 else current_year

current_month_name = calendar.month_name[current_month].lower()
next_month_name = calendar.month_name[next_month].lower()

# Make sure both output folders exist.
os.makedirs(STANDINGS_DIR, exist_ok=True)
os.makedirs(SCORES_DIR, exist_ok=True)

# Drop the saved schedule page of the current month so that it is downloaded
# again below (`scrape_season_for_month` skips pages that already exist).
file_to_remove = f"NBA_{current_season}_games-{current_month_name}.html"
file_path = os.path.join(STANDINGS_DIR, file_to_remove)

try:
    if not os.path.exists(file_path):
        logging.info(f"File {file_to_remove} does not exist.")
    else:
        os.remove(file_path)
        logging.info(f"File {file_to_remove} has been removed.")
except Exception as e:
    logging.error(f"An error occurred: {e}")

# On the 1st of a month fetch both the month that just ended and the month
# that follows it; on any other day only the current month is needed.
if today.day != 1:
    scrape_season_for_month(current_season, current_month, current_month_name, STANDINGS_DIR, get_html)
else:
    scrape_season_for_month(current_season, last_month, last_month_name, STANDINGS_DIR, get_html)
    scrape_season_for_month(current_season, next_month, next_month_name, STANDINGS_DIR, get_html)


2025-01-21 21:24:03,342 - INFO - File NBA_2025_games-january.html has been removed.
2025-01-21 21:24:03,344 - INFO - Scraping games for: 2025, January
2025-01-21 21:24:42,202 - INFO - Path already exists: D:\1. Python\1. NBA Script\2025\Gathering_Data\data\2025_standings\NBA_2025_games-october.html
2025-01-21 21:24:42,204 - INFO - Path already exists: D:\1. Python\1. NBA Script\2025\Gathering_Data\data\2025_standings\NBA_2025_games-november.html
2025-01-21 21:24:42,206 - INFO - Path already exists: D:\1. Python\1. NBA Script\2025\Gathering_Data\data\2025_standings\NBA_2025_games-december.html
2025-01-21 21:25:02,388 - INFO - Data for January saved.


In [11]:
def process_standings_files(standings_dir, current_season):
    """
    Run `scrape_game` on every schedule file of one season.

    Args:
        standings_dir (str): Directory containing the saved schedule files.
        current_season (int): Season year; a file is selected when its name
            contains this number.

    Box scores are written to the module-level `SCORES_DIR` and fetched with
    the module-level `get_html`.
    """
    season_tag = str(current_season)

    for entry in os.listdir(standings_dir):
        if season_tag not in entry:
            continue
        scrape_game(os.path.join(standings_dir, entry), SCORES_DIR, get_html)

# Run the box-score download over every saved schedule page of `current_season`
# found in STANDINGS_DIR.
process_standings_files(STANDINGS_DIR, current_season)


In [12]:
from bs4 import BeautifulSoup

def get_first_game_date(standings_file):
    """
    Extract the date of the first listed game from a saved schedule page.

    Args:
        standings_file (str): Path to the schedule HTML file.

    Returns:
        str | None: The text of the date cell in the first row after the
        header row (for example 'Tue, Oct 22, 2024'), or None when the page
        has no schedule table, no data row, or no date cell in that row.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(standings_file, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    table = soup.find("table", {"id": "schedule"})
    if not table:
        print(f"No schedule table found in {standings_file}.")
        return None

    # Row 0 is treated as the header; the first game is expected in row 1.
    # A table without any data row would otherwise raise IndexError.
    rows = table.find_all("tr")
    if len(rows) < 2:
        return None

    game_date_tag = rows[1].find("th", {"data-stat": "date_game"})
    if game_date_tag:
        return game_date_tag.text.strip()

    return None

# Look up the first game day of the season in the saved October schedule page.
# The file name follows `current_season`, so changing the season in the
# configuration cell is enough.
# NOTE(review): assumes the season's first games are listed on the October page — confirm.
standings_file = os.path.join(STANDINGS_DIR, f"NBA_{current_season}_games-october.html")

first_game_date_str = get_first_game_date(standings_file)
if first_game_date_str:
    print(f"The first game is scheduled on: {first_game_date_str}")
else:
    print("No game dates found in the file.")


The first game is scheduled on: Tue, Oct 22, 2024


In [13]:
#########################################################################################################################
# PARSE DATA FROM PREVIOUS GAME DAY #
#########################################################################################################################


In [21]:
import os
import logging
from datetime import datetime, timedelta

# Constants
# Number of days before `today` that are searched for an existing statistics file.
MAX_DAYS_BACK = 150
#SRC_DIR = "path_to_source_directory"  # Update this to your actual source directory
#first_game_date_str = "2024-10-01"  # Replace with actual start date

# Convert the first-game string (e.g. "Tue, Oct 22, 2024") into a date object.
# NOTE(review): parsing %a / %b depends on the process locale, and
# `first_game_date_str` may be None if no date was found earlier — confirm
# both cases are acceptable.
first_game_date = datetime.strptime(first_game_date_str, "%a, %b %d, %Y").date()
#first_game_date = datetime.strptime(first_game_date_str, "%Y-%m-%d").date()

# Logging setup
# NOTE(review): logging is already configured in an earlier cell; a repeated
# basicConfig call normally has no effect once the root logger has handlers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def file_exists(date_str):
    """Return True when `nba_games_<date_str>.csv` is a regular file inside STAT_DIR."""
    candidate = os.path.join(STAT_DIR, f"nba_games_{date_str}.csv")
    return os.path.isfile(candidate)

def find_most_recent_file(max_days=MAX_DAYS_BACK):
    """
    Find the newest statistics file dated before the module-level `today`.

    Dates are tried from one day back up to `max_days` days back; the first
    date for which `file_exists` is true is returned as 'YYYY-MM-DD'.
    Returns None when no file is found in that range.
    """
    candidates = (
        (today - timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(1, max_days + 1)
    )
    return next((stamp for stamp in candidates if file_exists(stamp)), None)

# Main script
# Start from "nothing found" so that `most_recent_date` is always defined for
# the cells below, even when one of the checks in the try block fails (and so
# that a value left in the kernel by an earlier run is never reused).
most_recent_date = None

try:
    # From here on `today_date` is a plain date (no time component).
    today_date = today.date()
    if today_date < first_game_date:
        raise RuntimeError(f"Season has not started. Today: {today_date}, Start Date: {first_game_date}")

    most_recent_date = find_most_recent_file()
    if most_recent_date:
        logging.info(f"Processing file for {most_recent_date}.")
        # Add your processing logic here
    else:
        logging.warning("No recent files found.")
except Exception as e:
    # Errors are only logged so that the notebook can keep running.
    logging.error(f"An error occurred: {e}")


2025-01-21 21:26:41,697 - INFO - Processing file for 2025-01-20.


In [22]:
#Functions
def parse_html(box_score):
    """
    Load a saved box-score page and strip its repeated table header rows.

    Returns the parsed soup with every `tr.over_header` and `tr.thead` row
    removed, or None (after logging the error) when reading or parsing fails.
    """
    try:
        with open(box_score, encoding='utf-8') as handle:
            soup = BeautifulSoup(handle.read(), 'html.parser')
        for header_row in soup.select("tr.over_header, tr.thead"):
            header_row.decompose()
    except Exception as e:
        logging.error(f"Error parsing HTML for {box_score}: {e}")
        return None
    return soup

def read_line_score(soup):
    """
    Read the `line_score` table and reduce it to team and final score.

    The first column is renamed to "team" and the last one to "total"; only
    these two columns are returned.
    """
    table = pd.read_html(StringIO(str(soup)), attrs={'id': 'line_score'})[0]
    labels = list(table.columns)
    labels[0], labels[-1] = "team", "total"
    table.columns = labels
    return table[["team", "total"]]

def read_stats(soup, team, stat):
    """
    Read the table `box-<team>-game-<stat>` as an all-numeric DataFrame.

    The first column becomes the index; every cell that cannot be converted
    to a number is turned into NaN.
    """
    table_id = f'box-{team}-game-{stat}'
    frames = pd.read_html(StringIO(str(soup)), attrs={'id': table_id}, index_col=0)
    return frames[0].apply(pd.to_numeric, errors="coerce")

def read_season_info(soup):
    """
    Derive the season label from the page's bottom navigation.

    Takes the second link inside `#bottom_nav_container` and returns the part
    of its file name that precedes the first underscore.
    """
    nav = soup.select("#bottom_nav_container")[0]
    links = [anchor["href"] for anchor in nav.find_all('a')]
    file_name = os.path.basename(links[1])
    prefix, _, _ = file_name.partition("_")
    return prefix

def rename_duplicated_columns(df):
    """
    Make column labels unique by numbering repeated ones.

    The first occurrence of a label is kept; later occurrences become
    `<label>_1`, `<label>_2`, ... in column order. The frame is modified in
    place and also returned.
    """
    occurrences = {}
    renamed = []
    for label in df.columns:
        count = occurrences.get(label, 0)
        renamed.append(label if count == 0 else label + '_' + str(count))
        occurrences[label] = count + 1
    df.columns = pd.Index(renamed, name=df.columns.name)
    return df

def copy_missing_files(src_dir, dst_dir):
    """
    Copy files that exist in `src_dir` but not in `dst_dir`.

    Entries are compared by name only. Names starting with '.' or ending in
    '.ipynb' are ignored, and so is anything that is not a regular file
    (`shutil.copy2` cannot copy a directory and would raise).

    Args:
        src_dir (str): Directory to copy from.
        dst_dir (str): Existing directory to copy into.
    """
    missing = set(os.listdir(src_dir)) - set(os.listdir(dst_dir))

    # Sorted so that copies and log lines appear in a stable order.
    for file_name in sorted(missing):
        if file_name.startswith('.') or file_name.endswith('.ipynb'):
            continue

        src_path = os.path.join(src_dir, file_name)
        if not os.path.isfile(src_path):
            continue

        shutil.copy2(src_path, dst_dir)
        logging.info(f'File {file_name} copied successfully')

In [25]:
def process_nba_data():
    """
    Append the newest box scores to the cumulative statistics file.

    Steps:
      1. Load `nba_games_<most_recent_date>.csv` from STAT_DIR.
      2. Parse every saved box score in SCORES_DIR whose file-name date is
         not earlier than the module-level `yesterday`.
      3. Build one row per team and game (own stats, opponent stats with an
         `_opp` suffix, season, date, win flag).
      4. Align the new rows to the existing columns, append them, and save
         the result as `nba_games_<today as YYYY-MM-DD>.csv` in STAT_DIR.
      5. Copy files that are missing in DST_DIR over from STAT_DIR.

    Reads the module-level names `most_recent_date`, `today`, `yesterday`,
    `STAT_DIR`, `SCORES_DIR` and `DST_DIR`. Returns None; problems are logged.
    """
    # Use the most recent date for the file
    last_file_date = most_recent_date
    if last_file_date is None:
        logging.error("No previous statistics file was found; nothing to update.")
        return
    print(f"Processing file for: {last_file_date}")

    filename = f"nba_games_{last_file_date}.csv"

    # Set up logging (has no effect if logging was configured earlier)
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

    try:
        # Load existing statistics
        existing_statistics = pd.read_csv(os.path.join(STAT_DIR, filename))
    except FileNotFoundError:
        logging.error(f"File {filename} not found in {STAT_DIR}.")
        return

    base_cols = None
    games = []

    # List of all box score HTML files
    box_scores = [os.path.join(SCORES_DIR, f) for f in os.listdir(SCORES_DIR) if f.endswith(".html")]

    if not box_scores:
        logging.warning("No box score files found in the SCORES_DIR.")
        return

    logging.info(f"Number of box score files found: {len(box_scores)}")

    # Games dated before this day are skipped (computed once, not per file).
    cutoff = pd.Timestamp(yesterday).date()

    # Process each box score
    for box_score in box_scores:
        try:
            # The first eight characters of the file name are parsed as the game date.
            game_day = pd.Timestamp(os.path.basename(box_score)[:8])
            date = game_day.date()
            if date < cutoff:
                continue

            logging.debug(f"Processing box score: {box_score}, Date: {date}")

            soup = parse_html(box_score)
            if soup is None:
                continue

            line_score = read_line_score(soup)
            teams = list(line_score["team"])
            summaries = []

            for team in teams:
                basic = read_stats(soup, team, "basic")
                advanced = read_stats(soup, team, "advanced")

                # The last row of each table is used as the team totals ...
                totals = pd.concat([basic.iloc[-1], advanced.iloc[-1]])
                totals.index = totals.index.str.lower()

                # ... and the column-wise maximum of all other rows as "<stat>_max".
                maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
                maxes.index = maxes.index.str.lower() + "_max"

                summary = pd.concat([totals, maxes])
                if base_cols is None:
                    # Column set is fixed by the first team processed; "bpm" columns are left out.
                    base_cols = [b for b in summary.index.drop_duplicates(keep="first") if "bpm" not in b]
                summary = summary[base_cols]
                summaries.append(summary)

            summary = pd.concat(summaries, axis=1).T
            game = pd.concat([summary, line_score], axis=1)
            # Two teams per game: the first row is flagged 0, the second 1.
            game["home"] = [0, 1]

            # Same rows in reverse order give each team its opponent's numbers.
            game_opp = game.iloc[::-1].reset_index()
            game_opp.columns += "_opp"

            full_game = pd.concat([game, game_opp], axis=1)

            full_game["season"] = read_season_info(soup)
            full_game["date"] = game_day
            full_game["won"] = full_game["total"] > full_game["total_opp"]
            games.append(full_game)

            if len(games) % 100 == 0:
                logging.info(f"{len(games)} / {len(box_scores)} processed.")

        except Exception as e:
            logging.error(f"Error processing {box_score}: {e}")

    # Warning if no games were processed
    if not games:
        logging.warning("No games were played yesterday or no valid box scores were found.")
        return

    logging.info(f"{len(games)} games were processed.")

    # If games were processed, create a DataFrame
    games_df = pd.concat(games, ignore_index=True)
    print(f"Sample processed data:\n{games_df.head(1).to_string(index=False)}")

    # Rename duplicated columns in games_df (if any)
    games_df = rename_duplicated_columns(games_df)

    # Align columns to existing statistics
    games_df = games_df.reindex(columns=existing_statistics.columns)

    # Combine new data with existing statistics
    combined_statistics = pd.concat([existing_statistics, games_df], ignore_index=True)

    # Save the combined statistics. The date is formatted explicitly so the
    # file name does not depend on whether another cell has turned
    # `today_date` into a date object (a datetime would put a time stamp,
    # including ':' characters, into the name).
    file_name = f"nba_games_{today.strftime('%Y-%m-%d')}.csv"
    output_path = os.path.join(STAT_DIR, file_name)
    combined_statistics.to_csv(output_path, index=False)
    logging.info(f"Combined statistics saved to: {output_path}")

    # Copy any missing files
    copy_missing_files(STAT_DIR, DST_DIR)


In [26]:
# Entry point: run the daily update when this code is executed as the main module.
if __name__ == "__main__":
    process_nba_data()

Processing file for: 2025-01-20


2025-01-21 21:27:37,952 - INFO - Number of box score files found: 632
2025-01-21 21:27:46,070 - INFO - 8 games were processed.


Sample processed data:
   mp    mp   fg  fga   fg%  3p  3pa   3p%   ft  fta   ft%  orb  drb  trb  ast  stl  blk  tov   pf   pts  gmsc  +/-   ts%  efg%  3par   ftr  orb%  drb%  trb%  ast%  stl%  blk%  tov%  usg%  ortg  drtg  mp_max  mp_max  fg_max  fga_max  fg%_max  3p_max  3pa_max  3p%_max  ft_max  fta_max  ft%_max  orb_max  drb_max  trb_max  ast_max  stl_max  blk_max  tov_max  pf_max  pts_max  gmsc_max  +/-_max  ts%_max  efg%_max  3par_max  ftr_max  orb%_max  drb%_max  trb%_max  ast%_max  stl%_max  blk%_max  tov%_max  usg%_max  ortg_max  drtg_max team  total  home  index_opp  mp_opp  mp_opp  fg_opp  fga_opp  fg%_opp  3p_opp  3pa_opp  3p%_opp  ft_opp  fta_opp  ft%_opp  orb_opp  drb_opp  trb_opp  ast_opp  stl_opp  blk_opp  tov_opp  pf_opp  pts_opp  gmsc_opp  +/-_opp  ts%_opp  efg%_opp  3par_opp  ftr_opp  orb%_opp  drb%_opp  trb%_opp  ast%_opp  stl%_opp  blk%_opp  tov%_opp  usg%_opp  ortg_opp  drtg_opp  mp_max_opp  mp_max_opp  fg_max_opp  fga_max_opp  fg%_max_opp  3p_max_opp  3pa_max_opp

2025-01-21 21:27:48,675 - INFO - Combined statistics saved to: D:\1. Python\1. NBA Script\2025\Gathering_Data\Whole_Statistic\nba_games_2025-01-21.csv
2025-01-21 21:27:48,729 - INFO - File nba_games_2025-01-20.csv copied successfully
