In [None]:
# Official website root URL (with trailing slash) for each of the 32 NFL teams,
# grouped by division. The order matters: this list is zipped positionally with
# nfl_team_abbreviations, so entry N here must be the same team as entry N there.
nfl_team_urls = [
    # AFC East
    "https://www.buffalobills.com/",
    "https://www.miamidolphins.com/",
    "https://www.patriots.com/",
    "https://www.newyorkjets.com/",
    # AFC North
    "https://www.baltimoreravens.com/",
    "https://www.bengals.com/",
    "https://www.clevelandbrowns.com/",
    "https://www.steelers.com/",
    # AFC South
    "https://www.houstontexans.com/",
    "https://www.colts.com/",
    "https://www.jaguars.com/",
    "https://www.tennesseetitans.com/",
    # AFC West
    "https://www.denverbroncos.com/",
    "https://www.chiefs.com/",
    "https://www.raiders.com/",
    "https://www.chargers.com/",
    # NFC East
    "https://www.dallascowboys.com/",
    "https://www.giants.com/",
    "https://www.philadelphiaeagles.com/",
    "https://www.commanders.com/",
    # NFC North
    "https://www.chicagobears.com/",
    "https://www.detroitlions.com/",
    "https://www.packers.com/",
    "https://www.vikings.com/",
    # NFC South
    "https://www.atlantafalcons.com/",
    "https://www.panthers.com/",
    "https://www.neworleanssaints.com/",
    "https://www.buccaneers.com/",
    # NFC West
    "https://www.azcardinals.com/",
    "https://www.therams.com/",
    "https://www.49ers.com/",
    "https://www.seahawks.com/",
]

In [None]:
# Team abbreviations for the 32 NFL teams, grouped by division.
# The order matters: this list is zipped positionally with nfl_team_urls,
# so entry N here must be the same team as entry N there.
nfl_team_abbreviations = [
    # AFC East
    "BUF",
    "MIA",
    "NE",
    "NYJ",
    # AFC North
    "BAL",
    "CIN",
    "CLE",
    "PIT",
    # AFC South
    "HOU",
    "IND",
    "JAX",
    "TEN",
    # AFC West
    "DEN",
    "KC",
    "LV",
    "LAC",
    # NFC East
    "DAL",
    "NYG",
    "PHI",
    "WAS",
    # NFC North
    "CHI",
    "DET",
    "GB",
    "MIN",
    # NFC South
    "ATL",
    "CAR",
    "NO",
    "TB",
    # NFC West
    "ARI",
    "LAR",
    "SF",
    "SEA",
]

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
from io import StringIO

def get_team_injury_report(game_week: str, team_site_url: str, nfl_team_name: str) -> pd.DataFrame:
    """
    Scrape one NFL team's injury report table for a single week.

    A fresh Chrome WebDriver is started for each call, pointed at
    ``team_site_url + game_week``, and closed again before the function
    returns, whichever path it takes.

    Args:
        game_week (str): Week identifier appended verbatim to ``team_site_url``
                         (e.g., "1", "10").
        team_site_url (str): URL prefix of the team's injury report page
                             (e.g., "https://www.commanders.com/team/injury-report/week/REG-").
        nfl_team_name (str): Team abbreviation (e.g., "WAS"); only used in the
                             printed diagnostic messages.

    Returns:
        pd.DataFrame: The first table parsed from the injury-report element with
                      any tuple (MultiIndex) column labels flattened to their
                      first element, or an empty DataFrame (pd.DataFrame()) if
                      the browser cannot be started, the page cannot be loaded,
                      the table does not appear within 20 seconds, or parsing fails.
                      Failures are reported via print(), never raised.
    """

    # 1. Initialize WebDriver
    try:
        # Selenium 4.6.0+ locates/downloads the matching ChromeDriver automatically.
        driver = webdriver.Chrome()
    except Exception as e:
        # Critical failure: WebDriver could not be initialized (e.g., browser not found, path issue)
        print(f"CRITICAL ERROR: WebDriver initialization failed. Team: {nfl_team_name}. Error: {e}")
        return pd.DataFrame()

    # 2. Define the Target Element Selector
    # Exact-match on the wrapper div's class attribute; the <table> directly inside it
    # is the injury report. Sites that do not use this markup will hit the timeout below.
    TABLE_XPATH = '//div[@class="d3-o-table--horizontal-scroll nfl-o-injury-report"]/table'

    # From here on a browser process exists, so everything runs under try/finally:
    # driver.quit() must execute on EVERY exit path or Chrome processes pile up
    # (this function is called once per team per week).
    try:
        # Construct the final URL for the specific week and navigate to it.
        full_url = f"{team_site_url}{game_week}"
        driver.get(full_url)

        # 3. Explicit Wait for Dynamic Content Loading
        # The table is waited for explicitly because it may be inserted into the DOM
        # after the initial page load.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, TABLE_XPATH))
            )
        except TimeoutException:
            # The table never appeared within the time limit.
            print(f"TIMEOUT: Table did not load within 20s. Team: {nfl_team_name}, Week: {game_week}.")
            return pd.DataFrame()
        except NoSuchElementException:
            # Kept for parity with the original handler set.
            print(f"XPATH ERROR: Table element not found. Team: {nfl_team_name}, Week: {game_week}. (Likely a custom site).")
            return pd.DataFrame()

        # 4. Extract HTML and Parse with Pandas
        table_element = driver.find_element(By.XPATH, TABLE_XPATH)
        table_html = table_element.get_attribute('outerHTML')

        # pd.read_html expects a file-like object for literal HTML in recent
        # pandas versions, hence the StringIO wrapper.
        df_list = pd.read_html(StringIO(table_html))

        if not df_list:
            # The HTML was found, but pandas returned no tables for it.
            print(f"PARSE ERROR: Pandas found HTML but no table structure. Team: {nfl_team_name}, Week: {game_week}.")
            return pd.DataFrame()

        # Only one <table> was handed to pandas, so the first result is the target.
        df = df_list[0]

        # Flatten MultiIndex headers: a tuple label such as ('Player', 'Player')
        # is replaced by its first element.
        df.columns = [col[0] if isinstance(col, tuple) else col for col in df.columns]

        # Persisting the result is intentionally left to the caller.
        return df

    except Exception as e:
        # Boundary catch-all so one bad page (navigation error, stale element,
        # pandas parse error, ...) cannot abort a multi-team scraping run.
        print(f"UNEXPECTED ERROR: Failed to process table data. Team: {nfl_team_name}, Week: {game_week}. Exception: {e}")
        return pd.DataFrame()
    finally:
        # Single cleanup point for every return/exception path above.
        driver.quit()

In [None]:
import pandas as pd
from selenium import webdriver
# NOTE: You'll need other imports like 'os', 'time', 'By', etc., 
# for a fully functional version of this script.

# --- DEPENDENCIES: Ensure these variables are defined elsewhere ---
# nfl_team_urls: List of 32 official team website URLs
# nfl_team_abbreviations: List of 32 official team abbreviations (e.g., 'BUF')
# week_number: Integer representing the last week to scrape (e.g., 18)
# season: String representing the current season (e.g., '2025')
# get_team_injury_report: External function that handles the scraping (Selenium/BS4)

def run_team_injury_reports(
    output_filepath: str = 'C:/Users/sarae/Documents/NFL_Modeling/2025/Week11/all_team_inj_report.csv',
) -> pd.DataFrame:
    """
    Scrape the regular-season injury reports for every team and week and
    consolidate them into a single CSV file.

    Reads these names from the enclosing module scope:
        nfl_team_urls, nfl_team_abbreviations: parallel lists, iterated together with zip().
        week_number: last week to scrape; weeks 1..week_number inclusive are requested.
        season: value written to the 'season' metadata column.
        get_team_injury_report: scraping function called once per team/week.

    Args:
        output_filepath (str): Destination of the consolidated CSV. Defaults to the
                               path the original script used. The parent directory
                               must already exist.

    Returns:
        pd.DataFrame: All successfully scraped rows, with the metadata columns
                      'season', 'game_type', 'team', 'week' first, followed by the
                      scraped columns. If nothing was scraped, an empty frame that
                      has only the metadata columns.
    """

    # Metadata columns added to every scraped table so each row can be traced
    # back to its season / game type / team / week.
    inj_headers = ['season', 'game_type', 'team', 'week']

    # Per-team/week frames are collected here and concatenated ONCE at the end.
    # Concatenating inside the loop re-copies the growing master frame every
    # iteration, and concatenating onto an empty frame is deprecated in pandas.
    scraped_frames = []

    for team_url, nfl_team_name in zip(nfl_team_urls, nfl_team_abbreviations):

        # URL prefix for this team's weekly report; the week number is appended
        # by get_team_injury_report.
        # Example: https://www.commanders.com/team/injury-report/week/REG-
        team_week_url = team_url + 'team/injury-report/week/REG-'

        for week in range(1, week_number + 1):
            team_inj_df = get_team_injury_report(
                game_week=str(week),
                team_site_url=team_week_url,
                nfl_team_name=nfl_team_name
            )

            # An empty frame is the scraper's failure signal (or a genuinely empty table).
            if team_inj_df.empty:
                # Report the week that actually failed; earlier weeks are not
                # known to have succeeded, so nothing is claimed about them.
                print(f"FAILED to scrape: {team_url}. Week: {week}")
                continue

            # Broadcast each constant metadata value down its whole column.
            inj_head_values = [season, 'REG', nfl_team_name, week]
            for header, head_value in zip(inj_headers, inj_head_values):
                team_inj_df[header] = head_value

            scraped_frames.append(team_inj_df)

    if scraped_frames:
        # ignore_index=True renumbers rows 0..n-1 across all teams/weeks.
        inj_report_2025 = pd.concat(scraped_frames, ignore_index=True)
        # Keep the metadata columns in front, as in the original output layout.
        scraped_columns = [col for col in inj_report_2025.columns if col not in inj_headers]
        inj_report_2025 = inj_report_2025[inj_headers + scraped_columns]
    else:
        # Nothing scraped: still emit a frame (and CSV) with the metadata header.
        inj_report_2025 = pd.DataFrame(columns=inj_headers)

    # --- FINAL OUTPUT ---
    # index=False keeps the pandas row index out of the CSV.
    all_team_inj_report_filepath = output_filepath
    inj_report_2025.to_csv(all_team_inj_report_filepath, index=False)

    # Confirmation message
    print(f"Successfully finished scraping. Data saved to: {all_team_inj_report_filepath}")

    return inj_report_2025

# Example function call (currently commented out)
# injury_df = run_team_injury_reports()