In [1]:
"""
Author: Andres Melendez
Description: This script scrapes NFL Franchise data from Pro-Football-Reference using Selenium and parses it into a Pandas DataFrame.
"""

from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# (data-stat attribute of the <td> cell, DataFrame column name), in output column order.
_FRANCHISE_TD_COLUMNS = (
    ('year_min', 'From Year'),
    ('year_max', 'To Year'),
    ('wins', 'Wins'),
    ('losses', 'Losses'),
    ('ties', 'Ties'),
    ('win_loss_perc', 'Win-Loss Percentage'),
    ('years_playoffs', 'Playoff Years'),
    ('wins_playoffs', 'Playoff Wins'),
    ('losses_playoffs', 'Playoff Losses'),
    ('championships', 'Championships'),
    ('championships_super_bowl', 'Super Bowl Wins'),
)


def _cell_text(row, tag, data_stat):
    """Return the stripped text of the first `tag` in `row` with this data-stat, or '' if absent."""
    cell = row.find(tag, {'data-stat': data_stat})
    return cell.text.strip() if cell is not None else ''


def scrape_nfl_franchise_data_with_selenium(
    url: str,
    chromedriver_path: str = r'path\to\chromedriver.exe',
    timeout: float = 10,
) -> Optional[pd.DataFrame]:
    """
    Load a page with Selenium and scrape the franchise table with id 'teams_active'.

    Args:
        url (str): The URL of the page containing the NFL franchise history data.
        chromedriver_path (str): Location of the chromedriver executable. The default is a
            placeholder and must be replaced with a real path on your machine.
        timeout (float): Maximum number of seconds to wait for the table to appear.

    Returns:
        Optional[pd.DataFrame]: One row per table row that has <td> cells, with every value
        kept as the stripped cell text (strings; '' when a cell is missing). Returns None
        when the table is not found, the table yields no data rows, or any exception occurs;
        in each of those cases a message is printed instead of raising.
    """
    driver = None  # Stays None if the browser never starts, so `finally` can skip quit()
    try:
        # Start Chrome through an explicit Service pointing at the chromedriver binary
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service)
        driver.get(url)

        # Block until the target table is present in the DOM (raises on timeout)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, 'teams_active'))
        )

        # Parse the rendered page and locate the table
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', {'id': 'teams_active'})
        if table is None:
            print("No table with id 'teams_active' found.")
            return None

        data = []
        for row in table.find_all('tr'):
            # Rows without any <td> (e.g. header rows) carry no data and are skipped
            if row.find('td') is None:
                continue

            # Team name lives in the row's <th>; every other value is in a <td>
            record = [_cell_text(row, 'th', 'team_name')]
            record.extend(_cell_text(row, 'td', stat) for stat, _ in _FRANCHISE_TD_COLUMNS)
            data.append(record)

        # If no data was found, print a message
        if not data:
            print("No data found in the table.")
            return None

        columns = ['Team Name'] + [name for _, name in _FRANCHISE_TD_COLUMNS]
        return pd.DataFrame(data, columns=columns)

    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller handle None
        print(f"An unexpected error occurred: {e}")
        return None
    finally:
        if driver is not None:  # Only quit if the browser was actually started
            driver.quit()  # Close the browser after scraping


In [2]:
# Pro-Football-Reference page that lists every franchise
url = 'https://www.pro-football-reference.com/teams/'

# Run the Selenium scraper, then preview the first rows (or report that nothing came back)
df = scrape_nfl_franchise_data_with_selenium(url)
if df is None or df.empty:
    print("No data found or table structure might have changed.")
else:
    print(df.head())


                Team Name From Year To Year Wins Losses Ties  \
0       Arizona Cardinals      1920    2024  586    806   41   
1       Chicago Cardinals      1920    1943   99    141   21   
2  Chi/Pit Cards/Steelers      1944    1944    0     10    0   
3       Chicago Cardinals      1945    1959   66    107    4   
4     St. Louis Cardinals      1960    1987  186    202   14   

  Win-Loss Percentage Playoff Years Playoff Wins Playoff Losses Championships  \
0                .421            11            7             10             2   
1                .413             0            0              0             1   
2                .000             0            0              0             0   
3                .382             2            1              1             1   
4                .480             3            0              3             0   

  Super Bowl Wins  
0               0  
1               0  
2               0  
3               0  
4               0  


In [3]:
# Clean and enrich the scraped franchise table. `df` is rebound to a new frame rather than
# mutated in place, and each step is safe to re-run on data this cell has already transformed.
if df is None:
    raise ValueError("No franchise data to transform: the scraping cell returned None.")


def _title_case_word(match):
    """Return the matched word with its first character upper-cased and the rest lower-cased."""
    word = match.group(0)
    return word[:1].upper() + word[1:].lower()


def _to_percent_string(value):
    """Format a fractional win rate such as '.421' as the string '42.10%'."""
    text = str(value).strip()
    if text.endswith('%'):
        # Already formatted by an earlier run of this cell; leave it unchanged
        return text
    return "{:.2f}%".format(float(text) * 100)


df = df.drop_duplicates().assign(**{
    # Consistent casing for team names. A "word" here is a run of letters/digits (optionally
    # with an apostrophe part), so digits count as part of the word: '49ers' stays '49ers',
    # whereas str.title() would turn it into '49Ers'.
    'Team Name': lambda frame: frame['Team Name'].str.replace(
        r"[^\W_]+(?:'[^\W_]+)?", _title_case_word, regex=True
    ),
    # Calculate additional columns (Add Data) - Total Games Played
    'Total Games Played': lambda frame: (
        frame['Wins'].astype(float)
        + frame['Losses'].astype(float)
        + frame['Ties'].astype(float)
    ),
    # Format percentage data for better readability (convert to percentage string)
    'Win-Loss Percentage': lambda frame: frame['Win-Loss Percentage'].apply(_to_percent_string),
})

# Display the transformed DataFrame
print(df.head())

                Team Name From Year To Year Wins Losses Ties  \
0       Arizona Cardinals      1920    2024  586    806   41   
1       Chicago Cardinals      1920    1943   99    141   21   
2  Chi/Pit Cards/Steelers      1944    1944    0     10    0   
3       Chicago Cardinals      1945    1959   66    107    4   
4     St. Louis Cardinals      1960    1987  186    202   14   

  Win-Loss Percentage Playoff Years Playoff Wins Playoff Losses Championships  \
0              42.10%            11            7             10             2   
1              41.30%             0            0              0             1   
2               0.00%             0            0              0             0   
3              38.20%             2            1              1             1   
4              48.00%             3            0              3             0   

  Super Bowl Wins  Total Games Played  
0               0              1433.0  
1               0               261.0  
2       