In [1]:
"""
Author: Andres Melendez
Description: This script scrapes NFL Combine data for 2019 from a specified website, performs data extraction using Selenium, 
and includes data transformation and cleansing steps to ensure consistency and readability.
"""

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time

def fetch_combine_data_selenium(
    url: str,
    driver_path: str = r'C:\Users\2dre3\OneDrive\Documents\chromedriver-win64\chromedriver.exe',
    wait_seconds: float = 5,
) -> pd.DataFrame:
    """
    Opens the specified URL using Selenium, extracts NFL Combine data, and parses it into a DataFrame.

    The first <table> element on the rendered page is read. Its first <tr> is skipped,
    and every remaining row is kept only if it has exactly as many <td> cells as there
    are expected columns; other rows are reported on stdout and skipped.

    Args:
        url (str): The URL containing the NFL Combine data.
        driver_path (str): Filesystem path to the chromedriver executable. Defaults to
            the location originally hard-coded in this script; pass your own path when
            running on another machine.
        wait_seconds (float): How long to pause after requesting the page before its
            HTML is read. Defaults to 5.

    Returns:
        pd.DataFrame: A DataFrame containing the raw, extracted data (all values are
        strings). An empty DataFrame is returned when no table is found or when any
        error occurs while loading or parsing the page.
    """
    # The driver is created outside the try block on purpose: if Chrome fails to
    # start there is no driver to quit, and the start-up error reaches the caller.
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)

    try:
        # Load the target webpage
        driver.get(url)
        # Fixed pause before reading the HTML (presumably to let the page finish rendering)
        time.sleep(wait_seconds)

        # Parse the page's HTML
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table')

        if table is None:
            print("No table found on the page.")
            return pd.DataFrame()

        # Define headers based on known column structure
        headers = ['Year', 'Name', 'College', 'Position', 'Height', 'Weight', 'Wonderlic',
                   '40_Yard', 'Bench_Press', 'Vertical_Leap', 'Broad_Jump', 'Shuttle', '3_Cone']
        expected = len(headers)

        rows = []
        # Skip the first <tr>; numbering starts at 1 so messages match table row positions
        for i, row in enumerate(table.find_all('tr')[1:], start=1):
            # Extract each cell's text content
            cols = [td.text.strip() for td in row.find_all('td')]

            # Verify that the row matches the expected column count
            if len(cols) != expected:
                print(f"Skipping row {i}: Expected {expected} columns, found {len(cols)}")
                continue
            rows.append(cols)

        df = pd.DataFrame(rows, columns=headers)

        if df.empty:
            print("Data extraction completed, but no rows were found.")
        else:
            print(f"Extracted {len(df)} rows of data.")

        return df
    except Exception as e:
        # Best-effort scrape: report the problem and hand back an empty frame
        print(f"An error occurred: {e}")
        return pd.DataFrame()
    finally:
        # Close the browser even when loading or parsing failed
        driver.quit()
        print("Combine data retrieval complete.")

def clean_and_transform_data(df: pd.DataFrame, output_path='player_combine.csv') -> pd.DataFrame:
    """
    Cleans and transforms the extracted NFL Combine data by converting columns, handling missing values,
    and standardizing text formats.

    Steps, in order: rename the 13 columns, drop 'Wonderlic', convert 'Weight', 'Height'
    and the performance columns to numbers (unparseable values become NaN), upper-case
    'Position', title-case 'College', remove duplicate rows, and drop rows missing any of
    '40_Yard', 'Bench_Press', 'Height' or 'Weight'. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The raw DataFrame containing extracted data. Must have exactly
            13 columns, in the order listed in ``expected_columns`` below.
        output_path: Path of the CSV file the cleaned data is written to. Defaults to
            'player_combine.csv'; pass None to skip writing a file.

    Returns:
        pd.DataFrame: A DataFrame containing the cleaned and transformed data.

    Raises:
        ValueError: If ``df`` does not have exactly 13 columns.
    """
    expected_columns = ['Year', 'Name', 'College', 'Position', 'Height', 'Weight', 'Wonderlic',
                        '40_Yard', 'Bench_Press', 'Vertical_Leap', 'Broad_Jump', 'Shuttle', '3_Cone']

    if df.shape[1] != len(expected_columns):
        raise ValueError(
            f"Expected {len(expected_columns)} columns, found {df.shape[1]}"
        )

    # Rename on a copy so the caller's DataFrame keeps its own column labels
    df = df.copy()
    df.columns = expected_columns

    # Drop the Wonderlic column
    df = df.drop(columns=['Wonderlic'])

    # Convert 'Weight' to numeric
    df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce')

    # Convert 'Height' to inches, handling both string and numeric formats
    def height_to_inches(height):
        if isinstance(height, str) and '-' in height:
            feet, _, inches = height.partition('-')
            try:
                return int(feet) * 12 + int(inches)
            except ValueError:
                # Malformed feet-inches text (e.g. "6-"): fall through to numeric
                # coercion below so it becomes NaN instead of aborting the run
                pass
        return pd.to_numeric(height, errors='coerce')

    df['Height'] = df['Height'].apply(height_to_inches)

    # Convert numeric performance columns to proper data types
    numeric_cols = ['40_Yard', 'Bench_Press', 'Vertical_Leap', 'Broad_Jump', 'Shuttle', '3_Cone']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Standardize 'Position' and 'College' formats for consistency
    df['Position'] = df['Position'].str.upper()
    df['College'] = df['College'].str.title()

    # Remove duplicate rows to ensure unique entries
    df = df.drop_duplicates()

    # Drop rows with missing values in essential performance columns
    df = df.dropna(subset=['40_Yard', 'Bench_Press', 'Height', 'Weight'])

    # Persist the cleaned data unless the caller opted out
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df

def main():
    """
    Entry point: scrape the 2019 NFL Combine results page, then clean the data
    and print the first rows. Prints a notice instead when the scrape returns nothing.
    """
    source_url = 'https://nflcombineresults.com/nflcombinedata.php?year=2019&pos=&college='
    raw_frame = fetch_combine_data_selenium(source_url)

    # Nothing scraped: report it and stop before the cleaning step
    if raw_frame.empty:
        print("No data available for transformation.")
        return

    cleaned_frame = clean_and_transform_data(raw_frame)
    print(cleaned_frame.head())

# Run the scrape-and-clean pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Skipping row 664: Expected 13 columns, found 1
Extracted 663 rows of data.
Combine data retrieval complete.
   Year                 Name         College Position  Height  Weight  \
0  2019       Britton Abbott  Oklahoma State       FB   74.25     246   
1  2019      Micah Abernathy       Tennessee       FS   71.63     195   
3  2019           Paul Adams        Missouri       OT   77.88     317   
4  2019       Nasir Adderley        Delaware        S   71.75     206   
5  2019  Freedom Akinmoladun        Nebraska       DT   75.25     284   

   40_Yard  Bench_Press  Vertical_Leap  Broad_Jump  Shuttle  3_Cone  
0     4.75         19.0           36.0       111.0     4.45    7.45  
1     4.52         15.0           38.5       130.0     4.09    6.69  
3     5.18         16.0           27.0       103.0     4.74     NaN  
4     4.62         19.0           38.0       129.0      NaN     NaN  
5     4.96         19.0           33.0       118.0     4.62    7.35  
