# FBREF web scraping using Selenium

### Import Required Libraries

In [4]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


### Define a dictionary containing URLs for different leagues, incorporating the selected year.


In [5]:
# Season segment substituted into the FBref URLs below.
year = "2020-2021"

# League key -> FBref "standard stats" page for that league.
# Each entry is (league key, FBref competition id, season segment, URL slug).
# Every league follows `year`, except Liga Profesional Argentina, whose
# season segment is pinned to "2021" regardless of `year`.
league_links = {
    league_key: f"https://fbref.com/en/comps/{comp_id}/{season}/stats/{season}-{slug}-Stats"
    for league_key, comp_id, season, slug in (
        ("Laliga", 12, year, "LaLiga"),
        ("Laliga2", 17, year, "LaLiga2"),
        ("BelgianProLeague", 37, year, "BelgianProLeague"),
        ("BrazilSerieA", 24, year, "BrazilSerieA"),
        ("Bundesliga", 20, year, "Bundesliga"),
        ("Bundesliga2", 33, year, "Bundesliga2"),
        ("LigaProfesionalArgentina", 21, "2021", "Liga-Profesional-Argentina"),
        ("Ligue1", 13, year, "Ligue1"),
        ("Ligue2", 60, year, "Ligue2"),
        ("SerieA", 11, year, "SerieA"),
        ("SerieB", 18, year, "SerieB"),
        ("Netherlands", 23, year, "Eredivisie"),
        ("PrimeiraLigaPortugal", 32, year, "PrimeiraLigaPortugal"),
        ("PremierLeague", 9, year, "PremierLeague"),
        ("Championship", 10, year, "Championship"),
    )
}

### Initialize the Selenium WebDriver and create the output folder for the selected year

In [None]:
# Location of the Edge WebDriver executable on this machine.
PATH_TO_DRIVER = "C:/WebDriver/msedgedriver.exe"
service = EdgeService(PATH_TO_DRIVER)

# Browser launch flags, applied in the order listed.
options = webdriver.EdgeOptions()
for browser_flag in (
    "--ignore-certificate-errors",
    "--disable-gpu",
    "--start-maximized",
):
    options.add_argument(browser_flag)

# Start the Edge session used by the scraping cell and maximize its window.
driver = webdriver.Edge(service=service, options=options)
driver.maximize_window()

# Output folder named after the season; an existing folder is reused.
year_folder = f"./{year}"
os.makedirs(year_folder, exist_ok=True)


### Loop through each league, extract the rows for players aged 23 or younger, and save them as a CSV file

In [None]:
# Oldest age (in whole years) that is still kept in the output.
MAX_AGE = 23


def _age_in_years(age_text):
    """Return the whole-year part of an Age cell as an int.

    Accepts a plain integer such as "23". It also accepts a "years-days"
    form such as "23-150", taking the part before the first hyphen.
    NOTE(review): the "years-days" form is an assumption about how the
    site may render ages - confirm against a live page.

    Raises:
        ValueError: if the cell is empty or does not start with an integer.
    """
    return int(age_text.split("-", 1)[0])


# A single wait object is reused for every league (10 second timeout).
wait = WebDriverWait(driver, 10)

for league_name, url in league_links.items():
    print(f"Processing {league_name} for {year}...")
    driver.get(url)
    # Fixed pause before polling for the table.
    # NOTE(review): presumably also throttles requests to the site - confirm.
    time.sleep(5)

    # Wait until the stats table is present; skip the league on timeout only,
    # so Ctrl+C and unrelated errors are no longer swallowed.
    try:
        table = wait.until(EC.presence_of_element_located((By.ID, "stats_standard")))
    except TimeoutException:
        print(f"Failed to load data for {league_name}. Skipping...")
        continue

    # Column names come from 'aria-label' when set, otherwise the visible text.
    header_cells = table.find_elements(By.XPATH, ".//thead/tr[not(@class='over_header')]/th")
    column_headers = [
        (th.get_attribute("aria-label") or th.text).strip() for th in header_cells
    ]
    # Row values below are read from <td> cells only and lose their last cell,
    # so the first and last headers are dropped to keep both sides aligned.
    # NOTE(review): assumes the first column is not a <td> in data rows - confirm.
    column_headers = column_headers[1:-1]

    # Resolve the Age column once per table instead of once per row.
    if "Age" not in column_headers:
        print(f"No 'Age' column found for {league_name}. Skipping...")
        continue
    age_index = column_headers.index("Age")

    # Collect the rows of players aged MAX_AGE or younger.
    data = []
    for row in table.find_elements(By.TAG_NAME, "tr"):
        cells = [td.text.strip() for td in row.find_elements(By.TAG_NAME, "td")]
        if not cells:
            # Header rows hold only <th> cells; they carry no player data.
            continue

        row_data = cells[:-1]  # Drop the trailing cell (matches the header trim)
        if len(row_data) != len(column_headers):
            print(f"Skipping row with {len(row_data)} columns (Expected {len(column_headers)})")
            continue

        try:
            age = _age_in_years(row_data[age_index])
        except ValueError:
            print(f"Skipping row due to missing or non-numeric Age: {row_data}")
            continue

        if age <= MAX_AGE:
            data.append(row_data)

    # Ensure data is not empty
    if not data:
        print(f"No valid data extracted for {league_name}. Skipping...")
        continue

    # Save data to CSV in the main year folder
    df_filtered = pd.DataFrame(data, columns=column_headers)
    file_path = os.path.join(year_folder, f"{league_name}.csv")
    df_filtered.to_csv(file_path, index=False)

    print(f"Data saved for {league_name}: {file_path}")


### Quit the driver and print the final message

In [None]:
# End the WebDriver session, then report that the run has finished.
driver.quit()
print("Scraping completed!")