In [1]:
import csv
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import os
from getpass import getuser

In [2]:
# Scrape the "stats_standard" table for every configured Premier League season
# from fbref.com with headless Chrome, stack the seasons into one DataFrame and
# write the result to an Excel workbook.


def _unique_column_names(raw_names):
    """Return ``raw_names`` with repeated names made unique.

    The first occurrence of a name is kept unchanged; the n-th occurrence
    becomes ``"<name>_<n>"`` (``Gls``, ``Gls_2``, ``Gls_3``, ...). A suffix that
    is already taken by another column is skipped, so the returned list never
    contains duplicates.

    Parameters
    ----------
    raw_names : iterable of str
        Column names in table order.

    Returns
    -------
    list of str
        Same length and order as ``raw_names``, all entries distinct.
    """
    occurrences = {}
    taken = set()
    unique_names = []
    for name in raw_names:
        occurrences[name] = occurrences.get(name, 0) + 1
        candidate = name if occurrences[name] == 1 else f"{name}_{occurrences[name]}"
        # Counting only exact matches (as a plain list.count would) hands out
        # the same suffix twice once a name shows up a third time.
        while candidate in taken:
            occurrences[name] += 1
            candidate = f"{name}_{occurrences[name]}"
        taken.add(candidate)
        unique_names.append(candidate)
    return unique_names


def _parse_season_table(page_source, season_label):
    """Turn one season page into a DataFrame of player rows.

    Parameters
    ----------
    page_source : str
        HTML of the season page.
    season_label : str
        Value stored in the ``Season`` column, e.g. ``"2016-2017"``.

    Returns
    -------
    pandas.DataFrame or None
        One row per body row that has ``<td>`` cells, with ``Season`` as the
        first column; ``None`` when the page has no ``table#stats_standard``.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    table = soup.find("table", id="stats_standard")
    if table is None:
        return None

    # Column names come from the second header row; its first <th> ("Rk") is
    # skipped because the body rows are read from their <td> cells only.
    header_row = table.find("thead").find_all("tr")[1]
    columns = _unique_column_names(
        header.text.strip() for header in header_row.find_all("th")[1:]
    )

    # Rows without any <td> (for example header rows repeated inside the body)
    # carry no player values. Skipping them here replaces the late clean-up of
    # rows that contained nothing but the season.
    rows = []
    for row in table.find("tbody").find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            continue
        rows.append([season_label] + [cell.text for cell in cells])

    return pd.DataFrame(rows, columns=["Season"] + columns)


# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set up Chrome driver
user = getuser()
webdriver_service = Service(r'C:\Users\{}\Downloads\chromedriver.exe'.format(user))
driver = webdriver.Chrome(service=webdriver_service, options=options)

# Define the range of seasons (end_season is the starting year of the last season)
start_season = 2016
end_season = 2024

# One DataFrame per season; concatenated once after the loop instead of
# growing a DataFrame inside it.
season_frames = []

try:
    for season in range(start_season, end_season + 1):
        season_label = f"{season}-{season+1}"
        url = f"https://fbref.com/en/comps/9/{season_label}/stats/{season_label}-Premier-League-Stats"

        # Load the webpage
        driver.get(url)
        sleep(3)  # Allow time for the page to load dynamically

        season_df = _parse_season_table(driver.page_source, season_label)
        if season_df is None:
            print(f"Table not found on the webpage for season {season_label}.")
            continue

        season_frames.append(season_df)
        print(f"All data for season {season_label} successfully extracted.")
finally:
    # Always shut the browser down, even if a season fails, so no headless
    # Chrome process is left running.
    driver.quit()

if not season_frames:
    # Fail with a clear message instead of a KeyError on the missing "Season" column.
    raise RuntimeError("No season tables could be scraped; nothing to save.")

players_stats = pd.concat(season_frames, ignore_index=True)

# Remove empty rows
players_stats = players_stats.dropna(how="all")

# Remove rows where only the season variable is present
has_only_season = players_stats.drop(columns="Season").isna().all(axis=1)
players_stats = players_stats[~has_only_season]

# Save data to an Excel workbook
output_folder = r'C:\Users\{}\Documents\GitHub\dream-team-fpl-prediction\data'.format(user)
os.makedirs(output_folder, exist_ok=True)  # to_excel does not create missing folders
output_path = os.path.join(output_folder, 'players_stats.xlsx')
players_stats.to_excel(output_path, index=False)

print(f"Data saved to {output_path}")


All data for season 2016-2017 successfully extracted.
All data for season 2017-2018 successfully extracted.
All data for season 2018-2019 successfully extracted.
All data for season 2019-2020 successfully extracted.
All data for season 2020-2021 successfully extracted.
All data for season 2021-2022 successfully extracted.
All data for season 2022-2023 successfully extracted.
All data for season 2023-2024 successfully extracted.
All data for season 2024-2025 successfully extracted.
Data saved to C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\players_stats.xlsx
