In [11]:
import pandas as pd
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from getpass import getuser

# Season window to scrape: each value is the first year of a season
start_season = 2016
end_season = 2023

# Accumulators that each season's tables get appended to
keepers_stats = pd.DataFrame()
advanced_keepers_stats = pd.DataFrame()

# Current OS user name, used to build the per-user Windows paths
user = getuser()

# Chrome is started without a visible window
options = Options()
options.add_argument("--headless")

# Launch Chrome through the chromedriver binary in the user's Downloads folder
webdriver_service = Service(r'C:\Users\{}\Downloads\chromedriver.exe'.format(user))
driver = webdriver.Chrome(service=webdriver_service, options=options)

# Function to extract data from a table.
# Defined once, ahead of the loop, instead of being re-created on every
# iteration; it reads the module-level `soup` that the loop rebinds per season.
def extract_table_data(table_id, season, exclude_columns=None):
    """Build a DataFrame from the table with the given id on the current page.

    The table is looked up in the module-level ``soup`` object, i.e. the
    parsed page of the season currently being processed by the loop below.

    Args:
        table_id: HTML ``id`` of the ``<table>`` element to extract.
        season: First year of the season; used for the ``Season`` column and
            for the "not found" message.
        exclude_columns: Optional list of column names to drop from the
            result.

    Returns:
        A DataFrame whose first column is ``Season``, followed by ``Squad``
        (text of each row's first ``<th>``) and one column per remaining
        header cell. An empty DataFrame is returned when the table is not
        present on the page.
    """
    table = soup.find("table", id=table_id)

    if table is None:
        print(f"Table '{table_id}' not found on the webpage for season {season}-{season+1}.")
        return pd.DataFrame()

    # Remove every tr element with class "thead", not just the first one;
    # any that remain would otherwise end up as junk rows in the data.
    for thead_row in table.find_all("tr", class_="thead"):
        thead_row.decompose()

    # Extract column names
    header_row = table.find("thead").find_all("tr")[1]  # Second row of the header
    columns = ["Squad"]  # Add Squad column as the first column

    # Track how often each header text has been seen so that the 2nd, 3rd, ...
    # occurrence get distinct suffixes (_2, _3, ...). Counting occurrences in
    # `columns` would hand out "_2" twice, because earlier duplicates are
    # already stored under their renamed form.
    occurrences = {"Squad": 1}
    for header in header_row.find_all("th")[1:]:  # Exclude the first header cell
        column_name = header.text.strip()
        occurrences[column_name] = occurrences.get(column_name, 0) + 1
        if occurrences[column_name] > 1:
            column_name = f"{column_name}_{occurrences[column_name]}"
        columns.append(column_name)

    # Extract player data
    player_data = []
    for row in table.find("tbody").find_all("tr"):
        # The row's first <th> supplies the value of the leading "Squad" column
        squad_cell = row.find("th")
        squad_name = squad_cell.text.strip() if squad_cell else "Unknown Squad"

        player = [squad_name]
        for cell in row.find_all("td"):
            value = cell.text.strip()
            # Identify the Nation cell by its attribute rather than by
            # searching the serialized HTML of the cell.
            if cell.get("data-stat") == "nationality":
                nation_flag = cell.find("span", class_="f-i")
                # Prefer the flag's title attribute when it is present
                if nation_flag is not None and nation_flag.get("title"):
                    value = nation_flag.get("title")
            player.append(value)

        player_data.append(player)

    # Convert player data to DataFrame
    season_df = pd.DataFrame(player_data, columns=columns)

    # Add season column to DataFrame
    season_df.insert(0, "Season", f"{season}-{season+1}")

    # Exclude specified columns from DataFrame
    if exclude_columns:
        season_df = season_df.drop(columns=exclude_columns)

    return season_df


# Loop through each season
try:
    for season in range(start_season, end_season + 1):
        # Format the URL for the current season
        url = f"https://fbref.com/en/comps/9/{season}-{season+1}/keepers/{season}-{season+1}-Premier-League-Stats"

        # Load the webpage
        driver.get(url)
        sleep(3)  # Allow time for the page to load dynamically

        # Get the page source and create a BeautifulSoup object
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        # Extract data for keepers and advanced keepers
        keepers_df = extract_table_data("stats_squads_keeper_for", season)
        advanced_keepers_df = extract_table_data("stats_keeper", season, exclude_columns=["Matches"])

        # Append the current season's player data to the overall DataFrames
        keepers_stats = pd.concat([keepers_stats, keepers_df], ignore_index=True)
        advanced_keepers_stats = pd.concat([advanced_keepers_stats, advanced_keepers_df], ignore_index=True)

        print(f"Data for season {season}-{season+1} successfully extracted.")
except BaseException:
    # Scraping was aborted: shut the browser down so no headless Chrome
    # process is left running, then let the original error propagate.
    driver.quit()
    raise

# Clean the collected data and write it to Excel. The browser is closed in
# `finally` so it is shut down even when cleaning or saving fails.
try:
    # Remove empty rows
    keepers_stats.dropna(how='all', inplace=True)
    advanced_keepers_stats.dropna(how='all', inplace=True)

    # Remove rows where only the season variable is present.
    # errors="ignore" keeps this working when nothing was scraped and the
    # frame has no "Season" column; .copy() makes the filtered result an
    # independent frame so the column assignment below does not write to a
    # slice of the original.
    keepers_stats = keepers_stats[
        ~(keepers_stats.drop(columns="Season", errors="ignore").isna().all(axis=1))
    ].copy()
    advanced_keepers_stats = advanced_keepers_stats[
        ~(advanced_keepers_stats.drop(columns="Season", errors="ignore").isna().all(axis=1))
    ].copy()

    # Clean the Nation column in the advanced_keepers_stats DataFrame:
    # keep only the last whitespace-separated token. Blank strings and
    # missing (non-string) values are left unchanged instead of raising.
    if 'Nation' in advanced_keepers_stats.columns:
        advanced_keepers_stats['Nation'] = advanced_keepers_stats['Nation'].apply(
            lambda x: x.split()[-1] if isinstance(x, str) and x.strip() else x
        )

    # Save data to Excel files
    output_folder = r'C:\Users\{}\Documents\GitHub\dream-team-fpl-prediction\data'.format(user)
    os.makedirs(output_folder, exist_ok=True)  # Create the folder if it is missing
    keepers_output_path = os.path.join(output_folder, 'keepers_stats.xlsx')
    advanced_keepers_output_path = os.path.join(output_folder, 'advanced_keepers_stats.xlsx')

    keepers_stats.to_excel(keepers_output_path, index=False)
    advanced_keepers_stats.to_excel(advanced_keepers_output_path, index=False)

    print(f"Keepers data saved to {keepers_output_path}")
    print(f"Advanced Keepers data saved to {advanced_keepers_output_path}")
finally:
    # Close the browser
    driver.quit()


Data for season 2016-2017 successfully extracted.
Data for season 2017-2018 successfully extracted.
Data for season 2018-2019 successfully extracted.
Data for season 2019-2020 successfully extracted.
Data for season 2020-2021 successfully extracted.
Data for season 2021-2022 successfully extracted.
Data for season 2022-2023 successfully extracted.
Data for season 2023-2024 successfully extracted.
Keepers data saved to C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\keepers_stats.xlsx
Advanced Keepers data saved to C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\advanced_keepers_stats.xlsx
