In [1]:
import pandas as pd
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from getpass import getuser
import re

# Resolve the current OS user first; the chromedriver location below (and the
# output folder used later in this cell) is derived from it.
user = getuser()

# Chrome is started in headless mode, i.e. without a visible browser window.
options = Options()
options.add_argument("--headless")

# Start Chrome through the chromedriver binary in the user's Downloads folder.
chromedriver_path = r'C:\Users\{}\Downloads\chromedriver.exe'.format(user)
webdriver_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=webdriver_service, options=options)

# Define the range of seasons (inclusive). Each value is the first year of a
# season, so 2016 is turned into the label "2016-2017".
start_season = 2016
end_season = 2024

# Seconds to wait after navigation so the page can finish loading dynamically.
PAGE_LOAD_DELAY_SECONDS = 3

# Matches hrefs that contain "/squads/<alphanumeric id>/<name>".
SQUAD_LINK_PATTERN = re.compile(r"\/squads\/[a-zA-Z0-9]+\/[a-zA-Z0-9\-]+")


def _unique_column_names(raw_names):
    """Return ``raw_names`` with repeated names made unique.

    The first occurrence of a name is kept unchanged; the n-th occurrence
    (n >= 2) becomes ``"<name>_<n>"``. For example
    ``["Gls", "Ast", "Gls", "Gls"]`` becomes
    ``["Gls", "Ast", "Gls_2", "Gls_3"]``.
    """
    occurrences = {}
    unique_names = []
    for name in raw_names:
        occurrences[name] = occurrences.get(name, 0) + 1
        nth = occurrences[name]
        unique_names.append(name if nth == 1 else f"{name}_{nth}")
    return unique_names


def _parse_squad_table(table, season_label):
    """Convert the squad stats ``<table>`` tag into a DataFrame for one season.

    Parameters:
        table: BeautifulSoup tag of the stats table. Rows with class "thead"
            are removed from it in place.
        season_label: Text stored in the "Season" column, e.g. "2016-2017".

    Returns:
        A DataFrame whose columns are "Season" followed by the de-duplicated
        header names, with one row per body row that has data cells.
        ``None`` if the table has no header rows or no ``<tbody>``.
    """
    # Remove every row marked with class "thead" (presumably header rows
    # repeated inside the table) before reading headers or data.
    for repeated_header in table.find_all("tr", class_="thead"):
        repeated_header.decompose()

    header = table.find("thead")
    body = table.find("tbody")
    if header is None or body is None:
        return None
    header_rows = header.find_all("tr")
    if not header_rows:
        return None

    # The column names live in the second header row; fall back to the only
    # row when the header has a single row.
    name_row = header_rows[1] if len(header_rows) > 1 else header_rows[0]
    column_names = ["Season"] + _unique_column_names(
        [th.text.strip() for th in name_row.find_all("th")]
    )

    records = []
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            # Rows without data cells carry no statistics.
            continue

        # Resolve the squad name per row so it can never be attached to a
        # different row. Prefer the squad link; fall back to the row's <th>.
        squad_link = row.find("a", href=SQUAD_LINK_PATTERN)
        if squad_link is not None:
            squad_name = squad_link.text.strip()
        else:
            row_header = row.find("th")
            squad_name = row_header.text.strip() if row_header is not None else None

        # Season and squad come first, matching the column order above
        # (assumes the first header cell is the squad column, which is not
        # among the <td> cells).
        records.append([season_label, squad_name] + [cell.text for cell in cells])

    return pd.DataFrame(records, columns=column_names)


# Collect one DataFrame per season and concatenate once after the loop.
season_frames = []

try:
    for season in range(start_season, end_season + 1):
        season_label = f"{season}-{season+1}"

        # Format the URL for the current season
        url = f"https://fbref.com/en/comps/9/{season_label}/stats/{season_label}-Premier-League-Stats"

        # Load the webpage
        driver.get(url)
        sleep(PAGE_LOAD_DELAY_SECONDS)  # Allow time for the page to load dynamically

        # Parse the rendered page and find the table by id
        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", id="stats_squads_standard_for")

        if table is None:
            print(f"Table not found on the webpage for season {season_label}.")
            continue

        season_df = _parse_squad_table(table, season_label)
        if season_df is None:
            print(f"Table for season {season_label} has no header or body rows; skipping.")
            continue

        season_frames.append(season_df)
        print(f"All data for season {season_label} successfully extracted.")
except BaseException:
    # Release the browser if scraping is aborted (including a manual
    # interrupt), then let the original error propagate.
    driver.quit()
    raise

# Combined team stats for all seasons; an empty DataFrame if nothing was scraped.
teams_stats = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)

# Close the browser first: scraping is finished, and quitting here releases
# the Chrome process even if the clean-up or the export below fails.
driver.quit()

# Stop with a clear message when no season produced any columns; the steps
# below require the 'season' and 'squad' columns.
if teams_stats.columns.empty:
    raise RuntimeError("No season data was scraped; nothing to clean or export.")

# Convert all column names to lowercase
teams_stats.columns = teams_stats.columns.str.lower()

# Remove specific columns (if present), then rename specific columns
teams_stats = (
    teams_stats
    .drop(columns=['mp', 'starts', 'min', '90s'], errors='ignore')
    .rename(columns={'# pl': 'num_players', 'age': 'avg_age'})
)

# Add "_team" to all columns except 'season' and 'squad'
columns_to_modify = teams_stats.columns.difference(['season', 'squad'])
teams_stats = teams_stats.rename(columns={col: f"{col}_team" for col in columns_to_modify})

# Replace _2 with _90minutes. "_2" is the suffix given to the second
# occurrence of a repeated header name when the table was scraped
# (presumably the per-90-minutes variants, as the new name suggests).
teams_stats.columns = teams_stats.columns.str.replace('_2', '_90minutes', regex=False)

# Print final columns to confirm changes
print("Final columns in the dataset after renaming:", teams_stats.columns.tolist())

# Remove empty rows
teams_stats = teams_stats.dropna(how='all')

# Remove rows where only the season and team variables are present. Scraped
# cells are strings, so a blank or whitespace-only string counts as missing
# here in addition to NaN.
stat_values = teams_stats.drop(columns=["season", "squad"])
blank_cells = stat_values.apply(
    lambda col: col.isna() | col.astype(str).str.strip().eq("")
)
teams_stats = teams_stats[~blank_cells.all(axis=1)]

# Save data to an Excel file, creating the output folder if it does not exist
output_folder = fr'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data'
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, 'teams_stats.xlsx')
teams_stats.to_excel(output_path, index=False)

print(f"Data saved to {output_path}")


All data for season 2016-2017 successfully extracted.
All data for season 2017-2018 successfully extracted.
All data for season 2018-2019 successfully extracted.
All data for season 2019-2020 successfully extracted.
All data for season 2020-2021 successfully extracted.
All data for season 2021-2022 successfully extracted.
All data for season 2022-2023 successfully extracted.
All data for season 2023-2024 successfully extracted.
All data for season 2024-2025 successfully extracted.
Final columns in the dataset after renaming: ['season', 'squad', 'num_players_team', 'avg_age_team', 'poss_team', 'gls_team', 'ast_team', 'g+a_team', 'g-pk_team', 'pk_team', 'pkatt_team', 'crdy_team', 'crdr_team', 'gls_90minutes_team', 'ast_90minutes_team', 'g+a_90minutes_team', 'g-pk_90minutes_team', 'g+a-pk_team', 'xg_team', 'npxg_team', 'xag_team', 'npxg+xag_team', 'prgc_team', 'prgp_team', 'xg_90minutes_team', 'xag_90minutes_team', 'xg+xag_team', 'npxg_90minutes_team', 'npxg+xag_90minutes_team']
Data save