In [3]:
import pandas as pd
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from getpass import getuser

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set up Chrome driver
user = getuser()
webdriver_service = Service(r'C:\Users\{}\Downloads\chromedriver.exe'.format(user))
driver = webdriver.Chrome(service=webdriver_service, options=options)

# Define the range of seasons
start_season = 2016
end_season = 2023

# Create empty DataFrames for each table
team_defense_stats = pd.DataFrame()
player_defense_stats = pd.DataFrame()

# Loop through each season
for season in range(start_season, end_season + 1):
    # Format the URL for the current season
    next_season = season + 1
    url = f"https://fbref.com/en/comps/9/{season}-{next_season}/defense/{season}-{next_season}-Premier-League-Stats"

    # Load the webpage
    driver.get(url)
    sleep(3)  # Allow time for the page to load dynamically

    # Get the page source and create a BeautifulSoup object
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Function to extract data from a table
    def extract_table_data(table_id, season, exclude_columns=None):
        table = soup.find("table", id=table_id)

        if table is None:
            print(f"Table '{table_id}' not found on the webpage for season {season}-{season+1}.")
            return pd.DataFrame()

        # Remove the tr element with class "thead"
        thead_row = table.find("tr", class_="thead")
        if thead_row:
            thead_row.decompose()

        # Extract column names
        header_row = table.find("thead").find_all("tr")[1]  # Second row of the header
        columns = ["Squad"]  # Add Squad column as the first column

        # Retrieve the remaining column names
        for header in header_row.find_all("th")[1:]:  # Exclude the first "Rk" column
            column_name = header.text.strip()

            # Check for duplicate column names and modify them to make them unique
            if column_name in columns:
                column_name = f"{column_name}_{columns.count(column_name) + 1}"

            columns.append(column_name)

        # Extract player data
        data_rows = table.find("tbody").find_all("tr")
        player_data = []
        for row in data_rows:
            # Extract the Squad (team name) from the <th> tag if it exists
            squad_cell = row.find("th")
            squad_name = squad_cell.text.strip() if squad_cell else "Unknown Squad"

            # Extract the rest of the data from the <td> tags
            cells = row.find_all("td")
            player = [squad_name]  # Start the player list with the squad name

            for cell in cells:
                player.append(cell.text.strip())

            player_data.append(player)

        # Convert player data to DataFrame
        season_df = pd.DataFrame(player_data, columns=columns)

        # Add season column to DataFrame
        season_df.insert(0, "Season", f"{season}-{next_season}")

        # Exclude specified columns from DataFrame
        if exclude_columns:
            season_df = season_df.drop(columns=exclude_columns)

        return season_df

    # Extract data for team and player defense stats
    team_defense_df = extract_table_data("stats_squads_defense_for", season)
    player_defense_df = extract_table_data("all_stats_defense", season, exclude_columns=["Matches"])

    # Append the current season's player data to the overall DataFrames
    team_defense_stats = pd.concat([team_defense_stats, team_defense_df], ignore_index=True)
    player_defense_stats = pd.concat([player_defense_stats, player_defense_df], ignore_index=True)

    print(f"Data for season {season}-{next_season} successfully extracted.")

# Remove empty rows
team_defense_stats.dropna(how='all', inplace=True)
player_defense_stats.dropna(how='all', inplace=True)

# Remove rows where only the season variable is present
team_defense_stats = team_defense_stats[~(team_defense_stats.drop("Season", axis=1).isna().all(axis=1))]
player_defense_stats = player_defense_stats[~(player_defense_stats.drop("Season", axis=1).isna().all(axis=1))]

# Clean the Nation column in the player_defense_stats DataFrame
if 'Nation' in player_defense_stats.columns:
    player_defense_stats['Nation'] = player_defense_stats['Nation'].apply(lambda x: x.split()[-1])

# Rename all variables in team_defense_stats with the suffix _team
team_defense_stats.columns = ["season", "squad"] + [f"{col.lower()}_team" for col in team_defense_stats.columns[2:]]

# Clean player_defense_stats DataFrame
# Remove specified columns and rename as needed
player_defense_stats = player_defense_stats.drop(columns=["Squad"], errors='ignore')

# First convert all column names to lowercase
player_defense_stats.columns = player_defense_stats.columns.str.lower()

# Rename columns as needed
player_defense_stats = player_defense_stats.rename(columns={"cs%": "cs_prct"})

# Append '_player' suffix to all remaining column names except for specific columns
player_defense_stats.columns = [f"{col}_player" if col not in ["season", "squad", "nation", "born", "player"] else col for col in player_defense_stats.columns]

# Save data to Excel files
output_folder = r'C:\Users\{}\Documents\GitHub\dream-team-fpl-prediction\data'.format(user)
team_defense_output_path = os.path.join(output_folder, 'team_defense_stats.xlsx')
player_defense_output_path = os.path.join(output_folder, 'player_defense_stats.xlsx')

team_defense_stats.to_excel(team_defense_output_path, index=False)
player_defense_stats.to_excel(player_defense_output_path, index=False)

print(f"Team defense data saved to {team_defense_output_path}")
print(f"Player defense data saved to {player_defense_output_path}")

# Close the browser
driver.quit()


Table 'all_stats_defense' not found on the webpage for season 2016-2017.
Data for season 2016-2017 successfully extracted.
Table 'all_stats_defense' not found on the webpage for season 2017-2018.
Data for season 2017-2018 successfully extracted.
Table 'all_stats_defense' not found on the webpage for season 2018-2019.
Data for season 2018-2019 successfully extracted.
Table 'all_stats_defense' not found on the webpage for season 2019-2020.
Data for season 2019-2020 successfully extracted.
Table 'all_stats_defense' not found on the webpage for season 2020-2021.
Data for season 2020-2021 successfully extracted.


KeyboardInterrupt: 