# libraries

In [1]:
import pandas as pd
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from getpass import getuser

# scraping functions

In [2]:


# Season window to scrape: each value is the first calendar year of a season,
# and both ends are included by the scraping loop.
start_season, end_season = 2016, 2023

# Accumulators that receive one season of rows per loop iteration
team_defense_stats, player_defense_stats = pd.DataFrame(), pd.DataFrame()

# Headless Chrome: no browser window is opened while scraping
options = Options()
options.add_argument("--headless")

# chromedriver.exe is looked up in the current Windows user's Downloads folder
user = getuser()
webdriver_service = Service(r'C:\Users\{}\Downloads\chromedriver.exe'.format(user))
driver = webdriver.Chrome(service=webdriver_service, options=options)
def extract_table_data(table_id, season, exclude_columns=None, page_soup=None):
    """Turn one HTML stats table into a DataFrame for a single season.

    Parameters
    ----------
    table_id : str
        ``id`` attribute of the ``<table>`` element to read.
    season : int
        First calendar year of the season; the frame is labelled
        ``"{season}-{season + 1}"`` in its ``Season`` column.
    exclude_columns : list of str, optional
        Column names dropped from the result before it is returned.
    page_soup : BeautifulSoup, optional
        Parsed page to search. When omitted, the module-level ``soup``
        (the most recently parsed page) is used, which keeps older
        three-argument calls working.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` in the table body. Column 0 is ``Season``,
        column 1 (named ``Squad``) holds the text of each row's first
        ``<th>``, and the remaining columns follow the second header row.
        An empty DataFrame is returned when the table is not on the page.
    """
    if page_soup is None:
        page_soup = soup  # backward-compatible fallback to the notebook global

    table = page_soup.find("table", id=table_id)
    if table is None:
        print(f"Table '{table_id}' not found on the webpage for season {season}-{season+1}.")
        return pd.DataFrame()

    # Long tables repeat their header inside the body as <tr class="thead">.
    # Remove every such row (not just the first one) so none of them is read
    # as data; the search is limited to <tbody> so the real header is untouched.
    body = table.find("tbody")
    for repeated_header in body.find_all("tr", class_="thead"):
        repeated_header.decompose()

    # Column names come from the second header row, skipping its first cell
    # because the first <th> of every data row is stored under "Squad".
    header_cells = table.find("thead").find_all("tr")[1].find_all("th")[1:]
    columns = ["Squad"]
    occurrences = {"Squad": 1}
    for header in header_cells:
        label = header.text.strip()
        occurrences[label] = occurrences.get(label, 0) + 1
        # 2nd, 3rd, ... occurrences become label_2, label_3, ... so names stay
        # unique even when a header appears more than twice.
        nth = occurrences[label]
        columns.append(label if nth == 1 else f"{label}_{nth}")

    # One record per body row: leading <th> text followed by every <td> text
    records = []
    for row in body.find_all("tr"):
        lead_cell = row.find("th")
        lead_text = lead_cell.text.strip() if lead_cell else "Unknown Squad"
        records.append([lead_text] + [cell.text.strip() for cell in row.find_all("td")])

    season_df = pd.DataFrame(records, columns=columns)
    season_df.insert(0, "Season", f"{season}-{season + 1}")

    if exclude_columns:
        season_df = season_df.drop(columns=exclude_columns)

    return season_df


# Scrape every season in the configured range
try:
    for season in range(start_season, end_season + 1):
        # Format the URL for the current season
        next_season = season + 1
        url = f"https://fbref.com/en/comps/9/{season}-{next_season}/defense/{season}-{next_season}-Premier-League-Stats"

        # Load the webpage
        driver.get(url)
        sleep(3)  # Allow time for the page to load dynamically

        # Parse the rendered page
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        # Extract data for team and player defense stats
        team_defense_df = extract_table_data("stats_squads_defense_for", season, page_soup=soup)
        player_defense_df = extract_table_data(
            "stats_defense", season, exclude_columns=["Matches"], page_soup=soup
        )

        # Append the current season's rows to the overall DataFrames
        team_defense_stats = pd.concat([team_defense_stats, team_defense_df], ignore_index=True)
        player_defense_stats = pd.concat([player_defense_stats, player_defense_df], ignore_index=True)

        print(f"Data for season {season}-{next_season} successfully extracted.")
except BaseException:
    # Do not leave a headless Chrome process behind when a season fails or the
    # kernel is interrupted; the error is re-raised after the browser is closed.
    driver.quit()
    raise

# Tag player columns with a '_player' suffix unless the name is in the skip list.
# NOTE(review): the skip list is lowercase but this step runs before the names
# are lowercased, so capitalised scraped names such as "Season" are suffixed as
# well. The cleaning cells further down rely on the resulting names
# ('season_player', 'nation_player', 'player_player'), so the behaviour is kept.
player_defense_stats.columns = [
    name if name in ["season", "squad", "nation", "born", "player"] else f"{name}_player"
    for name in player_defense_stats.columns
]

# Lowercase every column name in both DataFrames
team_defense_stats.columns = list(map(str.lower, team_defense_stats.columns))
player_defense_stats.columns = list(map(str.lower, player_defense_stats.columns))

# Scraping is finished: shut the browser down
driver.quit()





Data for season 2016-2017 successfully extracted.
Data for season 2017-2018 successfully extracted.
Data for season 2018-2019 successfully extracted.
Data for season 2019-2020 successfully extracted.
Data for season 2020-2021 successfully extracted.
Data for season 2021-2022 successfully extracted.
Data for season 2022-2023 successfully extracted.
Data for season 2023-2024 successfully extracted.


# clean datasets

In [3]:
# Discard rows in which every column is missing (rebinding instead of mutating in place)
team_defense_stats = team_defense_stats.dropna(how='all')
player_defense_stats = player_defense_stats.dropna(how='all')

In [4]:
# Lowercased scraped header -> descriptive column name for the team table.
# 'tkl_2' is the second header called "tkl", de-duplicated during scraping.
team_column_names = {
    'def 3rd': 'tkl_def',
    'mid 3rd': 'tkl_mid',
    'att 3rd': 'tkl_att',
    'tkl_2': 'challenges_won',
    'att': 'challenges_att',
    'tkl%': 'tkl_pct',
    'lost': 'challenges_lost',
    'sh': 'shots_blocked',
    'pass': 'passes_blocked',
    'int': 'interceptions',
    'tkl+int': 'tackles_interceptions',
    'clr': 'clearances',
    'err': 'errors',
    '# pl': 'np',
}

# Columns missing from the mapping keep their current name
team_defense_stats = team_defense_stats.rename(columns=team_column_names)

In [5]:
# Drop 'squad_player': in the player table this column holds the text of each
# row's first <th> (presumably the rank number — verify against the page), while
# the real squad name is in 'squad_2_player'. A missing column is ignored.
player_defense_stats = player_defense_stats.drop(columns="squad_player", errors='ignore')

In [6]:
# Suffixed scraped header -> descriptive column name for the player table
# (player_defense_stats). 'squad_2_player' takes over the 'squad_player' name
# and 'player_player' loses its redundant suffix.
player_column_names = {
    'squad_2_player': 'squad_player',
    'player_player': 'player',
    'def 3rd_player': 'tkl_def_player',
    'mid 3rd_player': 'tkl_mid_player',
    'att 3rd_player': 'tkl_att_player',
    'tkl_2_player': 'challenges_won_player',
    'att_player': 'challenges_att_player',
    'tkl%_player': 'tkl_pct_player',
    'lost_player': 'challenges_lost_player',
    'sh_player': 'shots_blocked_player',
    'pass_player': 'passes_blocked_player',
    'int_player': 'interceptions_player',
    'tkl+int_player': 'tackles_interceptions_player',
    'clr_player': 'clearances_player',
    'err_player': 'errors_player',
}

# Columns missing from the mapping keep their current name
player_defense_stats = player_defense_stats.rename(columns=player_column_names)

In [7]:
# Split the comma-separated 'pos_player' column into 'pos_player_1' and 'pos_player_2'.
# n=1 caps the split at two pieces and reindex guarantees that both result
# columns exist, so the two-column assignment cannot fail when no row contains
# a comma (split yields one column) or when a row contains several commas.
position_parts = (
    player_defense_stats['pos_player']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
)
player_defense_stats[['pos_player_1', 'pos_player_2']] = position_parts

# The original column is redundant once it has been split
player_defense_stats = player_defense_stats.drop(columns=['pos_player'])

In [8]:
def _last_token(value):
    """Return the last whitespace-separated token of a string, else None.

    Non-string values and strings without any token (empty or whitespace
    only, as produced by a blank table cell) give None instead of raising.
    """
    tokens = value.split() if isinstance(value, str) else []
    return tokens[-1] if tokens else None


# Keep only the last token of 'nation_player'; blank or missing cells become None
if 'nation_player' in player_defense_stats.columns:
    player_defense_stats.loc[:, 'nation_player'] = player_defense_stats['nation_player'].apply(_last_token)

# Remove rows where only 'season_player' has a value and all other columns are missing
if 'season_player' in player_defense_stats.columns:
    other_columns = [col for col in player_defense_stats.columns if col != 'season_player']
    player_defense_stats = player_defense_stats.dropna(how='all', subset=other_columns)


# export data

In [10]:
# Save both tables as Excel workbooks in the project's data folder
output_folder = r'C:\Users\{}\Documents\GitHub\dream-team-fpl-prediction\data\def'.format(user)

# to_excel does not create missing directories, so make sure the folder exists
os.makedirs(output_folder, exist_ok=True)

team_defense_output_path = os.path.join(output_folder, 'team_defense_stats.xlsx')
player_defense_output_path = os.path.join(output_folder, 'player_defense_stats.xlsx')

team_defense_stats.to_excel(team_defense_output_path, index=False)
player_defense_stats.to_excel(player_defense_output_path, index=False)

print(f"Team defense data saved to {team_defense_output_path}")
print(f"Player defense data saved to {player_defense_output_path}")


Team defense data saved to C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\def\team_defense_stats.xlsx
Player defense data saved to C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\def\player_defense_stats.xlsx
