# Introduction

I will create web-scraping scripts to collect and aggregate Premier League player data, mainly from FBref: https://fbref.com/en/comps/9/Premier-League-Stats

General Player Stats
https://fbref.com/en/comps/9/stats/Premier-League-Stats#all_stats_standard

Goalkeeping Stats
https://fbref.com/en/comps/9/keepers/Premier-League-Stats

Passing Stats
https://fbref.com/en/comps/9/passing/Premier-League-Stats

Defensive Stats
https://fbref.com/en/comps/9/defense/Premier-League-Stats

Shooting Stats
https://fbref.com/en/comps/9/shooting/Premier-League-Stats

My plan is to create four different sets of data: one for goalkeepers, one for defensive stats, one for midfielders, and finally one for attacking stats.

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

## Scraping Goalkeeper Data

In [113]:
# Scrape the goalkeeper table and save it as goalkeeper_stats.csv in the
# current working directory.
# Since the page loads dynamically, it is rendered with Selenium and the
# resulting HTML is then parsed with BeautifulSoup.

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set path to chromedriver executable
# Replace 'path/to/chromedriver' with the actual path to your chromedriver
driver_path = 'path/to/chromedriver'

# Set up the Selenium service
service = Service(driver_path)

# Set up the WebDriver instance
driver = webdriver.Chrome(service=service, options=options)

url = "https://fbref.com/en/comps/9/keepers/Premier-League-Stats"
try:
    # Navigate to the webpage
    driver.get(url)

    # Find the parent div using Selenium
    div = driver.find_element(By.ID, "div_stats_keeper")

    # Get the HTML content of the div
    div_html = div.get_attribute("innerHTML")
finally:
    # Always shut the browser down, even when loading the page or locating
    # the div fails; otherwise the headless Chrome process is left running.
    driver.quit()

# Create a BeautifulSoup object from the div HTML
soup = BeautifulSoup(div_html, 'html.parser')

# Find the table within the div
table = soup.find("table")

# One dictionary per player row
player_data = []

# Iterate over the rows in the table body
rows = table.find("tbody").find_all("tr")

for row in rows:

    # Skip rows whose class list contains "thead" (presumably header rows
    # repeated inside the table body).
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # Indexes are positions within the row's <td> cells.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    position = columns[2].text.strip()
    team = columns[3].text.strip()
    saves = int(columns[13].text.strip())
    save_percentage = float(columns[14].text.strip())
    clean_sheet_percentage = float(columns[19].text.strip())

    # Append the data as a dictionary to the player_data list
    player_data.append({
        "Player Name": player_name,
        "Position": position,
        "Team": team,
        "Saves": saves,
        "Save Percentage": save_percentage,
        "Clean Sheet Percentage": clean_sheet_percentage
    })

# Create a pandas DataFrame from the player_data list
df = pd.DataFrame(player_data)

# Write the CSV next to wherever the notebook is being run from
current_directory = os.getcwd()
file_path = os.path.join(current_directory, "goalkeeper_stats.csv")
df.to_csv(file_path, index=False)


## Scraping Defender Data

In [3]:
# Scrape the defensive-actions table and save it as defensive_stats.csv in
# the current working directory.

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set path to chromedriver executable
# Replace 'path/to/chromedriver' with the actual path to your chromedriver
driver_path = 'path/to/chromedriver'

# Set up the Selenium service
service = Service(driver_path)

# Set up the WebDriver instance
driver = webdriver.Chrome(service=service, options=options)

url = "https://fbref.com/en/comps/9/defense/Premier-League-Stats"
try:
    # Navigate to the webpage
    driver.get(url)

    # Find the parent div using Selenium
    div = driver.find_element(By.ID, "div_stats_defense")

    # Get the HTML content of the div
    div_html = div.get_attribute("innerHTML")
finally:
    # Always shut the browser down, even when loading the page or locating
    # the div fails; otherwise the headless Chrome process is left running.
    driver.quit()

# Create a BeautifulSoup object from the div HTML
soup = BeautifulSoup(div_html, 'html.parser')

# Find the table within the div
table = soup.find("table")

# One dictionary per player row
player_data = []

# Iterate over the rows in the table body
rows = table.find("tbody").find_all("tr")

for row in rows:

    # Skip rows whose class list contains "thead" (presumably header rows
    # repeated inside the table body).
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # Indexes are positions within the row's <td> cells.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    position = columns[2].text.strip()
    team = columns[3].text.strip()
    tackles_won = int(columns[8].text.strip())
    shots_blocked = int(columns[17].text.strip())
    interceptions = int(columns[19].text.strip())
    clearances = int(columns[21].text.strip())

    # Append the data as a dictionary to the player_data list
    player_data.append({
        "Player Name": player_name,
        "Position": position,
        "Team": team,
        "Tackles Won": tackles_won,
        "Shots Blocked": shots_blocked,
        "Interceptions": interceptions,
        "Clearances": clearances
    })

# Create a pandas DataFrame from the player_data list
df = pd.DataFrame(player_data)

# Write the CSV next to wherever the notebook is being run from
current_directory = os.getcwd()
file_path = os.path.join(current_directory, "defensive_stats.csv")
df.to_csv(file_path, index=False)

## Scraping Midfielder Data

In [133]:
# Scrape the passing table, add each player's successful dribbles from the
# possession table, and save the merged result as passing_stats.csv.

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set path to chromedriver executable
# Replace 'path/to/chromedriver' with the actual path to your chromedriver
driver_path = 'path/to/chromedriver'

# Set up the Selenium service
service = Service(driver_path)

# Set up the WebDriver instance
driver = webdriver.Chrome(service=service, options=options)

url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
# Successful dribbles live in a table on a different page of the site.
dribbles_url = "https://fbref.com/en/comps/9/possession/Premier-League-Stats"

# Fetch the HTML of both tables first and parse afterwards, so the browser is
# only alive while it is needed and is closed even if a page fails to load.
try:
    # Passing page
    driver.get(url)
    div = driver.find_element(By.ID, "div_stats_passing")
    div_html = div.get_attribute("innerHTML")

    # Possession page
    driver.get(dribbles_url)
    dribbles_div = driver.find_element(By.ID, "div_stats_possession")
    dribbles_div_html = dribbles_div.get_attribute("innerHTML")
finally:
    # Close the WebDriver instance
    driver.quit()

# ---- Passing table ----

# Create a BeautifulSoup object from the div HTML
soup = BeautifulSoup(div_html, 'html.parser')

# Find the table within the div
table = soup.find("table")

# One dictionary per player row
player_data = []

# Iterate over the rows in the table body
rows = table.find("tbody").find_all("tr")

for row in rows:

    # Skip rows whose class list contains "thead" (presumably header rows
    # repeated inside the table body).
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # Indexes are positions within the row's <td> cells.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    position = columns[2].text.strip()
    team = columns[3].text.strip()
    passes = int(columns[7].text.strip())
    pass_completion = str(columns[9].text.strip())  # kept as text
    assists = int(columns[21].text.strip())
    key_passes = int(columns[25].text.strip())

    # Append the data as a dictionary to the player_data list.
    # NOTE(review): "Key Passses" is misspelled, but it is the column name
    # written to the CSV, so it is left unchanged for existing consumers.
    player_data.append({
        "Player Name": player_name,
        "Position": position,
        "Team": team,
        "Passes": passes,
        "Pass Completion Rate": pass_completion,
        "Assists": assists,
        "Key Passses": key_passes
    })

# ---- Possession table (successful dribbles) ----

dribbles_data = []

# Create a BeautifulSoup object from the div HTML
dribbles_soup = BeautifulSoup(dribbles_div_html, 'html.parser')

# Find the table within the div
dribbles_table = dribbles_soup.find("table")

# Iterate over the rows in the table body
dribbles_rows = dribbles_table.find("tbody").find_all("tr")

for row in dribbles_rows:
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # NOTE(review): assumes the team is in columns[3] here, as it is in the
    # passing table above - confirm against the possession page.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    team = columns[3].text.strip()
    successful_dribbles = int(columns[15].text.strip())

    # Append the data as a dictionary to the dribbles_data list
    dribbles_data.append({
        "Player Name": player_name,
        "Team": team,
        "Successful Dribbles": successful_dribbles
    })

# Create pandas DataFrames from the data lists
passing_df = pd.DataFrame(player_data)
dribbles_df = pd.DataFrame(dribbles_data)

# Merge on player name AND team. If a name occurs on more than one row
# (e.g. a player listed under two teams), a name-only join would pair every
# passing row with every dribbles row for that name and duplicate records.
merged_df = passing_df.merge(
    dribbles_df, on=["Player Name", "Team"], how="left"
)

# Save the merged DataFrame to the passing_stats.csv file
file_path = "passing_stats.csv"
merged_df.to_csv(file_path, index=False)

## Scraping Attacking Data

In [120]:
# Scrape the shooting table and save it as forward_stats.csv in the current
# working directory.

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set path to chromedriver executable
# Replace 'path/to/chromedriver' with the actual path to your chromedriver
driver_path = 'path/to/chromedriver'

# Set up the Selenium service
service = Service(driver_path)

# Set up the WebDriver instance
driver = webdriver.Chrome(service=service, options=options)

url = "https://fbref.com/en/comps/9/shooting/Premier-League-Stats"
try:
    # Navigate to the webpage
    driver.get(url)

    # Find the parent div using Selenium
    div = driver.find_element(By.ID, "div_stats_shooting")

    # Get the HTML content of the div
    div_html = div.get_attribute("innerHTML")
finally:
    # Always shut the browser down, even when loading the page or locating
    # the div fails; otherwise the headless Chrome process is left running.
    driver.quit()

# Create a BeautifulSoup object from the div HTML
soup = BeautifulSoup(div_html, 'html.parser')

# Find the table within the div
table = soup.find("table")

# One dictionary per player row
player_data = []

# Iterate over the rows in the table body
rows = table.find("tbody").find_all("tr")

for row in rows:

    # Skip rows whose class list contains "thead" (presumably header rows
    # repeated inside the table body).
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # Indexes are positions within the row's <td> cells.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    position = columns[2].text.strip()
    team = columns[3].text.strip()
    goals = int(columns[7].text.strip())
    shots_on_target = float(columns[9].text.strip())
    goals_per_shot = str(columns[13].text.strip())  # kept as text

    # Append the data as a dictionary to the player_data list
    player_data.append({
        "Player Name": player_name,
        "Position": position,
        "Team": team,
        "Goals": goals,
        "Shots On Target": shots_on_target,
        "Goals Per Shot": goals_per_shot,
    })

# Create a pandas DataFrame from the player_data list
df = pd.DataFrame(player_data)

# Write the CSV next to wherever the notebook is being run from
current_directory = os.getcwd()
file_path = os.path.join(current_directory, "forward_stats.csv")
df.to_csv(file_path, index=False)

## General Player Stats

In [17]:
# Scrape the standard stats table and save it as general_stats.csv in the
# current working directory.

# Set up Selenium options
options = Options()
options.add_argument("--headless")  # Run Chrome in headless mode

# Set path to chromedriver executable
# Replace 'path/to/chromedriver' with the actual path to your chromedriver
driver_path = 'path/to/chromedriver'

# Set up the Selenium service
service = Service(driver_path)

# Set up the WebDriver instance
driver = webdriver.Chrome(service=service, options=options)

url = "https://fbref.com/en/comps/9/stats/Premier-League-Stats"
try:
    # Navigate to the webpage
    driver.get(url)

    # Find the parent div using Selenium
    div = driver.find_element(By.ID, "div_stats_standard")

    # Get the HTML content of the div
    div_html = div.get_attribute("innerHTML")
finally:
    # Always shut the browser down, even when loading the page or locating
    # the div fails; otherwise the headless Chrome process is left running.
    driver.quit()

# Create a BeautifulSoup object from the div HTML
soup = BeautifulSoup(div_html, 'html.parser')

# Find the table within the div
table = soup.find("table")

# One dictionary per player row
player_data = []

# Iterate over the rows in the table body
rows = table.find("tbody").find_all("tr")

for row in rows:

    # Skip rows whose class list contains "thead" (presumably header rows
    # repeated inside the table body).
    if "thead" in row.get("class", []):
        continue

    # Extract the data from each column in the row.
    # Indexes are positions within the row's <td> cells.
    columns = row.find_all("td")
    player_name = columns[0].text.strip()
    position = columns[2].text.strip()
    team = columns[3].text.strip()
    appearances = int(columns[6].text.strip())
    minutes_played = str(columns[8].text.strip())  # kept as text

    # Append the data as a dictionary to the player_data list
    player_data.append({
        "Player Name": player_name,
        "Position": position,
        "Team": team,
        "Appearances": appearances,
        "Minutes Played": minutes_played,
    })

# Create a pandas DataFrame from the player_data list
df = pd.DataFrame(player_data)

# Write the CSV next to wherever the notebook is being run from
current_directory = os.getcwd()
file_path = os.path.join(current_directory, "general_stats.csv")
df.to_csv(file_path, index=False)

## Cleaning our Data

In [11]:
# Replace missing values with 0 in the scraped CSV files, overwriting each
# file in place.

# The scraping cells write their CSVs to the current working directory, so
# read them back from there. Defining cwd here keeps this cell runnable on
# its own instead of relying on a variable set elsewhere.
cwd = os.getcwd()

# NOTE(review): general_stats.csv is not in this list - confirm whether that
# is intentional.
for csv_name in (
    "goalkeeper_stats.csv",
    "defensive_stats.csv",
    "passing_stats.csv",
    "forward_stats.csv",
):
    # Read the CSV file
    file_path = os.path.join(cwd, csv_name)
    df = pd.read_csv(file_path)

    # Fill missing values with zeros
    df = df.fillna(0)

    # Save the cleaned dataframe back over the same CSV file
    cleaned_file_path = file_path
    df.to_csv(cleaned_file_path, index=False)