Importing the necessary libraries.

In [None]:
import requests
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
from io import StringIO

Scraping data from the URL

In [None]:
# Season overview page for the 2023-2024 Premier League on fbref.
dataset_url = "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"
# Browser-like User-Agent sent with the request (presumably because the site
# rejects the default python-requests agent -- TODO confirm).
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}
# A timeout keeps the cell from hanging forever on a stalled connection.
data = requests.get(dataset_url, headers=headers, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page in later cells.
data.raise_for_status()
# Preview only the start of the page; the full HTML would flood the cell output.
data.text[:500]

Getting the League standing table data from the website using Beautiful Soup

In [None]:
# Parse the downloaded page and take the first table carrying the
# "stats_table" class (expected to be the league standings table).
soup = BeautifulSoup(data.content, "html.parser")
standing_table = soup.select('table.stats_table')[0]

# Collect the href of every <a> element inside the standings table.
team_links = [a.get("href") for a in standing_table.find_all('a')]
# Keep only the squad links. `l and` skips anchors without an href, for which
# .get() returns None and the `in` test would raise a TypeError.
team_links = [l for l in team_links if l and '/squads/' in l]
team_links

Converting the relative URLs to absolute URLs

In [None]:
# Prefix every relative squad path with the site root so it can be requested directly.
team_absoluteurl = []
for relative_path in team_links:
    team_absoluteurl.append(f"https://fbref.com{relative_path}")
team_absoluteurl

Extracting one team's match stats and other details using Pandas and Requests

In [None]:
# Download the page of the first team in the list and read its
# "Scores & Fixtures" table.
oneteam_url = team_absoluteurl[0]
# Send the same headers as the season-page request and stop on an HTTP error
# rather than parsing an error page.
oneteam_data = requests.get(oneteam_url, headers=headers, timeout=30)
oneteam_data.raise_for_status()
# Passing literal HTML to read_html is deprecated; wrap the text in StringIO.
# `matches` stays a list of DataFrames -- the wanted table is matches[0].
matches = pd.read_html(StringIO(oneteam_data.text), match="Scores & Fixtures")
matches[0].head()

In [None]:
# Find the team's shooting-stats page from the links on the team page.
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed (and to avoid bs4's "no parser" warning).
soup = BeautifulSoup(oneteam_data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [l for l in links if l and 'all_comps/shooting/' in l]
# Give a clear error instead of an IndexError when the page has no such link.
if not links:
    raise ValueError(f"No shooting stats link found on {oneteam_url}")
shooting_data = requests.get(f"https://fbref.com{links[0]}", headers=headers, timeout=30)
shooting_data.raise_for_status()

# Index [0] is taken as the shooting done by this team; per the original note,
# the table of shots against the team would be the next index, [1].
shooting = pd.read_html(StringIO(shooting_data.text), match="Shooting")[0]
# Drop the top level of the two-level column header so columns can be
# addressed by their plain names ("Date", "Sh", ...).
shooting.columns = shooting.columns.droplevel()
shooting.head()

Merging the two DataFrames (matches data and shooting data) for this single team using Pandas

In [None]:
# Attach the selected shooting columns to the match rows, joining on "Date".
shooting_subset = shooting[["Date","Sh","SoT","Dist","FK","PK","PKatt"]]
oneteam_combineddata = pd.merge(matches[0], shooting_subset, on="Date")

# Sanity check: print the shapes of both inputs and of the merged frame.
for frame in (matches[0], shooting, oneteam_combineddata):
    print(frame.shape)

Scraping the Match Stats Data and Shooting data for multiple teams and multiple seasons 

In [None]:
# Declaration of variables
years = list(range(2023, 2020, -1))  # 2023, 2022, 2021 -- newest season first
allteams_combined_data = []
dataset_url = "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"
batch_size = 5  # teams scraped between the longer 60-second pauses
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch_html(url):
    """Download ``url`` and return the response body as text.

    Sends the ``headers`` dict defined in the earlier scraping cell, like the
    first request of the notebook. Raises ``requests.HTTPError`` on a 4xx/5xx
    reply so an error page is never handed to the HTML parsers.
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.text


def _scrape_team(team_url, team_name, year):
    """Return one team's Premier League matches merged with its shooting stats.

    Parameters:
        team_url: absolute URL of the team page.
        team_name: value stored in the "Team" column of the result.
        year: value stored in the "Season" column of the result.

    Returns a new DataFrame, or ``None`` when the team page contains no
    'all_comps/shooting/' link. Download, parsing and merge errors propagate
    to the caller.
    """
    team_html = _fetch_html(team_url)
    # read_html reads a file-like object to its end, so it gets its own
    # StringIO. BeautifulSoup is given the string itself; sharing one stream
    # would leave it an empty document and no links would ever be found.
    matches_df = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

    team_soup = BeautifulSoup(team_html, "html.parser")
    hrefs = [a.get("href") for a in team_soup.find_all("a")]
    shooting_links = [h for h in hrefs if h and "all_comps/shooting/" in h]
    if not shooting_links:
        return None

    shooting_html = _fetch_html(f"https://fbref.com{shooting_links[0]}")
    shooting_df = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
    # Drop the top level of the two-level header so plain column names work.
    shooting_df.columns = shooting_df.columns.droplevel()

    merged = matches_df.merge(shooting_df[shooting_columns], on="Date")
    premier_league = merged[merged["Comp"] == "Premier League"]
    # assign() returns a new frame, so no columns are set on a filtered slice
    # (which triggers SettingWithCopyWarning).
    return premier_league.assign(Season=year, Team=team_name)


for year in years:
    if dataset_url is None:
        print(f"No previous-season link found; stopping before {year}.")
        break

    year_data = []
    # Getting the Multiple Team Link for each year
    season_html = _fetch_html(dataset_url)
    time.sleep(2)
    season_soup = BeautifulSoup(season_html, "html.parser")
    standing_table = season_soup.select("table.stats_table")[0]

    # Getting the Team links; `l and` skips anchors that have no href.
    team_links = [a.get("href") for a in standing_table.find_all("a")]
    team_links = [l for l in team_links if l and "/squads/" in l]
    team_urls = [f"https://fbref.com{l}" for l in team_links]

    # Update URL to the previous season (None when the page has no such link).
    previous_season_links = season_soup.select("a.prev")
    if previous_season_links:
        dataset_url = f"https://fbref.com{previous_season_links[0].get('href')}"
    else:
        dataset_url = None

    # Process teams in batches
    for i in range(0, len(team_urls), batch_size):
        batch_urls = team_urls[i:i + batch_size]

        for team_url in batch_urls:
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            try:
                team_data = _scrape_team(team_url, team_name, year)
                if team_data is None:
                    print(f"No shooting link found for {team_name} for {year}")
                else:
                    year_data.append(team_data)
                    print(f"Processed {team_name} for {year}")
            except Exception as e:
                # Best effort: report the failure and move on to the next team.
                print(f"Error processing {team_name} for {year}: {e}")
            finally:
                # Wait for 30 seconds after every team, including failed ones,
                # so errors do not speed up the request rate.
                time.sleep(30)

        # Wait for 60 seconds after processing each batch
        print("Waiting for 60 seconds before processing next batch...")
        time.sleep(60)

    # Combine year data and append to all teams combined data
    if year_data:
        year_data_df = pd.concat(year_data)
        year_data_df.columns = [c.lower() for c in year_data_df.columns]
        allteams_combined_data.append(year_data_df)
        print(f"Data for {year} processed and added to the combined data.")

# Combine all years' data and save to CSV
if allteams_combined_data:
    match_df = pd.concat(allteams_combined_data)
    match_df.to_csv("matches.csv", index=False)
    print("All data saved to matches.csv")
