In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Entry point for the scrape: the Premier League standings page on FBref
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Download the page and report the HTTP status code (200 means the request succeeded)
data = requests.get(url=standings_url)
print(data.status_code)

In [3]:
# Parse the standings page and pick the first table carrying the
# `stats_table` class, which is used here as the league standings table.
soup = BeautifulSoup(data.text, 'html.parser')
standings_table = soup.select('table.stats_table')[0]

# Collect the href of every anchor inside the standings table.
anchors = standings_table.find_all('a')
links = [anchor.get("href") for anchor in anchors]

# Keep only squad (team) links. `.get("href")` returns None for an anchor
# without an href, so test for truthiness first; otherwise the `in` check
# would raise TypeError on None.
links = [href for href in links if href and '/squads/' in href]

In [None]:
# Worked example: fetch fixtures and shooting stats for the first team only.
#
# Each response / parse result gets its own name so that this cell does not
# overwrite the standings variables (`data`, `soup`, `links`) that the
# standings-parsing cell reads and produces.

# Construct full team URLs from the relative squad links
team_urls = [f"https://fbref.com{l}" for l in links]

# Download the first team's page; raise on an HTTP error status (e.g. 429 when
# rate limited) rather than trying to parse an error page.
team_page = requests.get(team_urls[0])
team_page.raise_for_status()

# Extract the "Scores & Fixtures" table. The HTML is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in pandas.
matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

# Find the link(s) to the all-competitions shooting page for this team
team_soup = BeautifulSoup(team_page.text, 'html.parser')
shooting_links = [anchor.get("href") for anchor in team_soup.find_all('a')]
shooting_links = [href for href in shooting_links if href and 'all_comps/shooting/' in href]

# Download the shooting page and extract the "Shooting" table
shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}")
shooting_page.raise_for_status()
shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]

# Display the first few rows of the shooting data
shooting.head()

In [None]:
# Flatten the shooting table's header by dropping its first column level.
# Guarded on the number of levels so the cell can be re-run: once the header
# is already single-level, calling droplevel() again would raise ValueError.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

# Join the selected shooting columns onto the fixtures, matching rows by 'Date'
team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
team_data.head()

In [16]:
# Season labels to scrape, newest first: 2022 and 2021
years = [2022, 2021]

# Accumulator for the per-team DataFrames gathered by the scraping loop
all_matches = list()

In [None]:
# Starting point for the multi-season scrape: the Premier League standings page.
# The scraping loop begins here and moves to older seasons by following each
# page's "previous season" link, so set this before (re-)running the loop.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Scrape fixtures and shooting stats for every team in every requested season.
#
# For each entry in `years` the loop reads one standings page, visits each
# team's page and its shooting page, merges the two tables on 'Date', keeps
# the Premier League rows and appends the result to `all_matches`.
#
# NOTE(review): `year` is only used as a label. The pages actually scraped are
# whatever `standings_url` points at, followed by its "previous season" links.
# Confirm the first page really corresponds to years[0].

# Walk the seasons with a local cursor so the module-level `standings_url` is
# not overwritten; re-running this cell then starts from the same page again.
season_url = standings_url

for year in years:
    # Request the standings page for the current season. Raise on an HTTP error
    # status (e.g. 429 when rate limited) instead of parsing an error page.
    data = requests.get(season_url)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, 'html.parser')
    standings_table = soup.select('table.stats_table')[0]

    # Extract team links from the standings table. Anchors without an href
    # yield None from .get(), so check truthiness before the substring test.
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Remember the previous season's standings page for the next iteration.
    # This must happen before the inner loop, which reuses `soup`.
    previous_season = soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    # Loop through each team's URL to get data
    for team_url in team_urls:
        # Derive a readable team name from the last URL segment
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        data.raise_for_status()

        # Extract the fixtures table. StringIO is used because passing a
        # literal HTML string to read_html is deprecated in pandas.
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text, 'html.parser')
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]

        # Get shooting data for the team
        data = requests.get(f"https://fbref.com{links[0]}")
        data.raise_for_status()
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

        # Flatten the shooting header by dropping its first column level
        shooting.columns = shooting.columns.droplevel()

        # Pause before the next team's requests to stay under the site's rate
        # limit (a 429 status means the limit was hit). Sleeping here, before
        # the merge, ensures teams skipped by `continue` are throttled too.
        time.sleep(5)

        try:
            # Merge match and shooting data on 'Date'
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except (KeyError, ValueError):
            # Selecting a column that is absent raises KeyError; ValueError is
            # kept from the original handler. Either way, skip this team.
            continue

        # Keep only Premier League matches. Take an explicit copy so the column
        # assignments below write to an independent frame, not a slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        # Add season and team name to the data
        team_data["Season"] = year
        team_data["Team"] = team_name

        # Append the team data to the list
        all_matches.append(team_data)


In [None]:
# Number of DataFrames collected by the scraping loop (one per team appended)
len(all_matches)

In [None]:
# Stack every collected per-team frame into one table
match_df = pd.concat(objs=all_matches)

# Normalise the column labels to lowercase
lowercase_labels = []
for label in match_df.columns:
    lowercase_labels.append(label.lower())
match_df.columns = lowercase_labels

# Show the combined table
match_df

In [None]:
# Write the combined matches to "matches.csv"; the DataFrame index is left out of the file
match_df.to_csv(path_or_buf="matches.csv", index=False)