In [1]:
import os
import random
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [14]:
def combine_columns(col, team_name):
    """Collapse a two-level column label into a single flat name.

    Parameters
    ----------
    col : sequence
        Two-level label: ``col[0]`` is the upper (group) header and
        ``col[1]`` the lower (column) header.
    team_name : str
        Team name to look for inside the upper header.

    Returns
    -------
    The lower header on its own when the upper header contains ``'Unnamed'``
    or ``team_name``; otherwise ``'<upper>__<lower>'``.
    """
    header = col[0]
    # The upper header is kept only when it is neither a placeholder
    # ('Unnamed...') nor a label that mentions the team itself.
    keep_header = not ('Unnamed' in header or team_name in header)
    return f'{header}__{col[1]}' if keep_header else col[1]

In [15]:
def _fetch_table(url, pattern, wait_time, max_attempts):
    """Download ``url`` and return the first HTML table matching ``pattern``.

    ``pd.read_html`` raises ``ValueError`` when the page holds no matching
    table; in that case the request is retried after ``wait_time`` seconds,
    up to ``max_attempts`` attempts in total.

    Raises
    ------
    RuntimeError
        If no matching table was found in any attempt.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url)
            return pd.read_html(StringIO(response.text), match=pattern)[0]
        except ValueError:
            print(f"No tables found matching pattern {pattern}")
            # Sleeping after the last attempt would only delay the error.
            if attempt < max_attempts:
                print(f"Attempt {attempt}.  Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
    raise RuntimeError(f"Failed to fetch table matching '{pattern}' after {max_attempts} attempts")


def scrape(start, end, rank_start = 0, rank_end = 20,
           output_dir = "team_data", wait_time = 303, max_attempts = 13):
    """Scrape per-match La Liga team statistics from fbref.com into CSV files.

    Seasons are processed from ``end`` down to ``start`` (both inclusive).
    For each season the standings page is read, the teams at positions
    ``rank_start:rank_end`` of the standings table are selected, and for each
    team the "Scores & Fixtures" table is merged on ``"Date"`` with every
    available stat table. The result is written to
    ``<output_dir>/<team_name>_<year>.csv`` as soon as a team is finished.

    Parameters
    ----------
    start, end : int
        First and last season to scrape. A season is identified by its
        starting year: for ``end < 2024`` the standings URL is built as
        ``{end}-{end+1}``; otherwise the competition's undated URL is used
        (presumably the current season -- the 2024 cut-off is hard-coded).
    rank_start, rank_end : int
        Slice bounds applied to the list of team links found in the standings
        table, e.g. to resume an interrupted run.
    output_dir : str
        Directory receiving the CSV files; created if missing.
    wait_time : int or float
        Seconds to wait before retrying a stat page without a matching table.
    max_attempts : int
        Attempts per stat page before giving up.

    Raises
    ------
    RuntimeError
        If a stat table cannot be fetched within ``max_attempts`` attempts.
    """
    if end < 2024:
        standings_url = f"https://fbref.com/en/comps/12/{end}-{end+1}/{end}-{end+1}-La-Liga-Stats"
    else:
        standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"

    # (table caption used as read_html pattern, URL path segment of the page).
    # Selecting links by path segment rather than by position keeps captions
    # and pages aligned even when a team page lists its links differently.
    # NOTE(review): "misc" does not appear in the recorded notebook output --
    # confirm against a team page.
    stat_pages = (
        ("Shooting", "shooting"),
        ("Goalkeeping", "keeper"),
        ("Passing", "passing"),
        ("Pass Types", "passing_types"),
        ("Goal and Shot Creation", "gca"),
        ("Defensive Actions", "defense"),
        ("Possession", "possession"),
        ("Miscellaneous Stats", "misc"),
    )
    # Columns the stat tables share with the fixtures table; dropped before
    # merging so that only "Date" and the stat columns are joined in.
    shared_columns = ["Time", "Comp", "Round", "Day", "Venue", "Result",
                      "GF", "GA", "Opponent", "Match Report"]

    os.makedirs(output_dir, exist_ok=True)

    for year in range(end, start - 1, -1):
        print(f"Fetching data for the {year} season...")
        standings_html = requests.get(standings_url).text
        standings_soup = BeautifulSoup(standings_html, 'html.parser')
        standings_table = standings_soup.select('table.stats_table')[0]

        # Anchors without an href yield None, which must be filtered out
        # before the substring test.
        hrefs = [a.get("href") for a in standings_table.find_all('a')]
        team_urls = [f"https://fbref.com{h}" for h in hrefs if h and '/squads/' in h]
        team_urls = team_urls[rank_start:rank_end]

        # The previous-season link is only needed if another season follows.
        if year > start:
            previous_season = standings_soup.select("a.prev")[0].get("href")
            standings_url = f"https://fbref.com{previous_season}"

        for rank, team_url in enumerate(team_urls, start=rank_start + 1):
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            print(f"  {rank}. Fetching data for {team_name}...")

            team_html = requests.get(team_url).text
            team_data = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0].drop(columns=["Match Report"])

            team_soup = BeautifulSoup(team_html, 'html.parser')
            page_hrefs = [a.get("href") for a in team_soup.find_all('a')]
            page_hrefs = [h for h in page_hrefs if h]

            for tag_name, segment in stat_pages:
                marker = f"/matchlogs/all_comps/{segment}/"
                link = next((h for h in page_hrefs if marker in h), None)
                if link is None:
                    # Nothing was requested, so no pause is needed either.
                    print(f"             Skipping {tag_name}: no match-log link found.")
                    continue

                print(f"                 Processing {tag_name} data...")
                print(link)
                tag = _fetch_table(f"https://fbref.com{link}", tag_name, wait_time, max_attempts)
                tag.columns = [combine_columns(col, team_name) for col in tag.columns]
                tag = tag.drop(columns=shared_columns)

                try:
                    team_data = team_data.merge(tag, on="Date")
                except ValueError:
                    print(f"             Skipping {tag_name} due to missing data.")
                # Pause after every request, including when the merge was
                # skipped, so consecutive requests are always spaced out.
                time.sleep(random.uniform(20, 30))

            team_data["Season"] = year
            team_data["Team"] = team_name
            # Save team data to a CSV file after each team is scraped
            file_path = os.path.join(output_dir, f"{team_name}_{year}.csv")
            team_data.to_csv(file_path, index=False)
            print(f"             Data for {team_name} in {year} saved to {file_path}.")

In [None]:
# Scrape only the 2024 season, limited to the standings-table slice [5:20]
# (the progress log numbers the first of these teams as 6).
start = end = 2024
rank_start, rank_end = 5, 20
scrape(start, end, rank_start=rank_start, rank_end=rank_end)

Fetching data for the 2024 season...
  6. Fetching data for Mallorca...
                 Processing Shooting data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/shooting/Mallorca-Match-Logs-All-Competitions
                 Processing Goalkeeping data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/keeper/Mallorca-Match-Logs-All-Competitions
                 Processing Passing data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/passing/Mallorca-Match-Logs-All-Competitions
                 Processing Pass Types data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/passing_types/Mallorca-Match-Logs-All-Competitions
                 Processing Goal and Shot Creation data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/gca/Mallorca-Match-Logs-All-Competitions
                 Processing Defensive Actions data...
/en/squads/2aa12281/2024-2025/matchlogs/all_comps/defense/Mallorca-Match-Logs-All-Competitions
                 Processing Possession data...
/en/squads/2

In [2]:
# Directory where all team data CSVs are stored
data_dir = "team_data"

# os.listdir returns entries in arbitrary order; sort the paths so the row
# order of the combined frame (and of the saved CSV) is the same on every run.
all_files = sorted(
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if name.endswith('.csv')
)

# Read every per-team CSV and stack them into one frame with a fresh index
all_data = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)

# Save the concatenated data to a new CSV
all_data.to_csv("all_teams_data.csv", index=False)


In [3]:
# Number of per-team CSV files that were combined
print(len(all_files))

# NumPy array built from the combined frame; the cell displays its
# (rows, columns) shape as the output.
x = np.array(all_data)
x.shape

160


(6630, 168)