In [2]:
# Dependencies: uncomment on first run to install them into the kernel's environment.
# %pip install bs4
# %pip install requests
# %pip install pandas

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from io import StringIO
import random

In [1]:
def scrape(start, end):
    """Scrape per-team La Liga match logs from fbref.com, season by season.

    The first standings page fetched is ``standings_url`` below and its data is
    labelled as season ``end``; each further season is reached by following
    that page's ``a.prev`` link, counting the label down to ``start``.
    For every squad linked from a standings table, the team's
    "Scores & Fixtures" table is downloaded and then inner-joined on ``Date``
    with each match-log table named in ``tags``, so one output frame carries
    the columns of all successfully merged stat tables.

    Args:
        start: Earliest season label to collect (inclusive).
        end: Season label given to the first standings page (inclusive).
            NOTE(review): assumes the page at ``standings_url`` really is the
            ``end`` season - the URL itself carries no year; confirm.

    Returns:
        list[pandas.DataFrame]: one frame per team per season, with ``Season``
        and ``Team`` columns added.

    Raises:
        Exception: if any page cannot be retrieved/parsed after all retries.
    """
    standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"
    tags = ["Shooting", "Goalkeeping", "Passing", "Pass Types",
            "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous Stats"]
    # Columns removed from every stat table before merging (the same fixture
    # details are already present in the "Scores & Fixtures" frame).
    shared_columns = ["Time", "Comp", "Round", "Day", "Venue",
                      "Result", "GF", "GA", "Opponent", "Match Report"]
    request_timeout = 30  # Seconds; without a timeout a stalled connection blocks forever
    all_matches = []

    # List of user-agents for rotation
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14E5239e Safari/602.1',
        # Add more if needed
    ]

    def make_request(url, pattern, convert_to_df = True, max_retries=12):
        """Fetch ``url`` and extract one element from it, retrying on failure.

        Args:
            url: Page to download.
            pattern: Text/regex passed to ``pd.read_html(match=...)`` when
                ``convert_to_df`` is true, otherwise a CSS selector.
            convert_to_df: Return the first matching table as a DataFrame
                (True) or the first element matching the selector (False).
            max_retries: Total number of attempts before giving up.

        Returns:
            tuple: (extracted table or element, the ``requests`` response).

        Raises:
            Exception: after ``max_retries`` failed attempts; the last
                underlying error is attached as ``__cause__``.
        """
        wait_time = 303  # Fixed pause in seconds between attempts (no back-off)
        last_error = None
        for attempt in range(max_retries):
            try:
                headers = {
                    'User-Agent': random.choice(user_agents)  # Rotate User-Agent
                }

                response = requests.get(url, headers=headers, timeout=request_timeout)
                response.raise_for_status()  # Raise an exception for 4xx/5xx status codes
                if convert_to_df:
                    # Attempt to parse the table using the given pattern
                    html_io = StringIO(response.text)
                    return pd.read_html(html_io, match=pattern)[0], response
                soup = BeautifulSoup(response.text, 'html.parser')
                return soup.select(pattern)[0], response
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                last_error = e
                reason = f"Request failed: {e}."
            except (ValueError, IndexError) as e:
                # Handle case where table is not found
                last_error = e
                reason = f"Table matching pattern '{pattern}' not found."
            except Exception as e:
                last_error = e
                reason = f"{e}"

            if attempt == max_retries - 1:
                # Nothing left to retry, so do not waste another pause.
                print(reason)
                break
            print(f"{reason} Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)

        raise Exception(f"Failed to retrieve {url} after {max_retries} attempts.") from last_error

    for year in range(end, start - 1, -1):
        print(f"Fetching data for the {year} season...")
        standings_table, data = make_request(standings_url, 'table.stats_table', convert_to_df = False)

        # Anchors without an href yield None, which must be skipped before the
        # substring test.
        hrefs = [a.get("href") for a in standings_table.find_all('a')]
        team_urls = [f"https://fbref.com{h}" for h in hrefs if h and '/squads/' in h]

        # The previous-season link is only needed when another season follows.
        next_standings_url = None
        if year > start:
            prev_links = BeautifulSoup(data.text, 'html.parser').select("a.prev")
            if prev_links and prev_links[0].get("href"):
                next_standings_url = f"https://fbref.com{prev_links[0].get('href')}"

        for rank, team_url in enumerate(team_urls, start=1):
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            print(f"  {rank}. Fetching data for {team_name}...")

            matches, data = make_request(team_url, "Scores & Fixtures")
            matches = matches.drop(columns=["Match Report", "Notes"], errors="ignore")

            soup = BeautifulSoup(data.text, 'html.parser')
            hrefs = [a.get("href") for a in soup.find_all('a')]
            # Skips the first matching link and keeps the next eight, which are
            # paired positionally with `tags`.
            # NOTE(review): assumes the page lists them in `tags` order - confirm.
            links = [h for h in hrefs if h and 'matchlogs/all_comps/' in h][1:9]

            # Start from the fixtures and fold every stat table into the same
            # frame; merging each table against `matches` alone would keep only
            # the last table processed.
            team_data = matches
            for tag_name, link in zip(tags, links):
                print(f"      Processing {tag_name} data...")

                tag, _ = make_request(f"https://fbref.com{link}", pattern = tag_name)

                try:
                    if isinstance(tag.columns, pd.MultiIndex):
                        tag.columns = tag.columns.droplevel()
                    tag = tag.drop(columns=shared_columns, errors="ignore")
                    # Names already present keep their spelling; clashing names
                    # coming from this table get a per-table suffix.
                    suffix = "_" + tag_name.replace(" ", "_")
                    team_data = team_data.merge(tag, on="Date", suffixes=("", suffix))
                except (KeyError, ValueError, IndexError):
                    print(f"        Skipping {tag_name} due to missing data.")
                    continue
                # time.sleep(random.uniform(20, 30))  # Random delay between 20-30 seconds

            team_data = team_data.copy()
            team_data["Season"] = year
            team_data["Team"] = team_name
            print(team_data.head(5).to_string(index=False))
            all_matches.append(team_data)
            # time.sleep(random.uniform(10, 15))  # Random delay between 10-15 seconds

        if year > start:
            if next_standings_url is None:
                print(f"No previous-season link found on {standings_url}; stopping after the {year} season.")
                break
            standings_url = next_standings_url

    return all_matches


In [None]:
# Season labels to collect: scrape() counts down from `end` to `start`, inclusive.
start, end = 2023, 2024
all_matches = scrape(start, end)

Fetching data for the 2024 season...
Request failed: 429 Client Error: Too Many Requests for url: https://fbref.com/en/comps/12/La-Liga-Stats. Waiting 303 seconds before retrying...


In [None]:
# Stack the per-team frames into one table. ignore_index=True gives every row a
# unique label; otherwise each team's own 0..n-1 index is repeated in the result.
match_df = pd.concat(all_matches, ignore_index=True)

In [None]:
# ":" is a reserved character in Windows file names, so the season range is
# joined with underscores to keep the output path portable.
match_df.to_csv(f"matches_{start}_{end}.csv")