In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from io import StringIO
import random

In [12]:
def scrape(start, end):
    standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"
    tags = ["Shooting", "Goalkeeping", "Passing", "Pass Types",
            "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous Stats"]
    all_matches = []

    # List of user-agents for rotation
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14E5239e Safari/602.1',
        # Add more if needed
    ]

    # Optional: Proxy list for rotating IPs
    proxies = [
        # 'http://proxy1.com:8080',
        # 'http://proxy2.com:8080',
        # Add more proxies here if available
    ]

    def make_request(url, pattern, max_retres=12):
        wait_time = 303  # Initial wait time in seconds
        for attempt in range(max_retries):
            try:
                headers = {
                    'User-Agent': random.choice(user_agents)  # Rotate User-Agent
                }

                # Optional: Use a random proxy if you have a proxy list
                proxy = {'http': random.choice(proxies)} if proxies else None

                response = requests.get(url, headers=headers, proxies=proxy)
                response.raise_for_status()  # Raise an exception for 4xx/5xx status codes

                # Detect if we're blocked or rate-limited by inspecting the title/content
                html_io = StringIO(response.text)
                

                # If no issues, return the response
                return pd.read_html(html_io, match=pattern)[0]

            except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
                print(f"Request failed: {e}. Waiting {wait_time} seconds before retrying...")
            except Exception as e:
                print(f"{e} Waiting {wait_time} seconds before retrying...")
            except ValueError:


            # Increase wait time with each attempt to avoid being blocked
            time.sleep(wait_time)

        raise Exception(f"Failed to retrieve {url} after {max_retries} attempts.")

    for year in range(end, start - 1, -1):
        print(f"Fetching data for the {year} season...")
        data = make_request(standings_url)
        html_io = StringIO(data.text)
        soup = BeautifulSoup(data.text, 'html.parser')
        standings_table = soup.select('table.stats_table')[0]

        links = [l.get("href") for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f"https://fbref.com{l}" for l in links]

        previous_season = soup.select("a.prev")[0].get("href")
        standings_url = f"https://fbref.com{previous_season}"

        for rank, team_url in enumerate(team_urls):
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            print(f"  {rank + 1}. Fetching data for {team_name}...")

            data = make_request(team_url)
            html_io = StringIO(data.text)
            matches = pd.read_html(html_io, match="Scores & Fixtures")[0].drop(columns=["Match Report", "Notes"])

            soup = BeautifulSoup(data.text, 'html.parser')
            links = [l.get("href") for l in soup.find_all('a')]
            links = [l for l in links if l and 'matchlogs/all_comps/' in l][1:9]

            for i, link in enumerate(links):
                print(f"    Processing {tags[i]} data...")

                data = make_request(f"https://fbref.com{link}")
                html_io = StringIO(data.text)
                tag = pd.read_html(html_io, match=tags[i])[0]
                tag.columns = tag.columns.droplevel()
                tag = tag.drop(columns=["Time", "Comp", "Round", "Day", "Venue", 
                                        "Result", "GF", "GA", "Opponent", "Match Report"])

                try:
                    team_data = matches.merge(tag, on="Date")
                except (ValueError, IndexError):
                    print(f"      Skipping {tags[i]} due to missing data.")
                    continue
                time.sleep(random.uniform(20, 30))  # Random delay between 20-30 seconds

            team_data["Season"] = year
            team_data["Team"] = team_name
            print(team_data.head(5).to_string(index=False))
            all_matches.append(team_data)
            time.sleep(random.uniform(10, 15))  # Random delay between 10-15 seconds

    return all_matches


In [13]:
all_matches = scrape(2024, 2024)

IndexError: list index out of range

In [None]:
match_df = pd.concat(all_matches)

In [None]:
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United


In [None]:
match_df.to_csv("matches.csv")