# Script to scrape player statistics
# Do teams with higher average player age perform worse in tournaments?

https://fbref.com/en/comps/9/stats/Premier-League-Stats

Collected DATA:

- Player Name
- Team
- Position
- Age
- Goals
- Assists
- Minutes played
- Matches played


In [5]:

import time
from pathlib import Path


import pandas as pd
import requests
import cloudscraper
from bs4 import BeautifulSoup, Comment


In [6]:
BASE_URL = "https://fbref.com"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://fbref.com/",
    "DNT": "1",
}

LEAGUE_CONFIGS = [
    {"league": "Premier League", "competition": "9", "slug": "Premier-League-Stats", "country": "England"},
    {"league": "La Liga", "competition": "12", "slug": "La-Liga-Stats", "country": "Spain"},
    {"league": "Serie A", "competition": "11", "slug": "Serie-A-Stats", "country": "Italy"},
    {"league": "Bundesliga", "competition": "20", "slug": "Bundesliga-Stats", "country": "Germany"},
    {"league": "Ligue 1", "competition": "13", "slug": "Ligue-1-Stats", "country": "France"},
]


def build_stats_url(config: dict) -> str:
    return f"{BASE_URL}/en/comps/{config['competition']}/stats/{config['slug']}"


In [7]:
def fetch_with_fallback(url: str, delay_seconds: int = 3) -> tuple[str, str]:
    """Fetch HTML with requests, falling back to cloudscraper on failure."""
    time.sleep(delay_seconds)
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        time.sleep(2)
        response.raise_for_status()
        return response.text, "requests"
    except Exception as error:
        print(f"Primary request blocked for {url} ({error}); retrying with cloudscraper...")
        time.sleep(5)
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False}
        )
        fallback_response = scraper.get(url, timeout=30)
        fallback_response.raise_for_status()
        return fallback_response.text, "cloudscraper"

In [8]:
def parse_players_from_html(html_text: str, league_config: dict, source: str) -> list[dict]:
    """Extract player rows from FBref stats page HTML."""
    soup = BeautifulSoup(html_text, "html.parser")
    container = soup.find(id="all_stats_standard")
    if not container:
        print(f"Stats container not found for {league_config['league']}.")
        return []

    comment_block = None
    for element in container.children:
        if isinstance(element, Comment):
            comment_block = element
            break
    if not comment_block:
        print(f"Hidden stats table not found for {league_config['league']}.")
        return []

    table_html = BeautifulSoup(comment_block, "html.parser")
    table = table_html.find("table", id="stats_standard")
    if not table:
        print(f"Stats table not located for {league_config['league']}.")
        return []

    tbody = table.find("tbody")
    if not tbody:
        print(f"Table body missing for {league_config['league']}.")
        return []

    players: list[dict] = []
    for tr in tbody.find_all("tr"):
        header = tr.find("th", attrs={"scope": "row"})
        if not header:
            continue

        def get_stat(stat: str) -> str | None:
            cell = tr.find(attrs={"data-stat": stat})
            return cell.get_text(strip=True) if cell else None

        row = {
            "Player": get_stat("player"),
            "Squad": get_stat("team"),
            "Pos": get_stat("position"),
            "Age": get_stat("age"),
            "Gls": get_stat("goals"),
            "Ast": get_stat("assists"),
            "Min": get_stat("minutes"),
            "MP": get_stat("games"),
            "Sh": get_stat("shots"),
            "SoT": get_stat("shots_on_target"),
            "xG": get_stat("xg"),
            "xAG": get_stat("xa"),
            "Cmp": get_stat("passes_completed"),
            "League": league_config["league"],
            "Country": league_config["country"],
            "Source": source,
            "Source_URL": build_stats_url(league_config),
        }

        if row["Player"] and row["Squad"]:
            players.append(row)

    return players

In [11]:
all_players: list[dict] = []

for league_cfg in LEAGUE_CONFIGS:
    url = build_stats_url(league_cfg)
    print(f"Fetching {league_cfg['league']} stats from {url}...")
    
    try:
        html_text, source = fetch_with_fallback(url)
    except Exception as fetch_error:
        print(f"Unable to fetch {league_cfg['league']} ({fetch_error}); skipping.")
        continue

    league_players = parse_players_from_html(html_text, league_cfg, source)
    print(f"Collected {len(league_players)} players for {league_cfg['league']} via {source}.")
    all_players.extend(league_players)
    time.sleep(2)

if not all_players:
    raise RuntimeError("No player rows gathered. Check league URLs or parsing logic.")

print(f"\nTotal players collected: {len(all_players)}")

Fetching Premier League stats from https://fbref.com/en/comps/9/stats/Premier-League-Stats...
Primary request blocked for https://fbref.com/en/comps/9/stats/Premier-League-Stats (403 Client Error: Forbidden for url: https://fbref.com/en/comps/9/stats/Premier-League-Stats); retrying with cloudscraper...
Collected 468 players for Premier League via cloudscraper.
Fetching La Liga stats from https://fbref.com/en/comps/12/stats/La-Liga-Stats...
Primary request blocked for https://fbref.com/en/comps/12/stats/La-Liga-Stats (403 Client Error: Forbidden for url: https://fbref.com/en/comps/12/stats/La-Liga-Stats); retrying with cloudscraper...
Collected 500 players for La Liga via cloudscraper.
Fetching Serie A stats from https://fbref.com/en/comps/11/stats/Serie-A-Stats...
Primary request blocked for https://fbref.com/en/comps/11/stats/Serie-A-Stats (403 Client Error: Forbidden for url: https://fbref.com/en/comps/11/stats/Serie-A-Stats); retrying with cloudscraper...
Collected 493 players for S

In [12]:
players_df = pd.DataFrame(all_players)
print(f"DataFrame shape: {players_df.shape}")
print(f"Columns: {list(players_df.columns)}")
players_df.head(10)

DataFrame shape: (2346, 17)
Columns: ['Player', 'Squad', 'Pos', 'Age', 'Gls', 'Ast', 'Min', 'MP', 'Sh', 'SoT', 'xG', 'xAG', 'Cmp', 'League', 'Country', 'Source', 'Source_URL']


Unnamed: 0,Player,Squad,Pos,Age,Gls,Ast,Min,MP,Sh,SoT,xG,xAG,Cmp,League,Country,Source,Source_URL
0,Brenden Aaronson,Leeds United,"FW,MF",25-055,1,2,835,15,,,2.1,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
1,Zach Abbott,Nott'ham Forest,DF,19-217,0,0,45,1,,,0.0,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
2,Joshua Acheampong,Chelsea,DF,19-225,1,0,349,5,,,0.6,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
3,Tyler Adams,Bournemouth,MF,26-305,2,1,1195,15,,,0.5,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
4,Tosin Adarabioyo,Chelsea,DF,28-083,0,0,561,9,,,0.1,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
5,Simon Adingra,Sunderland,"FW,MF",23-349,0,0,320,9,,,0.1,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
6,Amine Adli,Bournemouth,"FW,MF",25-220,1,0,393,12,,,0.9,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
7,Emmanuel Agbadou,Wolves,DF,28-182,0,0,1011,13,,,0.5,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
8,Nayef Aguerd,West Ham,DF,29-261,0,0,180,2,,,0.1,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
9,Ola Aina,Nott'ham Forest,DF,29-069,0,0,270,3,,,0.0,,,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...


In [13]:
players_df.to_csv("../data_raw/playerStats.csv", index=False)