# Script to scrape match and possession data

# Does possession correlate with match outcomes?


https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures


COLLECTED DATA:


    Team

    Position

    Age

    Goals

    Assists

    Minutes played

    Matches played

#### Web-scraping plan

1. Target pages  
   - Season schedule page for match list  
   - Individual match report pages for possession and goals  
   - Optional advanced stats tables

2. Data fields  
   - Match metadata (ID, date, competition, season, venue)  
   - Teams (home, away)  
   - Possession shares (home %, away %)  
   - Scoreline and result indicators  
   - Optional controls (shots, xG, cards)

3. Retrieval workflow  
   - Fetch schedule table, extract match links  
   - Respect request delays, rotate headers, and fall back to cloudscraper when a request is blocked  
   - Parse HTML (tables hidden in comments) with BeautifulSoup

4. Storage  
   - Append cleaned rows to `data_raw/match_possession.csv`  
   - Save run timestamp and source URL

5. Validation  
   - Ensure numeric conversions succeed  
   - Check required columns not missing  
   - Log failures for manual review

In [36]:

import csv
from itertools import zip_longest
from pathlib import Path
import time
import pandas as pd
import numpy as np
import requests
import cloudscraper
from bs4 import BeautifulSoup, Comment


In [37]:
# Site root; schedule URLs are built from it and scraped links are resolved
# against it.
BASE_URL = "https://fbref.com"

# Browser-like headers sent with the primary `requests` call.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Keys of every league configuration, in the order they appear in each dict.
# "competition" and "slug" are the two values interpolated into the schedule
# URL; "league" and "country" are copied onto every scraped match record.
_LEAGUE_FIELDS = ("league", "competition", "slug", "country")

# One row per league, laid out in `_LEAGUE_FIELDS` order.
_LEAGUE_ROWS = (
    ("Premier League", "9", "Premier-League-Scores-and-Fixtures", "England"),
    ("La Liga", "12", "La-Liga-Scores-and-Fixtures", "Spain"),
    ("Serie A", "11", "Serie-A-Scores-and-Fixtures", "Italy"),
    ("Bundesliga", "20", "Bundesliga-Scores-and-Fixtures", "Germany"),
    ("Ligue 1", "13", "Ligue-1-Scores-and-Fixtures", "France"),
)

LEAGUE_CONFIGS = [dict(zip(_LEAGUE_FIELDS, row)) for row in _LEAGUE_ROWS]

def build_schedule_url(config: dict) -> str:
    """Return the schedule-page URL for one league configuration.

    Args:
        config: League configuration; its ``"competition"`` and ``"slug"``
            values are placed into the URL path.

    Returns:
        The absolute URL, rooted at ``BASE_URL``.

    Raises:
        KeyError: If ``config`` lacks ``"competition"`` or ``"slug"``.
    """
    competition_id = config["competition"]
    page_slug = config["slug"]
    return f"{BASE_URL}/en/comps/{competition_id}/schedule/{page_slug}"


In [38]:
def fetch_with_fallback(url: str, delay_seconds: int = 3) -> tuple[str, str]:
    """Download ``url`` and report which HTTP client succeeded.

    A plain ``requests`` GET (sent with ``HEADERS``) is tried first. If it
    fails with a ``requests`` error, including an HTTP error status raised by
    ``raise_for_status``, the URL is retried once through a ``cloudscraper``
    session.

    Args:
        url: Absolute URL to fetch.
        delay_seconds: Seconds to sleep before the first attempt.

    Returns:
        ``(body_text, source)`` where ``source`` is ``"requests"`` or
        ``"cloudscraper"``.

    Raises:
        Exception: Whatever the cloudscraper retry raises (for example
            ``requests.HTTPError`` from ``raise_for_status``); the retry is
            not guarded.
    """
    time.sleep(delay_seconds)
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        time.sleep(2)
        response.raise_for_status()
        return response.text, "requests"
    except requests.RequestException as error:
        # Only network/HTTP failures trigger the fallback; unrelated errors
        # (e.g. programming mistakes) propagate instead of being retried.
        print(f"Primary request blocked for {url} ({error}); retrying with cloudscraper...")
        time.sleep(5)
        # The scraper is a requests session; the context manager closes its
        # connections once the body has been read.
        with cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False}
        ) as scraper:
            fallback_response = scraper.get(url, timeout=30)
            fallback_response.raise_for_status()
            return fallback_response.text, "cloudscraper"


In [39]:
def _match_id_from_href(href: str) -> str | None:
    """Return the match identifier embedded in a match-report link path."""
    segments = href.strip("/").split("/") if href else []
    if "matches" in segments:
        # The identifier is the path segment right after "matches".
        following = segments.index("matches") + 1
        return segments[following] if following < len(segments) else None
    # NOTE(review): links without a "matches" segment fall back to their last
    # path segment; confirm that such links (if the page has any) should be kept.
    return segments[-1] if segments else None


def _split_score(score_txt: str | None) -> tuple[int | None, int | None]:
    """Turn a ``"home-away"`` score string into two ints, or ``(None, None)``."""
    if not score_txt:
        return None, None
    # En and em dashes are normalised to a plain hyphen before splitting.
    pieces = score_txt.replace("\u2013", "-").replace("\u2014", "-").split("-")
    if len(pieces) != 2:
        return None, None
    try:
        return int(pieces[0]), int(pieces[1])
    except ValueError:
        return None, None


def _outcome(home_score: int | None, away_score: int | None) -> str | None:
    """Return ``"H"``, ``"A"`` or ``"D"``; ``None`` when either score is missing."""
    if home_score is None or away_score is None:
        return None
    if home_score > away_score:
        return "H"
    return "A" if home_score < away_score else "D"


def _to_int(text: str | None) -> int | None:
    """Parse an integer that may contain thousands separators; ``None`` on failure."""
    if not text:
        return None
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None


def parse_schedule_rows(html_text: str, league_config: dict, schedule_url: str, schedule_source: str) -> list[dict]:
    """Extract one record per linked match from a schedule page.

    Every ``<td data-stat="match_report">`` cell that contains a link with a
    usable identifier yields a record built from the sibling cells of the same
    table row.

    Args:
        html_text: HTML of the schedule page.
        league_config: League configuration; ``"league"``, ``"country"`` and
            ``"competition"`` are copied onto each record.
        schedule_url: URL the page was fetched from (stored as ``source_url``).
        schedule_source: Name of the client that fetched it (stored as ``source``).

    Returns:
        A list of dicts, one per match, in page order.
    """
    document = BeautifulSoup(html_text, "html.parser")
    records: list[dict] = []
    wanted_stats = ("date", "home_team", "away_team", "venue", "attendance", "score")

    for report_cell in document.find_all("td", {"data-stat": "match_report"}):
        link = report_cell.find("a")
        if not link or not link.get("href"):
            continue

        href = link["href"]
        match_link = requests.compat.urljoin(BASE_URL, href)
        match_id = _match_id_from_href(href)
        if not match_id:
            continue

        # The remaining fields live in other cells of the same table row.
        table_row = report_cell.parent
        text_by_stat: dict = {}
        for stat in wanted_stats:
            cell = table_row.find("td", {"data-stat": stat})
            text_by_stat[stat] = cell.get_text(strip=True) if cell else None

        # An empty score cell is stored as None rather than "".
        score_txt = text_by_stat["score"] or None
        home_score, away_score = _split_score(score_txt)

        records.append(
            {
                "match_id": match_id,
                "league": league_config["league"],
                "country": league_config["country"],
                "competition_id": league_config["competition"],
                "date": text_by_stat["date"],
                "home_team": text_by_stat["home_team"],
                "away_team": text_by_stat["away_team"],
                "score": score_txt,
                "home_score": home_score,
                "away_score": away_score,
                "result": _outcome(home_score, away_score),
                "venue": text_by_stat["venue"],
                "attendance": _to_int(text_by_stat["attendance"]),
                "match_report_url": match_link,
                "source_url": schedule_url,
                "source": schedule_source,
            }
        )

    return records


In [40]:
# Echo the league configurations that the scraping loop iterates over.
LEAGUE_CONFIGS

[{'league': 'Premier League',
  'competition': '9',
  'slug': 'Premier-League-Scores-and-Fixtures',
  'country': 'England'},
 {'league': 'La Liga',
  'competition': '12',
  'slug': 'La-Liga-Scores-and-Fixtures',
  'country': 'Spain'},
 {'league': 'Serie A',
  'competition': '11',
  'slug': 'Serie-A-Scores-and-Fixtures',
  'country': 'Italy'},
 {'league': 'Bundesliga',
  'competition': '20',
  'slug': 'Bundesliga-Scores-and-Fixtures',
  'country': 'Germany'},
 {'league': 'Ligue 1',
  'competition': '13',
  'slug': 'Ligue-1-Scores-and-Fixtures',
  'country': 'France'}]

In [41]:
# Rows parsed from every configured league's schedule page end up here.
all_match_records: list[dict] = []

for league_cfg in LEAGUE_CONFIGS:
    schedule_url = build_schedule_url(league_cfg)
    html_text, schedule_source = fetch_with_fallback(schedule_url)
    print(f"Fetched {league_cfg['league']} schedule via {schedule_source}.")

    league_records = parse_schedule_rows(
        html_text=html_text,
        league_config=league_cfg,
        schedule_url=schedule_url,
        schedule_source=schedule_source,
    )
    print(f"Collected {len(league_records)} matches for {league_cfg['league']}.")

    all_match_records += league_records
    # Extra pause between leagues, on top of the delays inside the fetch helper.
    time.sleep(2)

# Stop here if nothing at all was scraped.
if not all_match_records:
    raise RuntimeError("No match data collected. Verify league configurations or network access.")


Primary request blocked for https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures (403 Client Error: Forbidden for url: https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures); retrying with cloudscraper...
Fetched Premier League schedule via cloudscraper.
Collected 380 matches for Premier League.
Primary request blocked for https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures (403 Client Error: Forbidden for url: https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures); retrying with cloudscraper...
Fetched La Liga schedule via cloudscraper.
Collected 379 matches for La Liga.
Primary request blocked for https://fbref.com/en/comps/11/schedule/Serie-A-Scores-and-Fixtures (403 Client Error: Forbidden for url: https://fbref.com/en/comps/11/schedule/Serie-A-Scores-and-Fixtures); retrying with cloudscraper...
Fetched Serie A schedule via cloudscraper.
Collected 380 matches for Serie A.
Primary request blocked for https://fbref.co

In [42]:
# One DataFrame row per collected match record; columns come from the dict keys.
match_details_df = pd.DataFrame(data=all_match_records)
# Preview the first rows as the cell output.
match_details_df.head(n=5)

Unnamed: 0,match_id,league,country,competition_id,date,home_team,away_team,score,home_score,away_score,result,venue,attendance,match_report_url,source_url,source
0,a071faa8,Premier League,England,9,2025-08-15,Liverpool,Bournemouth,4–2,4.0,2.0,H,Anfield,60315.0,https://fbref.com/en/matches/a071faa8/Liverpoo...,https://fbref.com/en/comps/9/schedule/Premier-...,cloudscraper
1,bbdf4739,Premier League,England,9,2025-08-16,Aston Villa,Newcastle Utd,0–0,0.0,0.0,D,Villa Park,42526.0,https://fbref.com/en/matches/bbdf4739/Aston-Vi...,https://fbref.com/en/comps/9/schedule/Premier-...,cloudscraper
2,e1058522,Premier League,England,9,2025-08-16,Sunderland,West Ham,3–0,3.0,0.0,H,Stadium of Light,46233.0,https://fbref.com/en/matches/e1058522/Sunderla...,https://fbref.com/en/comps/9/schedule/Premier-...,cloudscraper
3,57c49bae,Premier League,England,9,2025-08-16,Brighton,Fulham,1–1,1.0,1.0,D,The American Express Stadium,31478.0,https://fbref.com/en/matches/57c49bae/Brighton...,https://fbref.com/en/comps/9/schedule/Premier-...,cloudscraper
4,d41bb8b3,Premier League,England,9,2025-08-16,Tottenham,Burnley,3–0,3.0,0.0,H,Tottenham Hotspur Stadium,61077.0,https://fbref.com/en/matches/d41bb8b3/Tottenha...,https://fbref.com/en/comps/9/schedule/Premier-...,cloudscraper


In [44]:
# Write the match table to CSV without the index column. The target directory
# is created first because to_csv does not create missing parent directories.
output_path = Path("../data_raw/matchDetails.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
match_details_df.to_csv(output_path, index=False)