In [1]:
import requests

In [2]:
standings_url = "https://www.premierleague.com/tables"

In [3]:
data = requests.get(standings_url)

In [4]:
from bs4 import BeautifulSoup

In [5]:
soup = BeautifulSoup(data.text)

In [6]:
standings_table = soup.select('table')[0]

In [7]:
links = standings_table.find_all('a')

In [8]:
links = [l.get("href") for l in links]

In [9]:
links = [l for l in links if '/clubs/' in l]

In [10]:
team_urls = list(set(f"https://www.premierleague.com{l}/results" for l in links if '/clubs/' in l))

In [11]:
season_ids = [274, 363, 418, 489, 578, 719]

team_urls = set()
for l in links:
    base = l.split('/overview')[0] if '/overview' in l else l
    for season_id in season_ids:
        full_url = f"https://www.premierleague.com{base}/results?co=1&se={season_id}&cl=-1"
        team_urls.add(full_url)

team_urls = list(team_urls)

In [12]:
team_urls

['https://www.premierleague.com/clubs/127/Bournemouth/results?co=1&se=719&cl=-1',
 'https://www.premierleague.com/clubs/7/Everton/results?co=1&se=363&cl=-1',
 'https://www.premierleague.com/clubs/127/Bournemouth/results?co=1&se=418&cl=-1',
 'https://www.premierleague.com/clubs/130/Brentford/results?co=1&se=274&cl=-1',
 'https://www.premierleague.com/clubs/10/Liverpool/results?co=1&se=578&cl=-1',
 'https://www.premierleague.com/clubs/12/Manchester-United/results?co=1&se=578&cl=-1',
 'https://www.premierleague.com/clubs/131/Brighton-and-Hove-Albion/results?co=1&se=274&cl=-1',
 'https://www.premierleague.com/clubs/26/Leicester-City/results?co=1&se=719&cl=-1',
 'https://www.premierleague.com/clubs/6/Crystal-Palace/results?co=1&se=489&cl=-1',
 'https://www.premierleague.com/clubs/7/Everton/results?co=1&se=489&cl=-1',
 'https://www.premierleague.com/clubs/11/Manchester-City/results?co=1&se=274&cl=-1',
 'https://www.premierleague.com/clubs/10/Liverpool/results?co=1&se=274&cl=-1',
 'https://ww

In [13]:
len(team_urls)

120

In [14]:
!pip install selenium
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd

options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

all_results = []
for team_url in team_urls:
    data = requests.get(team_url)

    driver.get(team_url)

    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "lxml")

    matches = soup.select("div.match-fixture__wrapper")

    team_name_mapping = {
        "Nottingham Forest": "Nott'm Forest",
        "Tottenham Hotspur": "Spurs",
        "Manchester City": "Man City",
        "Manchester United": "Man Utd",
        "Newcastle United": "Newcastle",
        "West Ham United": "West Ham",
        "Wolverhampton Wanderers": "Wolves",
        "Brighton and Hove Albion": "Brighton",
        "AFC Bournemouth": "Bournemouth",
        "Sheffield United": "Sheffield Utd",
        "Leeds United": "Leeds",
        "Leicester City": "Leicester",
        "Ipswich Town": "Ipswich",
        "Luton Town": "Luton",
        "Norwich City": "Norwich",
        "West Bromwich Albion": "West Brom",
        "Wigan Athletic": "Wigan",
        "Stoke City": "Stoke",
        "Huddersfield Town": "Huddersfield",
        "Hull City": "Hull",
        "Cardiff City": "Cardiff",
        "Swansea City": "Swansea"
    }

    team_name = team_url.split("/")[5].replace("-", " ")
    team_name = team_name_mapping.get(team_name, team_name)

    results = []

    for match in matches:
        try:
            match_id = match.get("data-matchid")

            date = match.find_previous("div", class_="fixtures__date-content-container").find("time").get("datetime")
            teams = match.select("span.match-fixture__team-name span.match-fixture__short-name")
            scores = match.select_one("span.match-fixture__score").text.strip()

            score_home, score_away = map(int, scores.split("-"))

            team1 = teams[0].text.strip()
            team2 = teams[1].text.strip()

            if team1.lower() in team_name.lower():
                opponent = team2
                goals_for, goals_against = score_home, score_away
            else:
                opponent = team1
                goals_for, goals_against = score_away, score_home

            if goals_for > goals_against:
                result = "Win"
            elif goals_for < goals_against:
                result = "Lose"
            else:
                result = "Draw"

            stadium = match.select_one("span.match-fixture__stadium-name")
            stadium = stadium.text.strip() if stadium else None

            results.append({
                "match_id": match_id,
                "team_name": team_name,
                "date": date,
                "opponent": opponent,
                "goals_for": goals_for,
                "goals_against": goals_against,
                "result": result,
                "stadium": stadium
            })

        except Exception as e:
            continue


    all_results.extend(results)

df = pd.DataFrame(all_results)

print(df)

driver.quit()

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [15]:
for team_url in team_urls:
    data = requests.get(team_url)

In [16]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import pandas as pd

firefox_options = Options()
firefox_options.add_argument("--headless")

driver = webdriver.Firefox(options=firefox_options)

all_match_urls = []

for team_url in team_urls:
    driver.get(team_url)
    time.sleep(5)

    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_pause_time = 2

    for _ in range(15):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = soup.find_all(attrs={'data-href': True})

    match_links = [
        link['data-href'] for link in links if '/match/' in link['data-href']
    ]
    distinct_match_links = list(dict.fromkeys(match_links))

    team_name_mapping = {
        "Nottingham Forest": "Nott'm Forest",
        "Tottenham Hotspur": "Spurs",
        "Manchester City": "Man City",
        "Manchester United": "Man Utd",
        "Newcastle United": "Newcastle",
        "West Ham United": "West Ham",
        "Wolverhampton Wanderers": "Wolves",
        "Brighton and Hove Albion": "Brighton",
        "AFC Bournemouth": "Bournemouth",
        "Sheffield United": "Sheffield Utd",
        "Leeds United": "Leeds",
        "Leicester City": "Leicester",
        "Ipswich Town": "Ipswich",
        "Luton Town": "Luton",
        "Norwich City": "Norwich",
        "West Bromwich Albion": "West Brom",
        "Wigan Athletic": "Wigan",
        "Stoke City": "Stoke",
        "Huddersfield Town": "Huddersfield",
        "Hull City": "Hull",
        "Cardiff City": "Cardiff",
        "Swansea City": "Swansea"
    }

    team_name = team_url.split("/")[5].replace("-", " ")
    team_name = team_name_mapping.get(team_name, team_name)

    for url in distinct_match_links:
        full_url = "https:" + url
        all_match_urls.append({
            "team_name": team_name,
            "match_url": full_url
        })

df_match_url = pd.DataFrame(all_match_urls)
print(df_match_url)

driver.quit()

        team_name                                   match_url
0     Bournemouth  https://www.premierleague.com/match/116177
1     Bournemouth  https://www.premierleague.com/match/116167
2     Bournemouth  https://www.premierleague.com/match/116158
3     Bournemouth  https://www.premierleague.com/match/116149
4     Bournemouth  https://www.premierleague.com/match/116138
...           ...                                         ...
4163    Brentford   https://www.premierleague.com/match/66391
4164    Brentford   https://www.premierleague.com/match/66373
4165    Brentford   https://www.premierleague.com/match/66362
4166    Brentford   https://www.premierleague.com/match/66355
4167    Brentford   https://www.premierleague.com/match/66342

[4168 rows x 2 columns]


In [17]:
df_match_url["match_id"] = df_match_url["match_url"].str.extract(r'/match/(\d+)')

print(df_match_url)

        team_name                                   match_url match_id
0     Bournemouth  https://www.premierleague.com/match/116177   116177
1     Bournemouth  https://www.premierleague.com/match/116167   116167
2     Bournemouth  https://www.premierleague.com/match/116158   116158
3     Bournemouth  https://www.premierleague.com/match/116149   116149
4     Bournemouth  https://www.premierleague.com/match/116138   116138
...           ...                                         ...      ...
4163    Brentford   https://www.premierleague.com/match/66391    66391
4164    Brentford   https://www.premierleague.com/match/66373    66373
4165    Brentford   https://www.premierleague.com/match/66362    66362
4166    Brentford   https://www.premierleague.com/match/66355    66355
4167    Brentford   https://www.premierleague.com/match/66342    66342

[4168 rows x 3 columns]


In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import traceback

def scrape_match_stats(row):
    url = row["match_url"]
    team_name = row["team_name"]

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 15)

    try:
        driver.get(url)

        try:
            accept_btn = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "btn-primary")))
            accept_btn.click()
        except:
            pass

        try:
            stats_tab = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//li[@role='tab' and normalize-space()='Stats']")
            ))
            driver.execute_script("arguments[0].click();", stats_tab)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody.matchCentreStatsContainer")))
        except Exception as e:
            print("Failed to click 'Stats' tab:", e)
            return None

        soup = BeautifulSoup(driver.page_source, "html.parser")

        attendance, referee = "", ""

        try:
            info_divs = soup.find_all("div", class_="mc-summary__info")
            for div in info_divs:
                text = div.get_text(strip=True)
                if "Att:" in text:
                    attendance = text.replace("Att:", "").strip()
                elif "Ref:" in text:
                    referee = text.replace("Ref:", "").strip()
        except:
            pass


        headers = soup.select("table thead th a")
        if len(headers) >= 2:
            team_left = headers[0].get_text(strip=True)
            team_right = headers[1].get_text(strip=True)
        else:
            return None

        is_left = None
        if team_left.lower() in team_name.lower():
            is_left = True
        elif team_right.lower() in team_name.lower():
            is_left = False
        else:
            return None

        stat_rows = soup.select("tbody.matchCentreStatsContainer tr")
        stat_dict = {
            "Match": f"{team_left} vs {team_right}",
            "Opponent": team_left if is_left else team_right,
            "Side": "Home" if is_left else "Away",
            "Match URL": url,
            "Attendance": attendance,
            "Referee": referee
        }

        for row in stat_rows:
            try:
                stat_name = row.select_one("td:nth-of-type(2)").get_text(strip=True)
                val_left = row.select_one("td:nth-of-type(1) p").get_text(strip=True)
                val_right = row.select_one("td:nth-of-type(3) p").get_text(strip=True)
                stat_value = val_left if is_left else val_right
                stat_dict[stat_name] = stat_value
            except:
                continue

        return stat_dict

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        traceback.print_exc()
        return None

    finally:
        driver.quit()

MAX_WORKERS = 4

print(f"Starting with {MAX_WORKERS} parallel threads...")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    results = list(executor.map(scrape_match_stats, [row for _, row in df_match_url.iterrows()]))

clean_results = [r for r in results if r is not None]
stats = pd.DataFrame(clean_results)

print(stats)

Starting with 4 parallel threads...
Failed to click 'Stats' tab: Message: 
Stacktrace:
#0 0x56ec38d6475a <unknown>
#1 0x56ec388070a0 <unknown>
#2 0x56ec388589b0 <unknown>
#3 0x56ec38858ba1 <unknown>
#4 0x56ec388a6ea4 <unknown>
#5 0x56ec3887e3cd <unknown>
#6 0x56ec388a42a0 <unknown>
#7 0x56ec3887e173 <unknown>
#8 0x56ec3884ad4b <unknown>
#9 0x56ec3884b9b1 <unknown>
#10 0x56ec38d2990b <unknown>
#11 0x56ec38d2d80a <unknown>
#12 0x56ec38d11662 <unknown>
#13 0x56ec38d2e394 <unknown>
#14 0x56ec38cf649f <unknown>
#15 0x56ec38d52538 <unknown>
#16 0x56ec38d52716 <unknown>
#17 0x56ec38d635c6 <unknown>
#18 0x7dcf3f3f3ac3 <unknown>

Failed to click 'Stats' tab: Message: 
Stacktrace:
#0 0x5ad2d413675a <unknown>
#1 0x5ad2d3bd90a0 <unknown>
#2 0x5ad2d3c2a9b0 <unknown>
#3 0x5ad2d3c2aba1 <unknown>
#4 0x5ad2d3c78ea4 <unknown>
#5 0x5ad2d3c503cd <unknown>
#6 0x5ad2d3c762a0 <unknown>
#7 0x5ad2d3c50173 <unknown>
#8 0x5ad2d3c1cd4b <unknown>
#9 0x5ad2d3c1d9b1 <unknown>
#10 0x5ad2d40fb90b <unknown>
#11 0x5ad2d

In [19]:
stats["match_id"] = stats["Match URL"].str.extract(r'/match/(\d+)')
print(stats)

                              Match     Opponent  Side  \
0        Bournemouth vs Aston Villa  Bournemouth  Home   
1            Arsenal vs Bournemouth  Bournemouth  Away   
2            Bournemouth vs Man Utd  Bournemouth  Home   
3     Crystal Palace vs Bournemouth  Bournemouth  Away   
4             Bournemouth vs Fulham  Bournemouth  Home   
...                             ...          ...   ...   
4159            Wolves vs Brentford    Brentford  Away   
4160          Brentford vs Brighton    Brentford  Home   
4161       Aston Villa vs Brentford    Brentford  Away   
4162    Crystal Palace vs Brentford    Brentford  Away   
4163           Brentford vs Arsenal    Brentford  Home   

                                       Match URL Attendance          Referee  \
0     https://www.premierleague.com/match/116177     11,248   Stuart Attwell   
1     https://www.premierleague.com/match/116167     60,110   Jarred Gillett   
2     https://www.premierleague.com/match/116158     11,241    

In [20]:
stats = stats.fillna(0)
print(stats)

                              Match     Opponent  Side  \
0        Bournemouth vs Aston Villa  Bournemouth  Home   
1            Arsenal vs Bournemouth  Bournemouth  Away   
2            Bournemouth vs Man Utd  Bournemouth  Home   
3     Crystal Palace vs Bournemouth  Bournemouth  Away   
4             Bournemouth vs Fulham  Bournemouth  Home   
...                             ...          ...   ...   
4159            Wolves vs Brentford    Brentford  Away   
4160          Brentford vs Brighton    Brentford  Home   
4161       Aston Villa vs Brentford    Brentford  Away   
4162    Crystal Palace vs Brentford    Brentford  Away   
4163           Brentford vs Arsenal    Brentford  Home   

                                       Match URL Attendance          Referee  \
0     https://www.premierleague.com/match/116177     11,248   Stuart Attwell   
1     https://www.premierleague.com/match/116167     60,110   Jarred Gillett   
2     https://www.premierleague.com/match/116158     11,241    

In [21]:
print(f"df rows: {len(df)}, stats rows: {len(stats)}")

df rows: 3912, stats rows: 4164


In [23]:
stats = stats.drop_duplicates(subset = "match_id")
stats = stats.drop(columns = ["Opponent"])

merged_df = pd.merge(df, stats, on = ["match_id"], how = "inner", suffixes = ("_summary", "_stats"))

print(merged_df)

     match_id    team_name                        date        opponent  \
0      116177  Bournemouth        Saturday 10 May 2025     Aston Villa   
1      116167  Bournemouth         Saturday 3 May 2025         Arsenal   
2      116158  Bournemouth        Sunday 27 April 2025         Man Utd   
3      116149  Bournemouth      Saturday 19 April 2025  Crystal Palace   
4      116138  Bournemouth        Monday 14 April 2025          Fulham   
...       ...          ...                         ...             ...   
3906    66391    Brentford  Saturday 18 September 2021          Wolves   
3907    66373    Brentford  Saturday 11 September 2021        Brighton   
3908    66362    Brentford     Saturday 28 August 2021     Aston Villa   
3909    66355    Brentford     Saturday 21 August 2021  Crystal Palace   
3910    66342    Brentford       Friday 13 August 2021         Arsenal   

      goals_for  goals_against result                             stadium  \
0             0              1   L

In [24]:
merged_df.to_csv("premier_league_data.csv", index = False)