In [None]:
import os
import time
from getpass import getuser
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:

def get_team_links(url, driver):
    """Collect the squad page URLs linked from the first stats table on a page.

    Args:
        url: Address of the page to open in the browser.
        driver: Selenium WebDriver used to load the page and expose its source.

    Returns:
        list[str]: ``https://fbref.com``-prefixed URLs, one per distinct link
        whose ``href`` contains ``/squads/``, in the order they appear in the
        table. Empty when the parsed page has no ``table.stats_table``.

    Raises:
        Exception: Whatever ``WebDriverWait.until`` raised when no element
            with class ``stats_table`` was located within 20 seconds. The
            page source is printed before the exception is re-raised.
    """
    driver.get(url)

    # Fixed pause before polling, to give JavaScript rendering time to run.
    time.sleep(5)

    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "stats_table"))
        )
    except Exception:
        print("Failed to locate the stats table. Printing page source...")
        print(driver.page_source)
        raise

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    tables = soup.select('table.stats_table')
    if not tables:
        print("Stats table not found in the page source.")
        return []

    hrefs = (anchor.get("href") for anchor in tables[0].find_all('a'))
    # Anchors without an href yield None; skip them rather than letting the
    # substring check raise TypeError.
    squad_paths = [href for href in hrefs if href and '/squads/' in href]

    # dict.fromkeys drops repeated links while keeping first-seen order, so a
    # team linked twice in the table is only scraped once.
    return [f"https://fbref.com{path}" for path in dict.fromkeys(squad_paths)]

In [None]:
def scrape_scores_and_fixtures(team_url, session, max_retries=3, retry_delay=10, timeout=30):
    """Download a team page and return its "Scores & Fixtures" table.

    Args:
        team_url: URL of the team page to fetch.
        session: Object with a ``get(url, timeout=...)`` method returning a
            response that exposes ``status_code`` and ``text`` (for example a
            ``requests.Session``).
        max_retries: Maximum number of attempts made when the server answers
            429 or the request itself fails.
        retry_delay: Seconds to wait between attempts.
        timeout: Seconds passed to ``session.get`` so a stalled connection
            cannot block the scrape indefinitely.

    Returns:
        pandas.DataFrame or None: The first table matching
        "Scores & Fixtures", or ``None`` when the page has no such table, the
        server returns a status other than 200/429, or every attempt was
        rate limited or failed.
    """
    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt == max_retries

        try:
            response = session.get(team_url, timeout=timeout)
        except OSError as exc:
            # requests.RequestException derives from IOError (OSError), so this
            # covers connection errors and timeouts raised by the session.
            print(f"Request error for {team_url}: {exc}")
            if not is_last_attempt:
                time.sleep(retry_delay)
            continue

        if response.status_code == 200:
            try:
                # Wrap the markup in StringIO: newer pandas deprecates passing
                # literal HTML strings to read_html.
                return pd.read_html(StringIO(response.text), match="Scores & Fixtures")[0]
            except ValueError:
                print(f"No Scores & Fixtures table found for {team_url}")
                return None

        if response.status_code == 429:
            print(f"Rate limited for {team_url}. Retrying...")
            # No point sleeping after the final attempt; nothing follows it.
            if not is_last_attempt:
                time.sleep(retry_delay)
            continue

        print(f"Failed to retrieve {team_url} (status code: {response.status_code})")
        return None

    return None


In [4]:
def main(chrome_driver_path=None, save_path=None, season_urls=None):
    """Scrape each team's Scores & Fixtures table per season and save one Excel file.

    For every season URL, the team links are read with Selenium via
    ``get_team_links``; each team page is then fetched with a shared
    ``requests`` session via ``scrape_scores_and_fixtures``. Every returned
    table gets a ``team_url`` and a ``season`` column before all tables are
    concatenated and written with ``DataFrame.to_excel``.

    Args:
        chrome_driver_path: Path to the chromedriver executable. Defaults to
            ``chromedriver.exe`` in the current user's Downloads folder.
        save_path: Destination of the Excel file. Defaults to
            ``data\\games.xlsx`` inside the current user's
            ``dream-team-fpl-prediction`` checkout under Documents\\GitHub.
        season_urls: Iterable of season pages to process. Defaults to the
            2024-2025, 2023-2024 and 2022-2023 Premier League pages.
    """
    if chrome_driver_path is None or save_path is None:
        # Resolve the Windows user name only when a default path is needed.
        user = getuser()
        if chrome_driver_path is None:
            chrome_driver_path = f"C:\\Users\\{user}\\Downloads\\chromedriver.exe"
        if save_path is None:
            save_path = f"C:\\Users\\{user}\\Documents\\GitHub\\dream-team-fpl-prediction\\data\\games.xlsx"

    if season_urls is None:
        season_urls = [
            "https://fbref.com/en/comps/9/2024-2025/Premier-League-Stats",  # Current season
            "https://fbref.com/en/comps/9/2023-2024/Premier-League-Stats",  # Previous season
            "https://fbref.com/en/comps/9/2022-2023/Premier-League-Stats"   # Two seasons ago
        ]

    # Set up Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in headless mode
    chrome_options.add_argument('--disable-gpu')  # Disable GPU acceleration
    chrome_options.add_argument('--no-sandbox')  # Bypass OS-level security
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)

    try:
        all_matches = []

        # The context manager closes the session's pooled connections even
        # when a request or parse step raises.
        with requests.Session() as session:
            for season_url in season_urls:
                print(f"Processing season: {season_url}")
                season = season_url.split('/')[-2]  # Extract season from URL
                team_urls = get_team_links(season_url, driver)

                for team_url in team_urls:
                    print(f"Scraping matches from {team_url}...")
                    matches = scrape_scores_and_fixtures(team_url, session)
                    if matches is not None:
                        matches['team_url'] = team_url  # Add the team URL for reference
                        matches['season'] = season
                        all_matches.append(matches)
                    time.sleep(5)  # Wait 5 seconds between requests

        # Combine all match data into a single DataFrame
        if all_matches:
            all_matches_df = pd.concat(all_matches, ignore_index=True)
            # Create the target folder first so a missing directory does not
            # discard the results of the whole scrape.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            all_matches_df.to_excel(save_path, index=False)
            print(f"Data saved to {save_path}")
        else:
            print("No matches found for any team.")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()


Processing season: https://fbref.com/en/comps/9/2024-2025/Premier-League-Stats
Scraping matches from https://fbref.com/en/squads/822bd0ba/Liverpool-Stats...
Scraping matches from https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats...
Scraping matches from https://fbref.com/en/squads/18bb7c10/Arsenal-Stats...
Scraping matches from https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats...
Scraping matches from https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats...
Scraping matches from https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats...
Scraping matches from https://fbref.com/en/squads/8602292d/Aston-Villa-Stats...
Scraping matches from https://fbref.com/en/squads/fd962109/Fulham-Stats...
Scraping matches from https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats...
Scraping matches from https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats...
Scraping matches from https://fbref.com/en/squads/cd051869/Brentford-Stats...
Scraping matches from https:/