In [1]:
import requests
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_flashscore_with_selenium():
    """
    Scrape Flashscore football data using Selenium to handle JavaScript-loaded content
    """
    print("Setting up Chrome driver...")
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
    
    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    try:
        # Navigate to Flashscore
        print("Navigating to Flashscore...")
        driver.get("https://www.flashscore.com/football/")
        
        # Wait for the content to load
        print("Waiting for content to load...")
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "sportName"))
        )
        
        # Give it some extra time for all data to load
        time.sleep(5)
        
        # Extract data from the page
        print("Extracting match data...")
        
        # Get network data from browser
        print("Getting network data...")
        # This doesn't directly access network data in pure selenium,
        # but we'll look for data in the page source
        
        # Extract match data from the DOM
        matches = []
        match_elements = driver.find_elements(By.CSS_SELECTOR, ".event__match")
        
        print(f"Found {len(match_elements)} match elements")
        
        for match in match_elements:
            try:
                match_id = match.get_attribute("id")
                
                home_team_elem = match.find_element(By.CSS_SELECTOR, ".event__participant--home")
                away_team_elem = match.find_element(By.CSS_SELECTOR, ".event__participant--away")
                
                home_team = home_team_elem.text if home_team_elem else "Unknown"
                away_team = away_team_elem.text if away_team_elem else "Unknown"
                
                # Score might not be available for future matches
                try:
                    home_score = match.find_element(By.CSS_SELECTOR, ".event__score--home").text
                    away_score = match.find_element(By.CSS_SELECTOR, ".event__score--away").text
                except:
                    home_score = "N/A"
                    away_score = "N/A"
                
                # Try to get match time
                try:
                    match_time = match.find_element(By.CSS_SELECTOR, ".event__time").text
                except:
                    match_time = "N/A"
                
                # Try to get match status
                try:
                    match_status = match.find_element(By.CSS_SELECTOR, ".event__stage").text
                except:
                    match_status = "N/A"
                
                # Try to get league information
                try:
                    # Go up to find the containing league section
                    # This is a bit tricky and might need adjustment based on the actual HTML structure
                    parent_section = driver.execute_script("""
                        var element = arguments[0];
                        while(element && !element.classList.contains('sportName')) {
                            element = element.parentElement;
                        }
                        return element;
                    """, match)
                    
                    if parent_section:
                        league_elem = parent_section.find_element(By.CSS_SELECTOR, ".event__title")
                        league = league_elem.text if league_elem else "Unknown"
                    else:
                        league = "Unknown"
                except:
                    league = "Unknown"
                
                match_data = {
                    "id": match_id,
                    "homeTeam": home_team,
                    "awayTeam": away_team,
                    "homeScore": home_score,
                    "awayScore": away_score,
                    "time": match_time,
                    "status": match_status,
                    "league": league
                }
                
                matches.append(match_data)
                
            except Exception as e:
                print(f"Error extracting match data: {e}")
        
        # Try to find any JSON data in the page
        print("Looking for JSON data in the page...")
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        
        # Look for script tags that might contain JSON data
        json_data = []
        for script in soup.find_all('script'):
            script_content = script.string
            if script_content:
                # Look for potential JSON objects in the script
                if '{"' in script_content and '":{' in script_content:
                    try:
                        # Extract content between curly braces
                        start_idx = script_content.find('{"')
                        if start_idx >= 0:
                            content = script_content[start_idx:]
                            # Find balanced closing brace
                            open_braces = 1
                            for i, char in enumerate(content[2:], 2):
                                if char == '{':
                                    open_braces += 1
                                elif char == '}':
                                    open_braces -= 1
                                if open_braces == 0:
                                    end_idx = i
                                    break
                            
                            if open_braces == 0:
                                json_str = content[:end_idx+1]
                                try:
                                    data = json.loads(json_str)
                                    json_data.append(data)
                                    print(f"Found JSON data in script tag: {json_str[:100]}...")
                                except json.JSONDecodeError:
                                    pass
                    except:
                        pass
        
        # Save the collected data
        with open('flashscore_matches.json', 'w', encoding='utf-8') as f:
            json.dump(matches, f, ensure_ascii=False, indent=2)
            print(f"Saved {len(matches)} matches to flashscore_matches.json")
        
        if json_data:
            with open('flashscore_json_data.json', 'w', encoding='utf-8') as f:
                json.dump(json_data, f, ensure_ascii=False, indent=2)
                print(f"Saved {len(json_data)} JSON objects to flashscore_json_data.json")
        
        return matches, json_data
        
    except Exception as e:
        print(f"Error: {e}")
    finally:
        driver.quit()
        print("Browser closed.")

def get_flashscore_api_data():
    """
    Try to access Flashscore API directly if available
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.flashscore.com/football/',
        'Origin': 'https://www.flashscore.com'
    }
    
    # Try different API endpoints (these are guesses and might need adjustment)
    api_endpoints = [
        'https://www.flashscore.com/football/matches/live/',
        'https://d.flashscore.com/x/feed/f_1_0_3_en-us_1',  # This format is often used by score sites
        'https://api.flashscore.com/v1/tournaments',  # Hypothetical API endpoint
    ]
    
    for endpoint in api_endpoints:
        try:
            print(f"Trying API endpoint: {endpoint}")
            response = requests.get(endpoint, headers=headers)
            if response.status_code == 200:
                try:
                    data = response.json()
                    print(f"Successfully got JSON data from {endpoint}")
                    with open(f'flashscore_api_{endpoint.split("/")[-1]}.json', 'w') as f:
                        json.dump(data, f, indent=2)
                    return data
                except:
                    print(f"Response from {endpoint} is not JSON")
        except Exception as e:
            print(f"Error accessing {endpoint}: {e}")
    
    print("Could not find a working API endpoint")
    return None

if __name__ == "__main__":
    print("Starting Flashscore scraper...")
    
    # First, try the Selenium approach (most reliable)
    matches, json_data = scrape_flashscore_with_selenium()
    
    # Then try direct API access (may not work)
    api_data = get_flashscore_api_data()
    
    print("Scraping complete!")
    
    if matches:
        print(f"Successfully scraped {len(matches)} matches")
    else:
        print("No matches were scraped")
    
    if json_data:
        print(f"Found {len(json_data)} JSON objects in the page")
    else:
        print("No JSON data found in the page")
    
    if api_data:
        print("Successfully accessed Flashscore API")
    else:
        print("Could not access Flashscore API directly")

ModuleNotFoundError: No module named 'webdriver_manager'

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_flashscore_headers():
    # Define the URL for Flashscore soccer section
    url = "https://www.flashscore.com/football/"
    
    # Set headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    
    try:
        # Send HTTP request
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise exception for bad status codes
        
        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Target the menuTop__items container
        menu_container = soup.select_one("body > nav > div > div.menuTop__items")
        if not menu_container:
            print("Menu container not found")
            return
        
        # Find all header elements (assuming <a> tags within menuTop__items)
        header_elements = menu_container.find_all('a', class_='menuTop__item')
        
        if not header_elements:
            print("No header elements found in menu container")
            return
        
        # Extract and print details of each header element
        for index, header in enumerate(header_elements, 1):
            text = header.get_text(strip=True)
            link = header.get('href', 'No link found')
            is_active = 'menuTop__item--active' in header.get('class', [])
            
            print(f"Header {index}:")
            print(f"Text: {text}")
            print(f"Link: {link}")
            print(f"Active: {is_active}")
            print("-" * 50)
        
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
    except Exception as e:
        print(f"Error parsing data: {e}")

if __name__ == "__main__":
    scrape_flashscore_headers()

Header 1:
Text: Football
Link: /
Active: True
--------------------------------------------------
Header 2:
Text: Tennis
Link: /tennis/
Active: False
--------------------------------------------------
Header 3:
Text: Basketball
Link: /basketball/
Active: False
--------------------------------------------------
Header 4:
Text: Hockey
Link: /hockey/
Active: False
--------------------------------------------------
Header 5:
Text: Golf
Link: /golf/
Active: False
--------------------------------------------------
Header 6:
Text: Snooker
Link: /snooker/
Active: False
--------------------------------------------------
Header 7:
Text: Baseball
Link: /baseball/
Active: False
--------------------------------------------------
Header 8:
Text: Volleyball
Link: /volleyball/
Active: False
--------------------------------------------------


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import uuid
import time

def scrape_flashscore_matches():
    # Set up Selenium with headless Chrome
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        
        # Load the main page to verify football link
        base_url = "https://www.flashscore.com/"
        driver.get(base_url)
        
        # Wait for the active football menu item
        football_menu = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body > nav > div > div.menuTop__items > a.menuTop__item--active"))
        )
        football_link = football_menu.get_attribute('href') or '/soccer/'
        football_url = football_link if football_link.startswith('http') else base_url.rstrip('/') + football_link
        print(f"Football URL: {football_url}")
        
        # Load the football page
        driver.get(football_url)
        
        # Wait for the soccer container or match elements
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.sportName.soccer, div.container__livetable"))
        )
        
        # Parse page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Target the soccer container
        soccer_container = soup.find('div', class_='sportName soccer')
        if not soccer_container:
            print("Soccer container not found. Trying alternative container...")
            soccer_container = soup.find('div', class_='container__livetable')
            if not soccer_container:
                print("Alternative container not found. Page structure may have changed.")
                return
        
        # Find all match elements
        matches = soccer_container.find_all('div', class_='event__match')
        if not matches:
            print("No matches found. The page may use different class names or require further JavaScript rendering.")
            return
        
        # Extract league information
        current_league = None
        league_headers = soccer_container.find_all('div', class_='wclLeagueHeader')
        
        for match in matches:
            # Update league if a new header is encountered
            parent = match.find_parent()
            while parent and parent != soccer_container:
                header = parent.find('div', class_='wclLeagueHeader')
                if header:
                    league_info = header.find('a', class_='wcl-link')
                    current_league = league_info.get_text(strip=True) if league_info else "Unknown League"
                    break
                parent = parent.find_parent()
            
            # Extract match details
            home_team = match.find('div', class_='event__homeParticipant')
            away_team = match.find('div', class_='event__awayParticipant')
            time = match.find('div', class_='event__time')
            
            home_team_name = home_team.get_text(strip=True) if home_team else "N/A"
            away_team_name = away_team.get_text(strip=True) if away_team else "N/A"
            match_time = time.get_text(strip=True) if time else "N/A"
            
            # Get match link if available
            match_link = match.find('a', class_='eventRowLink')
            match_url = match_link.get('href', 'No link') if match_link else "No link"
            
            # Generate unique ID for each match
            match_id = str(uuid.uuid4())
            
            print(f"Match ID: {match_id}")
            print(f"League Header: {league_headers}")
            print(f"League: {current_league}")
            print(f"Home Team: {home_team_name}")
            print(f"Away Team: {away_team_name}")
            print(f"Time/Status: {match_time}")
            print(f"Match URL: {match_url}")
            print("-" * 50)
        
    except webdriver.exceptions.TimeoutException as e:
        print(f"Timeout waiting for page elements: {e}")
        print("The page may be slow to load or elements may have changed.")
    except Exception as e:
        print(f"Error scraping data: {e}")
        print("Ensure Selenium and ChromeDriver are installed and the page structure hasn't changed.")
    finally:
        if driver:
            driver.quit()

if __name__ == "__main__":
    scrape_flashscore_matches()

Football URL: https://www.flashscore.com/
Match ID: 856e04bf-2997-4061-a969-1af7e615ce5e
League Header: [<div class="wcl-header_uBhYi wcl-pinned_WU5N6 wclLeagueHeader wclLeagueHeader--collapsed" data-pinned="true" data-testid="wcl-headerLeague"><div class="wizard__relativeWrapper"><div class="wcl-trigger_YhU1j" data-state="closed"><button aria-label="Add all games to Favorites" class="wcl-favorite_SY6el wcl-favoriteL_y075f wcl-favoriteTertiary_xRLC3" data-testid="wcl-favorite-inactive"><svg class="wcl-icon_RJsNN wcl-size18_CZS-F" data-testid="wcl-icon-action-state-favorite" fill="currentColor" viewbox="0 0 20 20"><path d="m9.35 0-2.1 6.85L.4 6.83 0 8.14l5.54 4.21-2.13 6.84 1.06.81L10 15.76 15.53 20l1.05-.8-2.12-6.85L20 8.15l-.4-1.32-6.85.02L10.65 0h-1.3ZM8.4 7.8 10 2.57l1.6 5.21.66.5h5.2l-4.22 3.2-.25.81 1.63 5.21-4.22-3.23h-.8l-4.22 3.23 1.63-5.2-.26-.82-4.22-3.2h5.21l.66-.5Z" fill-rule="evenodd"></path></svg></button></div></div><div class="event__title"><div class="event__titleInfo"

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import uuid

def scrape_match_stats(url):
    # Set up Selenium with headless Chrome
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        
        # Load the match statistics page
        driver.get(url)
        
        # Wait for the statistics section to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.wcl-row_OFViZ"))
        )
        
        # Parse page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all statistic sections
        sections = soup.find_all('div', class_='section')
        if not sections:
            print("No statistics sections found. Page structure may have changed.")
            return
        
        # Generate unique ID for the match
        match_id = str(uuid.uuid4())
        print(f"Match ID: {match_id}")
        print(f"URL: {url}")
        print("=" * 50)
        
        for section in sections:
            # Get section title
            section_header = section.find('div', class_='sectionHeader')
            section_title = section_header.get_text(strip=True) if section_header else "Unknown Section"
            print(f"\nSection: {section_title}")
            print("-" * 30)
            
            # Find all statistic rows in the section
            stat_rows = section.find_all('div', class_='wcl-row_OFViZ')
            for row in stat_rows:
                # Extract category (metric name)
                category = row.find('div', class_='wcl-category_7qsgP')
                metric_name = category.find('strong').get_text(strip=True) if category and category.find('strong') else "N/A"
                
                # Extract home and away values
                home_value_div = row.find('div', class_='wcl-homeValue_-iJBW')
                away_value_div = row.find('div', class_='wcl-awayValue_rQvxs')
                
                # Get all text within the value divs, including additional details (e.g., pass ratios)
                home_value = home_value_div.get_text(strip=True) if home_value_div else "N/A"
                away_value = away_value_div.get_text(strip=True) if away_value_div else "N/A"
                
                print(f"Metric: {metric_name}")
                print(f"Home: {home_value}")
                print(f"Away: {away_value}")
                print("-" * 30)
        
    except webdriver.exceptions.TimeoutException as e:
        print(f"Timeout waiting for statistics elements: {e}")
        print("The page may be slow to load or elements may have changed.")
    except Exception as e:
        print(f"Error scraping data: {e}")
        print("Ensure Selenium and ChromeDriver are installed and the page structure hasn't changed.")
    finally:
        if driver:
            driver.quit()

if __name__ == "__main__":
    match_url = "https://www.flashscoreusa.com/game/soccer/tQUgk7EL/#/game-summary/game-statistics/0"
    scrape_match_stats(match_url)

Match ID: 63f06434-48bb-44a9-b643-9ef42011c6dd
URL: https://www.flashscoreusa.com/game/soccer/tQUgk7EL/#/game-summary/game-statistics/0

Section: Top stats
------------------------------
Metric: Expected goals (xG)
Home: 2.91
Away: 1.17
------------------------------
Metric: Ball possession
Home: 35%
Away: 65%
------------------------------
Metric: Total shots
Home: 17
Away: 11
------------------------------
Metric: Shots on target
Home: 7
Away: 2
------------------------------
Metric: Big chances
Home: 5
Away: 3
------------------------------
Metric: Corner kicks
Home: 3
Away: 6
------------------------------
Metric: Passes
Home: 84%(292/346)
Away: 88%(549/622)
------------------------------
Metric: Yellow cards
Home: 2
Away: 2
------------------------------

Section: Shots
------------------------------
Metric: Expected goals (xG)
Home: 2.91
Away: 1.17
------------------------------
Metric: xG on target (xGOT)
Home: 3.04
Away: 0.23
------------------------------
Metric: Total shots
H

In [1]:
import logging
import time
import re
import pandas as pd
from typing import List, Dict, Union
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException

# Logging setup
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

# Base URL for Flashscore
BASE_URL = "https://www.flashscore.com"

# League configuration (subset from repository's countries_leagues)
COUNTRIES_LEAGUES = {'England': 'Premier League'}

def setup_driver() -> webdriver.Chrome:
    """Set up Selenium Chrome driver with headless options."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def execute_script_click(driver: webdriver.Chrome, element) -> None:
    """Execute JavaScript click on an element."""
    driver.execute_script('arguments[0].click();', element)

def click_league(driver: webdriver.Chrome, country: str, league_name: str) -> str:
    """
    Navigate to the league page by clicking country and league in the sidebar.
    Returns the league page URL.
    """
    try:
        driver.get(BASE_URL + "/football/")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'lc'))
        )
        
        # Click "More" button if present
        try:
            more_countries = driver.find_element(By.CLASS_NAME, 'show-more')
            more_button = more_countries.find_element(By.LINK_TEXT, 'More')
            execute_script_click(driver, more_button)
            time.sleep(2)
        except:
            pass
        
        # Find and click country
        left_panel = driver.find_element(By.ID, 'lc')
        countries_menus = left_panel.find_elements(By.CLASS_NAME, 'mbox0px')
        for menu in countries_menus:
            try:
                country_link = menu.find_element(By.LINK_TEXT, country)
                execute_script_click(driver, country_link)
                time.sleep(2)
                break
            except:
                continue
        
        # Find and click league
        league_link = left_panel.find_element(By.LINK_TEXT, country).find_element(By.XPATH, '..').find_element(
            By.CLASS_NAME, 'submenu').find_element(By.LINK_TEXT, league_name)
        league_url = league_link.get_attribute('href')
        driver.get(league_url)
        return league_url
    
    except WebDriverException as e:
        logger.error(f"Error navigating to league {league_name} for {country}: {e}")
        return ""

def get_match_urls(league_url: str) -> List[Dict[str, Union[str]]]:
    """
    Fetch all match URLs from a league results page.
    """
    retries = 3
    wait_time = 15
    driver = None
    
    for attempt in range(retries):
        try:
            driver = setup_driver()
            driver.get(league_url)
            
            # Click Results tab
            results_tab = driver.find_element(By.LINK_TEXT, 'Results')
            execute_script_click(driver, results_tab)
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'event__more'))
            )
            
            # Click "Show more matches" until no more are available
            while True:
                try:
                    more_button = driver.find_element(By.CLASS_NAME, 'event__more')
                    execute_script_click(driver, more_button)
                    time.sleep(3)
                except (StaleElementReferenceException, WebDriverException):
                    break
            
            # Parse page source
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            match_divs = soup.find_all('div', class_='event__match')
            
            matches = []
            for div in match_divs:
                match_id = div.get('id', '').replace('g_1_', '')
                if match_id:
                    match_url = f"{BASE_URL}/match/{match_id}/#/match-summary"
                    matches.append({"match_id": match_id, "match_url": match_url})
            
            if not matches:
                logger.warning(f"No match URLs found on {league_url}. Page structure may have changed.")
            
            return matches
        
        except (WebDriverException, TimeoutException) as e:
            logger.error(f"Attempt {attempt + 1} failed while fetching match URLs: {e}")
            if attempt < retries - 1:
                logger.info("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                logger.error("All retries failed.")
                return []
        
        finally:
            if driver:
                driver.quit()

def extract_match_data(match: Dict[str, Union[str]], season_name: str) -> Dict[str, Union[str, Dict]]:
    """
    Extract match details (teams, goals, stats, metadata) from a match page.
    """
    driver = None
    match_id = match.get("match_id", "")
    match_url = match.get("match_url", "")
    
    try:
        driver = setup_driver()
        driver.get(match_url)
        
        # Wait for match header
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.header"))
        )
        
        # Switch to statistics tab
        stats_url = match_url.replace("match-summary", "match-summary/match-statistics")
        driver.get(stats_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.section"))
        )
        
        # Parse page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Extract metadata
        header = soup.select_one("div.headerStrip")
        league_info = header.select_one("div.fleft").get_text(strip=True) if header else "Unknown"
        match_round = league_info.split(" - Round ")[-1] if " - Round " in league_info else "N/A"
        match_date = header.select_one("div.mstat-date").get_text(strip=True) if header.select_one("div.mstat-date") else "N/A"
        
        # Adjust date with season year
        date_time = match_date
        date = date_time.split(' ')[0]
        season_years = season_name.split('/')
        if len(season_years) == 1:
            full_date = f"{date}{season_years[0]}"
        else:
            date_month = int(date.split('.')[1]) if '.' in date else 1
            full_date = f"{date}{season_years[0]}" if date_month < 7 else f"{date}{season_years[1]}"
        
        # Extract teams
        home_team = soup.select_one("div.tname-home a").get_text(strip=True) if soup.select_one("div.tname-home a") else "N/A"
        away_team = soup.select_one("div.tname-away a").get_text(strip=True) if soup.select_one("div.tname-away a") else "N/A"
        
        # Extract scores
        score_elem = soup.select_one("div.detailScore__wrapper")
        if score_elem:
            scores = score_elem.get_text(strip=True).replace(' ', '').split('-')
            home_score = scores[0] if len(scores) > 1 else "N/A"
            away_score = scores[1] if len(scores) > 1 else "N/A"
        else:
            home_score = away_score = "N/A"
        
        # Extract statistics
        stats = {}
        sections = soup.find_all('div', class_='section')
        for section in sections:
            section_header = section.find('div', class_='sectionHeader')
            section_title = section_header.get_text(strip=True) if section_header else "Unknown"
            stat_rows = section.find_all('div', class_='statRow')
            section_stats = []
            for row in stat_rows:
                metric = row.find('div', class_='statText--titleValue').get_text(strip=True) if row.find('div', class_='statText--titleValue') else "N/A"
                home_value = row.find('div', class_='statText--homeValue').get_text(strip=True) if row.find('div', class_='statText--homeValue') else "N/A"
                away_value = row.find('div', class_='statText--awayValue').get_text(strip=True) if row.find('div', class_='statText--awayValue') else "N/A"
                section_stats.append({
                    "metric": metric,
                    "home_value": home_value,
                    "away_value": away_value
                })
            stats[section_title] = section_stats
        
        return {
            "match_id": match_id,
            "match_url": match_url,
            "league": league_info.split(":")[1].split(" - ")[0].strip() if ":" in league_info else "N/A",
            "round": match_round,
            "date": full_date,
            "home_team": home_team,
            "away_team": away_team,
            "home_score": home_score,
            "away_score": away_score,
            "statistics": stats
        }
    
    except (WebDriverException, TimeoutException) as e:
        logger.error(f"Error scraping match {match_url}: {e}")
        return {"match_id": match_id, "match_url": match_url, "error": str(e)}
    
    finally:
        if driver:
            driver.quit()

def extract_matches(matches: List[Dict[str, Union[str]]], season_name: str) -> pd.DataFrame:
    """
    Extract key details from match data and convert it into a DataFrame.
    """
    match_list = []
    for match in matches:
        match_data = extract_match_data(match, season_name)
        stats_flat = {}
        for section, stats in match_data.get("statistics", {}).items():
            for stat in stats:
                metric_name = stat['metric'].replace(' ', '_').lower()
                stats_flat[f"{section}_{metric_name}_home"] = stat["home_value"]
                stats_flat[f"{section}_{metric_name}_away"] = stat["away_value"]
        
        match_info = {
            "match_id": match_data.get("match_id", "N/A"),
            "match_url": match_data.get("match_url", "N/A"),
            "league": match_data.get("league", "N/A"),
            "round": match_data.get("round", "N/A"),
            "date": match_data.get("date", "N/A"),
            "home_team": match_data.get("home_team", "N/A"),
            "away_team": match_data.get("away_team", "N/A"),
            "home_score": match_data.get("home_score", "N/A"),
            "away_score": match_data.get("away_score", "N/A"),
            **stats_flat
        }
        match_list.append(match_info)
    
    df = pd.DataFrame(match_list)
    if not df.empty:
        df.to_csv("flashscore_matches.csv", index=False)
        logger.info("Match data saved to flashscore_matches.csv")
    return df

def main(country: str = "England", season_name: str = "2024/2025") -> pd.DataFrame:
    """
    Main function to scrape match data from a Flashscore league results page.
    """
    league_name = COUNTRIES_LEAGUES.get(country, "Premier League")
    final_data = pd.DataFrame()
    
    driver = setup_driver()
    try:
        league_url = click_league(driver, country, league_name)
        if not league_url:
            logger.error(f"Failed to navigate to league {league_name}")
            return final_data
        
        matches = get_match_urls(league_url)
        if matches:
            matches_df = extract_matches(matches, season_name)
            final_data = pd.concat([final_data, matches_df], ignore_index=True)
    
    finally:
        driver.quit()
    
    return final_data

if __name__ == "__main__":
    matches_df = main()
    print(matches_df.to_string(index=False))

ERROR:__main__:Error navigating to league Premier League for England: Message: no such element: Unable to locate element: {"method":"link text","selector":"England"}
  (Session info: chrome=136.0.7103.49); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00E1FBD3+61635]
	GetHandleVerifier [0x00E1FC14+61700]
	(No symbol) [0x00C405D3]
	(No symbol) [0x00C8899E]
	(No symbol) [0x00C88D3B]
	(No symbol) [0x00C7E111]
	(No symbol) [0x00CAD2E4]
	(No symbol) [0x00C7E034]
	(No symbol) [0x00CAD514]
	(No symbol) [0x00CCE61B]
	(No symbol) [0x00CAD096]
	(No symbol) [0x00C7C840]
	(No symbol) [0x00C7D6A4]
	GetHandleVerifier [0x01067043+2450739]
	GetHandleVerifier [0x010627C6+2432182]
	GetHandleVerifier [0x0107D50E+2542078]
	GetHandleVerifier [0x00E36895+155013]
	GetHandleVerifier [0x00E3CF7D+181357]
	GetHandleVerifier [0x00E27428+92440]
	GetHandleVerifier [0x00E275D0+92864]
	

Empty DataFrame
Columns: []
Index: []


In [None]:
import logging
import time
import re
import pandas as pd
from typing import List, Dict, Union
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException

# Logging setup
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

# Base URL for Flashscore
BASE_URL = "https://www.flashscore.com"

# League configuration
COUNTRIES_LEAGUES = {'England': 'Premier League'}

def setup_driver() -> webdriver.Chrome:
    """Set up Selenium Chrome driver with headless options."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def execute_script_click(driver: webdriver.Chrome, element) -> None:
    """Execute JavaScript click on an element."""
    driver.execute_script('arguments[0].click();', element)

def click_league(driver: webdriver.Chrome, country: str, league_name: str, retries: int = 3) -> str:
    """
    Navigate to the league page by clicking country and league in the sidebar.
    Falls back to direct league URL if navigation fails.
    Returns the league page URL.
    """
    league_url = f"{BASE_URL}/football/{country.lower()}/{league_name.lower().replace(' ', '-')}/"
    
    for attempt in range(retries):
        try:
            driver.get(BASE_URL + "/football/")
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.ID, 'lc'))
            )
            
            # Click "More" button if present
            try:
                more_countries = driver.find_element(By.CLASS_NAME, 'show-more')
                more_button = more_countries.find_element(By.LINK_TEXT, 'More')
                execute_script_click(driver, more_button)
                time.sleep(3)
            except:
                logger.info("No 'More' button found, proceeding to find country.")
            
            # Find and click country
            left_panel = driver.find_element(By.ID, 'lc')
            country_links = left_panel.find_elements(By.CSS_SELECTOR, "div.mbox0px a[href*='/football/']")
            country_clicked = False
            for link in country_links:
                if link.text.strip().lower() == country.lower():
                    execute_script_click(driver, link)
                    time.sleep(3)
                    country_clicked = True
                    break
            
            if not country_clicked:
                logger.warning(f"Country '{country}' not found in sidebar, falling back to direct URL.")
                driver.get(league_url)
                return league_url
            
            # Find and click league
            league_link = None
            submenu_links = left_panel.find_elements(By.CSS_SELECTOR, "div.submenu a")
            for link in submenu_links:
                if link.text.strip().lower() == league_name.lower():
                    league_link = link
                    break
            
            if league_link:
                league_url = league_link.get_attribute('href')
                execute_script_click(driver, league_link)
                time.sleep(3)
                return league_url
            else:
                logger.warning(f"League '{league_name}' not found in submenu, falling back to direct URL.")
                driver.get(league_url)
                return league_url
        
        except WebDriverException as e:
            logger.error(f"Attempt {attempt + 1} failed navigating to league {league_name} for {country}: {e}")
            if attempt < retries - 1:
                logger.info("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                logger.warning(f"All retries failed, falling back to direct URL: {league_url}")
                driver.get(league_url)
                return league_url
    
    return league_url

def get_table_entries_from_table_div(table_rows_soup: BeautifulSoup, league_name: str, season_name: str) -> List[Dict[str, Union[str, int]]]:
    """
    Extract table entries (team rankings) from standings table.
    """
    table_entries = []
    teams_with_places = {}
    for row in table_rows_soup:
        place = row.find('div', class_='table__cell--rank').text.strip()
        team_name = row.find('span', class_='team_name_span').a.text if row.find('span', class_='team_name_span') else "N/A"
        teams_with_places[team_name] = place
    
    for team_name, place in teams_with_places.items():
        table_entries.append({
            "league": league_name,
            "season": season_name,
            "team": team_name,
            "place": place
        })
    return table_entries

def get_team_stats_from_table_div(table_rows_soup: BeautifulSoup, season_name: str, teams: List[Dict[str, str]]) -> List[Dict[str, Union[str, int]]]:
    """
    Extract team statistics from standings table.
    """
    team_stats = []
    for row in table_rows_soup:
        team_name = row.find('span', class_='team_name_span')
        team_name = team_name.a.text if team_name else "N/A"
        matches_played = row.find('div', class_='table__cell--matches_played').text if row.find('div', class_='table__cell--matches_played') else "N/A"
        wins = row.find('div', class_='table__cell--wins_regular').text if row.find('div', class_='table__cell--wins_regular') else "N/A"
        draws = row.find('div', class_='table__cell--draws').text if row.find('div', class_='table__cell--draws') else "N/A"
        losses = row.find('div', class_='table__cell--losses_regular').text if row.find('div', class_='table__cell--losses_regular') else "N/A"
        goals = row.find('div', class_='table__cell--goals').text.split(':') if row.find('div', class_='table__cell--goals') else ["N/A", "N/A"]
        goals_scored = goals[0]
        goals_conceded = goals[1]
        points = row.find('div', class_='table__cell--points').text if row.find('div', class_='table__cell--points') else "N/A"
        
        team_stats.append({
            "season": season_name,
            "team": team_name,
            "matches_played": matches_played,
            "wins": wins,
            "draws": draws,
            "losses": losses,
            "goals_scored": goals_scored,
            "goals_conceded": goals_conceded,
            "points": points
        })
    return team_stats

def scrape_table(league_url: str, league_name: str, season_name: str, retries: int = 3) -> None:
    """
    Scrape league standings and save to CSV.
    """
    driver = None
    standings_url = f"{league_url.rstrip('/')}/standings/"
    
    for attempt in range(retries):
        try:
            driver = setup_driver()
            driver.get(league_url)
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.sportName"))
            )
            
            # Try to find and click Standings or Table tab
            standings_tab = None
            try:
                standings_tab = driver.find_element(By.LINK_TEXT, 'Standings')
            except:
                try:
                    standings_tab = driver.find_element(By.LINK_TEXT, 'Table')
                except:
                    logger.warning("Standings/Table tab not found, falling back to direct standings URL.")
                    driver.get(standings_url)
            
            if standings_tab:
                execute_script_click(driver, standings_tab)
                WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.ID, 'tabitem-table')))
                
                inner_standings = driver.find_element(By.ID, 'tabitem-table')
                execute_script_click(driver, inner_standings)
            
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'table__body')))
            
            source = driver.find_element(By.CLASS_NAME, 'table__body').get_attribute('innerHTML')
            soup = BeautifulSoup(source, 'lxml')
            table_rows = soup.find_all('div', class_='table__row')
            
            table_entries = get_table_entries_from_table_div(table_rows, league_name, season_name)
            teams = [{"name": entry["team"]} for entry in table_entries]
            team_stats = get_team_stats_from_table_div(table_rows, season_name, teams)
            
            # Save to CSV
            pd.DataFrame(table_entries).to_csv("table_entries.csv", index=False)
            pd.DataFrame(team_stats).to_csv("team_stats.csv", index=False)
            logger.info("Table entries and team stats saved to CSV")
            return
        
        except (WebDriverException, TimeoutException) as e:
            logger.error(f"Attempt {attempt + 1} failed scraping table for {league_url}: {e}")
            if attempt < retries - 1:
                logger.info("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                logger.error(f"All retries failed for table scraping, trying direct standings URL: {standings_url}")
                try:
                    driver.get(standings_url)
                    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'table__body')))
                    
                    source = driver.find_element(By.CLASS_NAME, 'table__body').get_attribute('innerHTML')
                    soup = BeautifulSoup(source, 'lxml')
                    table_rows = soup.find_all('div', class_='table__row')
                    
                    table_entries = get_table_entries_from_table_div(table_rows, league_name, season_name)
                    teams = [{"name": entry["team"]} for entry in table_entries]
                    team_stats = get_team_stats_from_table_div(table_rows, season_name, teams)
                    
                    pd.DataFrame(table_entries).to_csv("table_entries.csv", index=False)
                    pd.DataFrame(team_stats).to_csv("team_stats.csv", index=False)
                    logger.info("Table entries and team stats saved to CSV via direct standings URL")
                except Exception as e:
                    logger.error(f"Failed to scrape table via direct standings URL: {e}")
        
        finally:
            if driver:
                driver.quit()

def calculate_dropped_elements(headers_and_match_divs: List[BeautifulSoup], league_name: str) -> List[int]:
    """
    Identify elements to drop (e.g., non-league matches like play-offs).
    """
    dropped_elements = []
    drop = True
    for ind, element in enumerate(headers_and_match_divs):
        if drop:
            dropped_elements.append(ind)
        if element['class'][0] == 'event__header':
            header_name = element.find('span', class_='event__title--name').text
            previous_drop = drop
            drop = header_name != league_name
            if previous_drop != drop and not previous_drop:
                dropped_elements.append(ind)
    return dropped_elements

def get_season_matches_as_html(league_url: str, league_name: str) -> List[BeautifulSoup]:
    """
    Fetch match HTML from the results page.
    """
    driver = None
    try:
        driver = setup_driver()
        driver.get(league_url)
        results_tab = driver.find_element(By.LINK_TEXT, 'Results')
        execute_script_click(driver, results_tab)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'event__more')))
        
        more_button = driver.find_element(By.CLASS_NAME, 'event__more')
        more_matches = more_button is not None
        while more_matches:
            try:
                execute_script_click(driver, more_button)
                time.sleep(3)
            except StaleElementReferenceException:
                more_matches = False
        
        source = driver.find_element(By.CLASS_NAME, 'event--results').get_attribute('innerHTML')
        soup = BeautifulSoup(source, 'lxml')
        headers_and_match_divs = soup.find_all('div', class_=['event__match', 'event__header'])
        
        dropped_elements_indexes = calculate_dropped_elements(headers_and_match_divs, league_name)
        match_divs = [element for ind, element in enumerate(headers_and_match_divs) if ind not in dropped_elements_indexes]
        return match_divs
    
    except (WebDriverException, TimeoutException) as e:
        logger.error(f"Error fetching matches for {league_url}: {e}")
        return []
    
    finally:
        if driver:
            driver.quit()

def get_match_year(season_name: str, date: str) -> str:
    """
    Append the appropriate season year to the match date.
    """
    season_years = season_name.split('/')
    if len(season_years) == 1:
        return f"{date}{season_years[0]}"
    else:
        date_month = int(date.split('.')[1]) if '.' in date else 1
        return f"{date}{season_years[0]}" if date_month < 7 else f"{date}{season_years[1]}"

def scrape_results(league_url: str, league_name: str, season_name: str) -> None:
    """
    Scrape match results and save to CSV.
    """
    matches_soup = get_season_matches_as_html(league_url, league_name)
    matches = []
    
    for match_div in matches_soup:
        try:
            date_time = match_div.find('div', class_='event__time').text
            date = get_match_year(season_name, date_time.split(' ')[0])
            home_team_name = match_div.find('div', class_='event__participant--home').text
            away_team_name = match_div.find('div', class_='event__participant--away').text
            score = match_div.find('div', class_='event__scores').text.replace(' ', '').split('-') if match_div.find('div', class_='event__scores') else ["N/A", "N/A"]
            home_team_score = score[0]
            away_team_score = score[1]
            
            matches.append({
                "season": season_name,
                "date": date,
                "home_team": home_team_name,
                "away_team": away_team_name,
                "home_team_score": home_team_score,
                "away_team_score": away_team_score
            })
        except Exception as e:
            logger.error(f"Error parsing match data: {e}")
            continue
    
    if matches:
        pd.DataFrame(matches).to_csv("matches.csv", index=False)
        logger.info("Match data saved to matches.csv")

def get_years_from_season_name(season_name: str) -> str:
    """
    Extract season years from season name.
    """
    two_years_season_name = re.search(r'[0-9]{4}/[0-9]{4}', season_name)
    if two_years_season_name:
        return two_years_season_name.group()
    else:
        return re.search(r'[0-9]{4}', season_name).group()

def scrape_league_history(country: str, season_name: str = "2024/2025") -> None:
    """
    Scrape league history for a given country and season.
    """
    league_name = COUNTRIES_LEAGUES.get(country, "Premier League")
    driver = None
    
    try:
        driver = setup_driver()
        driver.get(BASE_URL)
        
        # Navigate to league
        league_url = click_league(driver, country, league_name)
        if not league_url:
            logger.error(f"Failed to navigate to league {league_name}")
            return
        
        # Scrape table and results
        scrape_table(league_url, league_name, season_name)
        scrape_results(league_url, league_name, season_name)
    
    except WebDriverException as e:
        logger.error(f"Error scraping league history for {country}: {e}")
    
    finally:
        if driver:
            driver.quit()

def main(country: str = "England") -> None:
    """
    Main function to scrape league data from Flashscore.
    """
    season_name = "2024/2025"
    scrape_league_history(country, season_name)

if __name__ == "__main__":
    main()

ERROR:__main__:Attempt 1 failed scraping table for https://www.flashscore.com/football/england/premier-league/: Message: 
Stacktrace:
	GetHandleVerifier [0x00E1FBD3+61635]
	GetHandleVerifier [0x00E1FC14+61700]
	(No symbol) [0x00C405D3]
	(No symbol) [0x00C8899E]
	(No symbol) [0x00C88D3B]
	(No symbol) [0x00CD0E12]
	(No symbol) [0x00CAD2E4]
	(No symbol) [0x00CCE61B]
	(No symbol) [0x00CAD096]
	(No symbol) [0x00C7C840]
	(No symbol) [0x00C7D6A4]
	GetHandleVerifier [0x01067043+2450739]
	GetHandleVerifier [0x010627C6+2432182]
	GetHandleVerifier [0x0107D50E+2542078]
	GetHandleVerifier [0x00E36895+155013]
	GetHandleVerifier [0x00E3CF7D+181357]
	GetHandleVerifier [0x00E27428+92440]
	GetHandleVerifier [0x00E275D0+92864]
	GetHandleVerifier [0x00E11FC0+5296]
	BaseThreadInitThunk [0x77365D49+25]
	RtlInitializeExceptionChain [0x7786CF0B+107]
	RtlGetAppContainerNamedObjectPath [0x7786CE91+561]

ERROR:__main__:Attempt 2 failed scraping table for https://www.flashscore.com/football/england/premier-league

In [5]:
#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import time
import logging
import os

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='match_fetcher.log'
)
logger = logging.getLogger(__name__)

def setup_driver():
    """Setup and return configured Chrome WebDriver"""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--log-level=3')
    
    try:
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        driver.implicitly_wait(10)
        return driver
    except Exception as e:
        logger.error(f"Failed to initialize Chrome driver: {e}")
        raise

def get_match_ids(date_str):
    """
    Fetch match IDs from Flashscore for a specific date
    date_str: date in format YYYYMMDD
    """
    driver = setup_driver()
    match_ids = []
    
    try:
        # Go to Flashscore and set the date
        url = f"https://www.flashscore.com/football/?d={date_str}"
        logger.info(f"Fetching matches for date: {date_str}")
        driver.get(url)
        time.sleep(2)

        # Accept GDPR if present
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
            ).click()
        except (TimeoutException, NoSuchElementException):
            pass

        # Wait for matches to load
        time.sleep(2)

        # Find all match elements
        matches = driver.find_elements(By.CLASS_NAME, "event__match")
        
        # Extract match IDs
        for match in matches:
            try:
                match_id = match.get_attribute("id").split("_")[2]
                match_ids.append(match_id)
                logger.info(f"Found match ID: {match_id}")
            except Exception as e:
                logger.error(f"Error extracting match ID: {e}")
                continue

        logger.info(f"Found {len(match_ids)} matches for date {date_str}")
        
        # Save match IDs to file
        output_file = "match_ids_input.txt"
        with open(output_file, "w") as f:
            for match_id in match_ids:
                f.write(f"{match_id}\n")
        
        logger.info(f"Saved match IDs to {output_file}")
        
        return match_ids

    except Exception as e:
        logger.error(f"Error fetching match IDs: {e}")
        return []
    
    finally:
        driver.quit()

def main():
    # Get yesterday's date in YYYYMMDD format
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y%m%d")
    
    try:
        match_ids = get_match_ids(yesterday)
        print(f"Found {len(match_ids)} matches for {yesterday}")
    except Exception as e:
        logger.error(f"Script failed: {e}")

if __name__ == "__main__":
    main()

Found 251 matches for 20250506


In [7]:
#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import logging

from bs4 import BeautifulSoup

import datetime
import json
import os
import re
import time

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraper.log'
)
logger = logging.getLogger(__name__)


# Create an empty list to store all match_ids
match_ids = []

# Path to where is stored match_ids
path = os.getcwd() + r'\match_ids_input.txt'

# Open .txt file in read mode and create a list with all match_ids
with open(path, 'r') as match_ids_results:
    for line in match_ids_results.readlines():
        match_ids.append(line.strip())
        
# Create an empty list to store all dict matches
data_processed = []

def get_match_info(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")
    
    # Country, League, Round, Date
    tournament_info = soup.find('span', attrs={'class':'tournamentHeader__country'}).text

    # Match Date 
    match_date_scrapped = soup.find('div', attrs={'class':'duelParticipant__startTime'}).text
    match_date = datetime.datetime.strptime(match_date_scrapped, '%d.%m.%Y %H:%M').isoformat()

    # Teams
    home_team = soup.find('div', attrs={"class": re.compile('^duelParticipant__home')}).text
    away_team = soup.find('div', attrs={"class": re.compile('^duelParticipant__away')}).text

    # Results
    score = {}

    # full_time, final_result
    full_time_result = soup.find('div', attrs={'class': 'detailScore__fullTime'})
    final_result = soup.find('div', attrs={'class': 'detailScore__wrapper'})
    if full_time_result is None:
        score['final_result'] = final_result.text
    else:
        score['full_time'] = full_time_result.text.replace("(","").replace(")","")
        score['final_result'] = final_result.text

    score['match_status'] = soup.find('div', attrs={'class': 'detailScore__status'}).text

    # first_half, second_half, extra_time, penalties
    half_score_data = soup.find_all('div', attrs={'class': re.compile('^wclHeaderSection--summary')})
    for x in half_score_data:
        key = x.find_all('span')[0].text.lower()
        value = x.find_all('span')[1].text.replace(" ", "")
        score = score | {key: value}

    # ========================

    match_info = {}
    match_info_keys = soup.find_all('div', attrs={'class':re.compile('^wcl-infoLabelWrapper')})
    match_info_values = soup.find_all('div', attrs={'class':re.compile('^wcl-infoValue')})
    for k,v in zip(match_info_keys, match_info_values):
        match_info = match_info | {k.text[:-1].lower() : v.text.replace('\xa0', ' ')}


    # ========================
    # Odds

    odds = {}
    odds_data = soup.find('div', attrs={'class': 'oddsRowContent'})
    odds_labels = odds_data.find_all('span', attrs={'class': 'oddsType'})
    odss_values = odds_data.find_all('span', attrs={'class': re.compile('^oddsValue')})
    for k,v in zip(odds_labels, odss_values):
        odds = odds | {k.text : float(v.text)}



    data = {"tournament": tournament_info} | \
        {"local_datetime": match_date} | \
        {"home_team": home_team} | \
        {"away_team": away_team} | \
        {"score": score} | \
        {"match_info": match_info} | \
        {"odds": odds}

    return data
    
def get_summary(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")

    match_data = []

    data = soup.find_all('div', attrs={'class': 'smv__incident'})
    for i in data:
        match_time = i.find('div', attrs={'class': 'smv__timeBox'}).text
        
        player_out = None
        if i.find('div', attrs={'class': 'smv__incidentSubOut'}) is not None:
            player_out = i.find('div', attrs={'class': 'smv__incidentSubOut'}).text
        
        player = i.find('a', attrs={'class': 'smv__playerName'}).text

        incident = None
        if i.find('div', attrs={'class': 'smv__subIncident'}) is not None:
            incident = i.find('div', attrs={'class': 'smv__subIncident'}).text.replace('(', '').replace(')', '')

        assist = None
        if i.find('div', attrs={'class': 'smv__assist'}) is not None:
            assist = i.find('div', attrs={'class': 'smv__assist'}).text.replace('(', '').replace(')', '')

        incident_icon = None
        if i.find('div', attrs={'class': 'smv__incidentIcon'}) is not None:
            incident_icon = None if i.find('div', attrs={'class': 'smv__incidentIcon'}).text == "" else i.find('div', attrs={'class': 'smv__incidentIcon'}).text

        commentary = None
        if i.find('div', attrs={'class': ''}) is not None:
            commentary = i.find('div', attrs={'class': ''})['title'].replace('<br>', ' ').replace('<br />', ' ').replace('\n', ' ')

        # add team for each event
        if i.find_parent('div')['class'][-1][5:].startswith('home'):
            team = soup.find('div', attrs={"class": re.compile('^duelParticipant__home')}).text
        else:
            team = soup.find('div', attrs={"class": re.compile('^duelParticipant__away')}).text
        
        match_data.append({
            "time": match_time,
            "player_out": player_out,
            "player": player,
            "incident": incident,
            "assist": assist,
            "incident_icon": incident_icon,
            "commentary": commentary,
            "team": team
        })

    return match_data

def get_statistics(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")
    
    data = []
    for i in soup.find_all('div', attrs={'data-testid':'wcl-statistics'}):
        d = {}

        stats_name = i.find('div', attrs={'data-testid': 'wcl-statistics-category'}).text
        home_value = i.find_all('div', attrs={'data-testid': 'wcl-statistics-value'})[0].text
        away_value = i.find_all('div', attrs={'data-testid': 'wcl-statistics-value'})[1].text

        d = {
            "label": stats_name,
            "home_value": home_value,
            "away_value": away_value
        }
        
        data.append(d)

    return data
    
def get_lineup(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")

    data = {}
    data['lineup'] = {}

    try:
        data['lineup']['home_team_formation'] = soup.find_all('span', attrs={'data-testid':'wcl-scores-overline-02'})[0].text
        data['lineup']['away_team_formation'] = soup.find_all('span', attrs={'data-testid':'wcl-scores-overline-02'})[2].text
    except:
        pass

    data['lineup']['home_team'] = []
    data['lineup']['away_team'] = []

    sections = soup.find('div', attrs={'class':'lf__lineUp'})

    for section in sections:
        # Lineup players
        if section.find('div', attrs={'data-testid' : "wcl-headerSection-text"}).text == "Starting Lineups":
            home_players = section.find_all("div", attrs={'class': 'lf__side'})[0]
            away_players = section.find_all("div", attrs={'class': 'lf__side'})[-1]

            for player in home_players:
                player_dict = {}
                player_dict['jersey'] = int(player.find("span", attrs={"data-testid": "wcl-scores-simpleText-01"}).text)
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text  #['href'].split('/')[2].replace('-', ' ').title()

                player_dict['status'] = "lineup"

                data['lineup']['home_team'].append(player_dict)

            for player in away_players:
                player_dict = {}
                player_dict['jersey'] = int(player.find("span", attrs={"data-testid": "wcl-scores-simpleText-01"}).text)
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text   #['href'].split('/')[2].replace('-', ' ').title()

                player_dict['status'] = "lineup"

                data['lineup']['away_team'].append(player_dict)

        # Substituted players
        if section.find('div', attrs={'data-testid' : "wcl-headerSection-text"}).text == "Substituted players":
            home_players = section.find_all("div", attrs={'class': 'lf__side'})[0]
            away_players = section.find_all("div", attrs={'class': 'lf__side'})[-1]

            for player in home_players:
                player_dict = {}    
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text  #['href'].split('/')[2].replace('-', ' ').title()
                try:
                    player_dict['rating'] = float(player.find_all("span", attrs={"data-testid": "wcl-scores-caption-03"})[-1].text)
                except:
                    pass
                player_dict['status'] = "Substituted player"

                data['lineup']['home_team'].append(player_dict)

            for player in away_players:
                player_dict = {}    
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text  #['href'].split('/')[2].replace('-', ' ').title()
                try:
                    player_dict['rating'] = float(player.find_all("span", attrs={"data-testid": "wcl-scores-caption-03"})[-1].text)
                except:
                    pass
                player_dict['status'] = "Substituted player"

                data['lineup']['away_team'].append(player_dict)

        # Substitutes players
        if section.find('div', attrs={'data-testid' : "wcl-headerSection-text"}).text == "Substitutes":
            home_players = section.find_all("div", attrs={'class': 'lf__side'})[0]
            away_players = section.find_all("div", attrs={'class': 'lf__side'})[-1]

            for player in home_players:
                player_dict = {}    
                player_dict['jersey'] = int(player.find("span", attrs={"data-testid": "wcl-scores-simpleText-01"}).text)
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text  #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = "Substitutes"

                data['lineup']['home_team'].append(player_dict)

            for player in away_players:
                player_dict = {}    
                player_dict['jersey'] = int(player.find("span", attrs={"data-testid": "wcl-scores-simpleText-01"}).text)
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text  #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = "Substitutes"

                data['lineup']['away_team'].append(player_dict)

        # Missing Players
        if section.find('div', attrs={'data-testid' : "wcl-headerSection-text"}).text == "Missing Players":
            home_players = section.find_all("div", attrs={'class': 'lf__side'})[0]
            away_players = section.find_all("div", attrs={'class': 'lf__side'})[-1]

            for player in home_players:
                player_dict = {}    
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text    #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = player.find("span", attrs={"data-testid": "wcl-scores-caption-05"}).text

                data['lineup']['home_team'].append(player_dict)

            for player in away_players:
                player_dict = {}    
                player_dict['nationality'] = player.find("img", attrs={"data-testid": "wcl-assetContainerBoxFree-XS"})["alt"]
                player_dict['name'] = player.find("a", attrs={"data-testid": "wcl-textLink"}).text    #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = player.find("span", attrs={"data-testid": "wcl-scores-caption-05"}).text

                data['lineup']['away_team'].append(player_dict)

        # Coaches
        if section.find('div', attrs={'data-testid' : "wcl-headerSection-text"}).text == "Coaches":
            home_coach = section.find_all("div", attrs={'class': 'lf__side'})[0]
            away_coach = section.find_all("div", attrs={'class': 'lf__side'})[-1]

            for coach in home_coach:
                player_dict = {}  
                player_dict['nationality'] = coach.find("img", attrs={"class": re.compile("^wcl-assetContainer")})["alt"]
                player_dict['name'] = coach.find("a", attrs={"data-testid": 'wcl-textLink'}).text  #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = "coach"

                data['lineup']['home_team'].append(player_dict)

            for coach in away_coach:
                player_dict = {}  
                player_dict['nationality'] = coach.find("img", attrs={"class": re.compile("^wcl-assetContainer")})["alt"]
                player_dict['name'] = coach.find("a", attrs={"data-testid": 'wcl-textLink'}).text  #['href'].split('/')[2].replace('-', ' ').title()
                player_dict['status'] = "coach"

                data['lineup']['away_team'].append(player_dict)

    return data
    
def get_commentary(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")

    comments = []

    for event in soup.find_all('div', attrs={'data-testid': 'wcl-commentary'}):
        try:
            minute = event.find('strong', attrs={'data-testid': 'wcl-scores-simpleText-02'}).text.replace("'", "")
        except:
            minute = '0'
    
        try:
            comment = event.find('div', attrs={'class': re.compile('^wcl-general_')}).text
        except:
            pass
        try:
            comment = event.find('div', attrs={'class': re.compile('^wcl-highlighted_')}).text
        except:
            pass
        try:
            comment = event.find('div', attrs={'class': re.compile('^wcl-live_')}).text
        except:
            pass

        comments.append((minute, comment))
        comments.reverse()
        
    return comments
    
def get_report(driver):
    soup = BeautifulSoup(driver.page_source, features="html.parser")
    
    ps = soup.find('div', attrs={'class': 'fsNewsArticle__content'})
    return ps.text.strip().split('\n')[-1].split(': ')[-1]
    
def main():
    try:
        # Setup Chrome options
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--log-level=3')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        driver.implicitly_wait(10)  # Set implicit wait time

        for idx, match_id in enumerate(match_ids):
            logger.info(f'Processing match {match_id} ({idx + 1}/{len(match_ids)})')
            
            try:
                url = f'https://www.flashscore.com/match/{match_id}/#match-summary'
                driver.get(url)
                time.sleep(2)  # Increased wait time

                # Accept GDPR
                try:
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
                    ).click()
                except (TimeoutException, NoSuchElementException):
                    logger.warning("No GDPR consent button found")

                match_data = {}

                # Get match info
                try:
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//a[@href='#/match-summary']"))
                    ).click()
                    time.sleep(1.5)
                    match_data = match_data | get_match_info(driver)
                    match_data['events'] = get_summary(driver)
                except Exception as e:
                    logger.error(f"Error getting match info: {e}")

                # Get statistics
                try:
                    match_data['statistics'] = {}
                    for period in ['full_time', '1st_half', '2nd_half', 'extra_time']:
                        xpath = f"//a[@href='#/match-summary/match-statistics{'/1' if period == '1st_half' else '/2' if period == '2nd_half' else '/3' if period == 'extra_time' else ''}']"
                        try:
                            WebDriverWait(driver, 10).until(
                                EC.element_to_be_clickable((By.XPATH, xpath))
                            ).click()
                            time.sleep(1.5)
                            match_data['statistics'][period] = get_statistics(driver)
                        except NoSuchElementException:
                            if period != 'extra_time':
                                logger.warning(f"Statistics not found for {period}")
                except Exception as e:
                    logger.error(f"Error getting statistics: {e}")

                # Get lineup
                try:
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//a[@href='#/match-summary/lineups']"))
                    ).click()
                    time.sleep(1.5)
                    match_data = match_data | get_lineup(driver)
                except Exception as e:
                    logger.error(f"Error getting lineup: {e}")

                # # Get commentary
                # try:
                #     WebDriverWait(driver, 10).until(
                #         EC.element_to_be_clickable((By.XPATH, "//a[@href='#/match-summary/live-commentary']"))
                #     ).click()
                #     time.sleep(1.5)
                #     match_data['commentary'] = get_commentary(driver)
                # except Exception as e:
                #     logger.warning(f"Commentary not available: {e}")

                # Get match report
                try:
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//a[@href='#/report']"))
                    ).click()
                    time.sleep(1.5)
                    match_data['man_of_the_match'] = get_report(driver)  # Fixed function name
                except Exception as e:
                    logger.warning(f"Match report not available: {e}")

                data_processed.append(match_data)

            except Exception as e:
                logger.error(f"Error processing match {match_id}: {e}")
                continue

        # Save results
        try:
            yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
            output_dir = os.path.join(os.getcwd(), 'processed')
            os.makedirs(output_dir, exist_ok=True)
            
            output_file = os.path.join(output_dir, f"{yesterday.date()}.json")
            with open(output_file, 'w', encoding='utf-8') as json_file:
                json.dump(data_processed, json_file, ensure_ascii=False, indent=2)
            
            logger.info(f"Results saved to {output_file}")
        except Exception as e:
            logger.error(f"Error saving results: {e}")

    except Exception as e:
        logger.error(f"Fatal error: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()