In [55]:
"""
Flashscore Soccer Match Data Scraper

This script scrapes soccer match data from Flashscore using a functional approach,
with separate functions for different scraping operations.
"""

import time
import json
import csv
import os
import random
import re
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Union

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    WebDriverException,
    StaleElementReferenceException
)


In [56]:

# Global variables
BASE_URL = "https://www.flashscore.com"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("flashscore_scraper.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("flashscore_scraper")

# Initialize global variables for driver and session
driver = None
session = None


def setup_directories():
    """Create directories for storing data."""
    os.makedirs("data", exist_ok=True)
    os.makedirs("data/matches", exist_ok=True)
    os.makedirs("data/leagues", exist_ok=True)
    os.makedirs("data/live", exist_ok=True)


def setup_selenium_driver(headless: bool = True, chrome_driver_path: Optional[str] = None) -> webdriver.Chrome:
    """
    Set up the Selenium WebDriver.
    
    Args:
        headless: Whether to run the browser in headless mode
        chrome_driver_path: Path to the Chrome webdriver executable
        
    Returns:
        A configured Chrome WebDriver
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--disable-extensions")
    options.add_argument(f"user-agent={HEADERS['User-Agent']}")
    
    if chrome_driver_path:
        service = Service(executable_path=chrome_driver_path)
        driver = webdriver.Chrome(service=service, options=options)
    else:
        driver = webdriver.Chrome(options=options)
    
    driver.set_page_load_timeout(30)
    return driver


def init_session() -> requests.Session:
    """
    Initialize a requests session with appropriate headers.
    
    Returns:
        A configured requests Session
    """
    session = requests.Session()
    session.headers.update(HEADERS)
    return session


def random_delay(min_seconds: float = 1.5, max_seconds: float = 4.0):
    """
    Add a random delay to avoid detection.
    
    Args:
        min_seconds: Minimum delay in seconds
        max_seconds: Maximum delay in seconds
    """
    delay = random.uniform(min_seconds, max_seconds)
    time.sleep(delay)


def get_page_content(url: str, use_selenium: bool = True) -> str:
    """
    Get the HTML content of a page.
    
    Args:
        url: The URL to fetch
        use_selenium: Whether to use Selenium (True) or requests (False)
        
    Returns:
        The HTML content of the page
    """
    global driver, session
    
    random_delay()
    
    if use_selenium:
        if not driver:
            logger.error("Selenium driver not initialized")
            return ""
        
        try:
            driver.get(url)
            # Wait for the main content to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            return driver.page_source
        except (TimeoutException, WebDriverException) as e:
            logger.error(f"Error loading page with Selenium: {url} - {str(e)}")
            return ""
    else:
        if not session:
            logger.error("Requests session not initialized")
            return ""
        
        try:
            response = session.get(url, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            logger.error(f"Error loading page with requests: {url} - {str(e)}")
            return ""


def get_available_leagues(use_selenium: bool = True) -> List[Dict[str, str]]:
    """
    Get a list of available soccer leagues.
    
    Args:
        use_selenium: Whether to use Selenium or requests
        
    Returns:
        A list of dictionaries containing league information
    """
    url = f"{BASE_URL}/football/"
    content = get_page_content(url, use_selenium)
    
    if not content:
        logger.error("Failed to fetch available leagues")
        return []
    
    soup = BeautifulSoup(content, "html.parser")
    leagues = []
    
    # This selector will need to be updated based on actual Flashscore HTML structure
    # The following is a placeholder that needs to be adjusted
    league_elements = soup.select(".leagues-list a")
    
    for element in league_elements:
        league_url = element.get("href", "")
        if league_url:
            league_name = element.text.strip()
            league_id = league_url.split("/")[-1]
            
            leagues.append({
                "id": league_id,
                "name": league_name,
                "url": f"{BASE_URL}{league_url}"
            })
    
    logger.info(f"Found {len(leagues)} leagues")
    return leagues


In [57]:


def get_league_fixtures(league_url: str, season: Optional[str] = None, use_selenium: bool = True) -> List[Dict[str, Any]]:
    """
    Get fixtures for a specific league.
    
    Args:
        league_url: The URL of the league
        season: Optional season identifier
        use_selenium: Whether to use Selenium or requests
        
    Returns:
        A list of dictionaries containing fixture information
    """
    fixtures_url = f"{league_url}/fixtures/"
    if season:
        fixtures_url = f"{fixtures_url}{season}/"
    
    content = get_page_content(fixtures_url, use_selenium)
    
    if not content:
        logger.error(f"Failed to fetch fixtures for league: {league_url}")
        return []
    
    soup = BeautifulSoup(content, "html.parser")
    fixtures = []
    
    # This selector will need to be updated based on actual Flashscore HTML structure
    match_elements = soup.select(".event__match")
    
    for element in match_elements:
        try:
            match_id = element.get("id", "").replace("g_1_", "")
            
            if not match_id:
                continue
            
            home_team = element.select_one(".event__participant--home").text.strip()
            away_team = element.select_one(".event__participant--away").text.strip()
            
            match_time_element = element.select_one(".event__time")
            match_date = match_time_element.get("data-date", "") if match_time_element else ""
            match_time = match_time_element.text.strip() if match_time_element else ""
            
            match_status_element = element.select_one(".event__stage")
            match_status = match_status_element.text.strip() if match_status_element else ""
            
            score_element = element.select_one(".event__scores")
            score = score_element.text.strip() if score_element else ""
            
            fixture = {
                "id": match_id,
                "home_team": home_team,
                "away_team": away_team,
                "date": match_date,
                "time": match_time,
                "status": match_status,
                "score": score,
                "url": f"{BASE_URL}/match/{match_id}/"
            }
            
            fixtures.append(fixture)
        except (AttributeError, Exception) as e:
            logger.error(f"Error parsing fixture: {str(e)}")
    
    logger.info(f"Found {len(fixtures)} fixtures for league: {league_url}")
    return fixtures


In [58]:


def get_match_details(match_id: str, use_selenium: bool = True) -> Dict[str, Any]:
    """
    Get detailed information for a specific match.
    
    Args:
        match_id: The ID of the match
        use_selenium: Whether to use Selenium or requests
        
    Returns:
        A dictionary containing match details
    """
    match_url = f"{BASE_URL}/match/{match_id}/"
    content = get_page_content(match_url, use_selenium)
    
    if not content:
        logger.error(f"Failed to fetch match details for: {match_id}")
        return {}
    
    soup = BeautifulSoup(content, "html.parser")
    match_details = {
        "id": match_id,
        "url": match_url,
        "teams": {},
        "score": {},
        "events": [],
        "stats": {},
        "lineups": {},
        "h2h": []
    }
    
    try:
        # Get teams
        home_team_element = soup.select_one(".duelParticipant__home")
        away_team_element = soup.select_one(".duelParticipant__away")
        
        if home_team_element and away_team_element:
            match_details["teams"]["home"] = home_team_element.text.strip()
            match_details["teams"]["away"] = away_team_element.text.strip()
        
        # Get score
        score_element = soup.select_one(".detailScore__wrapper")
        if score_element:
            home_score = score_element.select_one(".detailScore__home")
            away_score = score_element.select_one(".detailScore__away")
            
            if home_score and away_score:
                match_details["score"]["full_time"] = {
                    "home": home_score.text.strip(),
                    "away": away_score.text.strip()
                }
        
        # Get half-time score
        ht_score_element = soup.select_one(".detailScore__status")
        if ht_score_element and "HT" in ht_score_element.text:
            ht_score_match = re.search(r'\((\d+):(\d+)\)', ht_score_element.text)
            if ht_score_match:
                match_details["score"]["half_time"] = {
                    "home": ht_score_match.group(1),
                    "away": ht_score_match.group(2)
                }
        
        # Get match events (goals, cards, substitutions)
        events_elements = soup.select(".detailMS__incidentRow")
        
        for event_element in events_elements:
            event_type_element = event_element.select_one(".detailMS__incidentType")
            event_time_element = event_element.select_one(".detailMS__incidentTime")
            event_player_element = event_element.select_one(".detailMS__incidentPlayer")
            
            if event_type_element and event_time_element and event_player_element:
                event_type = event_type_element.get("class", [])
                event_type = [cls for cls in event_type if "icon" in cls]
                event_type = event_type[0].replace("icon-", "") if event_type else "unknown"
                
                event_time = event_time_element.text.strip()
                event_player = event_player_element.text.strip()
                
                event = {
                    "type": event_type,
                    "time": event_time,
                    "player": event_player,
                    "team": "home" if "home" in event_element.get("class", []) else "away"
                }
                
                match_details["events"].append(event)
        
        # Get match statistics
        stats_container = soup.select_one("#tab-statistics-0-statistic")
        if stats_container:
            stat_items = stats_container.select(".statRow")
            
            for stat_item in stat_items:
                stat_name_element = stat_item.select_one(".statTextGroup")
                home_value_element = stat_item.select_one(".statHomeValue")
                away_value_element = stat_item.select_one(".statAwayValue")
                
                if stat_name_element and home_value_element and away_value_element:
                    stat_name = stat_name_element.text.strip().lower().replace(" ", "_")
                    home_value = home_value_element.text.strip()
                    away_value = away_value_element.text.strip()
                    
                    match_details["stats"][stat_name] = {
                        "home": home_value,
                        "away": away_value
                    }
        
        # Get lineups
        lineups_container = soup.select_one("#tab-lineups-0-team")
        if lineups_container:
            home_lineup = lineups_container.select(".lineups__playerHome .pl__name")
            away_lineup = lineups_container.select(".lineups__playerAway .pl__name")
            
            match_details["lineups"]["home"] = [player.text.strip() for player in home_lineup]
            match_details["lineups"]["away"] = [player.text.strip() for player in away_lineup]
        
        # Get H2H (Head to Head)
        h2h_container = soup.select_one("#tab-h2h-0")
        if h2h_container:
            h2h_matches = h2h_container.select(".h2h__match")
            
            for h2h_match in h2h_matches:
                home_team = h2h_match.select_one(".h2h__homeParticipant")
                away_team = h2h_match.select_one(".h2h__awayParticipant")
                result = h2h_match.select_one(".h2h__result")
                date = h2h_match.select_one(".h2h__date")
                
                if home_team and away_team and result and date:
                    h2h_item = {
                        "home_team": home_team.text.strip(),
                        "away_team": away_team.text.strip(),
                        "result": result.text.strip(),
                        "date": date.text.strip()
                    }
                    
                    match_details["h2h"].append(h2h_item)
    
    except Exception as e:
        logger.error(f"Error parsing match details for {match_id}: {str(e)}")
    
    return match_details




In [59]:
def get_live_matches(use_selenium: bool = True) -> List[Dict[str, Any]]:
    """
    Get currently live matches.
    
    Args:
        use_selenium: Whether to use Selenium or requests
        
    Returns:
        A list of dictionaries containing live match information
    """
    url = f"{BASE_URL}/live/"
    content = get_page_content(url, use_selenium)
    
    if not content:
        logger.error("Failed to fetch live matches")
        return []
    
    soup = BeautifulSoup(content, "html.parser")
    live_matches = []
    
    # This selector will need to be updated based on actual Flashscore HTML structure
    match_elements = soup.select(".event__match--live")
    
    for element in match_elements:
        try:
            match_id = element.get("id", "").replace("g_1_", "")
            
            if not match_id:
                continue
            
            home_team = element.select_one(".event__participant--home").text.strip()
            away_team = element.select_one(".event__participant--away").text.strip()
            
            match_time_element = element.select_one(".event__stage--block")
            match_time = match_time_element.text.strip() if match_time_element else ""
            
            score_element = element.select_one(".event__scores")
            score = score_element.text.strip() if score_element else ""
            
            live_match = {
                "id": match_id,
                "home_team": home_team,
                "away_team": away_team,
                "current_time": match_time,
                "current_score": score,
                "url": f"{BASE_URL}/match/{match_id}/"
            }
            
            live_matches.append(live_match)
        except Exception as e:
            logger.error(f"Error parsing live match: {str(e)}")
    
    logger.info(f"Found {len(live_matches)} live matches")
    return live_matches




In [60]:
def get_historical_data(league_url: str, seasons: List[str], use_selenium: bool = True) -> Dict[str, List[Dict[str, Any]]]:
    """
    Get historical data for a league across multiple seasons.
    
    Args:
        league_url: The URL of the league
        seasons: List of season identifiers
        use_selenium: Whether to use Selenium or requests
        
    Returns:
        A dictionary mapping seasons to lists of fixtures
    """
    historical_data = {}
    
    for season in seasons:
        logger.info(f"Fetching historical data for season: {season}")
        fixtures = get_league_fixtures(league_url, season, use_selenium)
        historical_data[season] = fixtures
        
        # Add a delay between seasons to avoid detection
        random_delay(3.0, 6.0)
    
    return historical_data




In [61]:
def save_to_json(data: Any, file_path: str):
    """
    Save data to a JSON file.
    
    Args:
        data: The data to save
        file_path: The path to the output file
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Data saved to {file_path}")
    except Exception as e:
        logger.error(f"Error saving data to {file_path}: {str(e)}")


def save_to_csv(data: List[Dict[str, Any]], file_path: str):
    """
    Save data to a CSV file.
    
    Args:
        data: The data to save (list of dictionaries)
        file_path: The path to the output file
    """
    if not data:
        logger.error(f"No data to save to {file_path}")
        return
    
    try:
        # Get all unique keys from all dictionaries
        fieldnames = set()
        for item in data:
            fieldnames.update(item.keys())
        
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=sorted(fieldnames))
            writer.writeheader()
            writer.writerows(data)
        
        logger.info(f"Data saved to {file_path}")
    except Exception as e:
        logger.error(f"Error saving data to {file_path}: {str(e)}")




In [62]:
def run_league_scraper(league_url: str, output_format: str = "json", use_selenium: bool = True):
    """
    Run the scraper for a specific league.
    
    Args:
        league_url: The URL of the league
        output_format: Output format ("json" or "csv")
        use_selenium: Whether to use Selenium or requests
    """
    league_id = league_url.split("/")[-1]
    
    # Get all fixtures for the league
    fixtures = get_league_fixtures(league_url, use_selenium=use_selenium)
    
    # Save fixtures
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"data/leagues/{league_id}"
    os.makedirs(output_dir, exist_ok=True)
    
    output_file = f"{output_dir}/fixtures_{timestamp}"
    
    if output_format.lower() == "json":
        save_to_json(fixtures, f"{output_file}.json")
    elif output_format.lower() == "csv":
        save_to_csv(fixtures, f"{output_file}.csv")
    
    # Get detailed information for each match
    match_details = []
    for fixture in fixtures[:10]:  # Limit to 10 matches for testing
        match_id = fixture["id"]
        details = get_match_details(match_id, use_selenium=use_selenium)
        match_details.append(details)

        # Save individual match details
        match_dir = f"data/matches/{match_id}"
        os.makedirs(match_dir, exist_ok=True)
        
        if output_format.lower() == "json":
            save_to_json(details, f"{match_dir}/details.json")
        
        # Add a delay between requests
        random_delay()
    
    # Save all match details in one file
    if output_format.lower() == "json":
        save_to_json(match_details, f"{output_dir}/match_details_{timestamp}.json")
    elif output_format.lower() == "csv":
        # Flatten match details for CSV
        flattened_details = []
        for match in match_details:
            flat_match = {
                "id": match.get("id", ""),
                "url": match.get("url", ""),
                "home_team": match.get("teams", {}).get("home", ""),
                "away_team": match.get("teams", {}).get("away", ""),
                "home_score": match.get("score", {}).get("full_time", {}).get("home", ""),
                "away_score": match.get("score", {}).get("full_time", {}).get("away", ""),
                "ht_home_score": match.get("score", {}).get("half_time", {}).get("home", ""),
                "ht_away_score": match.get("score", {}).get("half_time", {}).get("away", "")
            }
            
            # Add statistics
            for stat_name, stat_values in match.get("stats", {}).items():
                flat_match[f"stat_{stat_name}_home"] = stat_values.get("home", "")
                flat_match[f"stat_{stat_name}_away"] = stat_values.get("away", "")
            
            flattened_details.append(flat_match)
        
        save_to_csv(flattened_details, f"{output_dir}/match_details_{timestamp}.csv")




In [63]:
def run_live_scraper(interval: int = 60, duration: int = 3600, output_format: str = "json", use_selenium: bool = True):
    """
    Run the scraper for live matches with periodic updates.
    
    Args:
        interval: Update interval in seconds
        duration: Total duration to run in seconds
        output_format: Output format ("json" or "csv")
        use_selenium: Whether to use Selenium or requests
    """
    start_time = time.time()
    end_time = start_time + duration
    
    while time.time() < end_time:
        # Get current live matches
        live_matches = get_live_matches(use_selenium=use_selenium)
        
        if live_matches:
            # Save live matches
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = "data/live"
            output_file = f"{output_dir}/live_matches_{timestamp}"
            
            if output_format.lower() == "json":
                save_to_json(live_matches, f"{output_file}.json")
            elif output_format.lower() == "csv":
                save_to_csv(live_matches, f"{output_file}.csv")
            
            # Get detailed information for each live match
            for match in live_matches:
                match_id = match["id"]
                details = get_match_details(match_id, use_selenium=use_selenium)
                
                match_dir = f"{output_dir}/{match_id}"
                os.makedirs(match_dir, exist_ok=True)
                
                if output_format.lower() == "json":
                    save_to_json(details, f"{match_dir}/details_{timestamp}.json")
                
                # Add a delay between requests
                random_delay(1.0, 2.0)
        
        # Wait for the next update
        time_to_sleep = max(0, interval - (time.time() - start_time) % interval)
        logger.info(f"Waiting {time_to_sleep:.2f} seconds for next update...")
        time.sleep(time_to_sleep)




In [64]:
def cleanup():
    """Clean up resources."""
    global driver
    if driver:
        driver.quit()
        driver = None


def initialize(use_selenium: bool = True, headless: bool = True, chrome_driver_path: Optional[str] = None):
    """
    Initialize the scraper.
    
    Args:
        use_selenium: Whether to use Selenium for scraping
        headless: Whether to run the browser in headless mode
        chrome_driver_path: Path to the Chrome webdriver executable
    """
    global driver, session
    
    # Set up directories
    setup_directories()
    
    # Initialize driver or session
    if use_selenium:
        driver = setup_selenium_driver(headless, chrome_driver_path)
    
    session = init_session()
    
    logger.info("Scraper initialized")




In [65]:
import sys

def main():
    """Main function to demonstrate the Flashscore scraper."""
    # Check if running in IPython/Jupyter
    is_jupyter = 'ipykernel' in sys.modules
    
    if not is_jupyter:
        # Normal command line execution
        import argparse
        parser = argparse.ArgumentParser(description="Flashscore Soccer Match Data Scraper")
        parser.add_argument("--no-selenium", action="store_true", help="Use requests/BeautifulSoup instead of Selenium")
        parser.add_argument("--headless", action="store_true", help="Run Selenium in headless mode")
        parser.add_argument("--chromedriver", type=str, help="Path to ChromeDriver executable")
        parser.add_argument("--league", type=str, help="League URL to scrape")
        parser.add_argument("--live", action="store_true", help="Scrape live matches")
        parser.add_argument("--interval", type=int, default=60, help="Update interval for live matches in seconds")
        parser.add_argument("--duration", type=int, default=3600, help="Duration to run live scraper in seconds")
        parser.add_argument("--format", type=str, choices=["json", "csv"], default="json", help="Output format")
        args = parser.parse_args()
    else:
        # Default values for Jupyter
        class Args:
            no_selenium = False
            headless = True
            chromedriver = None
            league = "https://www.flashscore.com/football/brazil/serie-a-betano/"
            live = False
            interval = 60
            duration = 3600
            format = "csv"
        args = Args()

    try:
        initialize(
            use_selenium=not args.no_selenium,
            headless=args.headless,
            chrome_driver_path=args.chromedriver
        )
        
        if args.league:
            run_league_scraper(args.league, args.format, use_selenium=not args.no_selenium)
        elif args.live:
            run_live_scraper(args.interval, args.duration, args.format, use_selenium=not args.no_selenium)
        else:
            leagues = get_available_leagues(use_selenium=not args.no_selenium)
            print("Available leagues:")
            for i, league in enumerate(leagues):
                print(f"{i+1}. {league['name']} - {league['url']}")
            
            if is_jupyter:
                print("In Jupyter, please specify --league directly in function calls")
    
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    

In [66]:
if __name__ == "__main__":
    main()

2025-05-04 18:10:53,113 - flashscore_scraper - INFO - Scraper initialized
2025-05-04 18:11:10,806 - flashscore_scraper - INFO - Found 0 fixtures for league: https://www.flashscore.com/football/brazil/serie-a-betano/
2025-05-04 18:11:10,819 - flashscore_scraper - ERROR - No data to save to data/leagues//fixtures_20250504_181110.csv
2025-05-04 18:11:10,836 - flashscore_scraper - ERROR - No data to save to data/leagues//match_details_20250504_181110.csv
