# FotMob Soccer Data Scraper with XPath

This notebook implements a web scraper for FotMob.com using Selenium with XPath selectors. Features include:
- XPath-based element selection
- Match data extraction using full XPaths
- Structured JSON output
- Error handling and validation

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import json
import time
import random

class FotMobXPathScraper:
    """Scrape match listings from the FotMob homepage with Selenium and XPath.

    Typical use is ``FotMobXPathScraper().scrape()``: it launches Chrome,
    collects one dict per match row into ``self.results`` and writes them to
    ``fotmob_matches.json`` in the current working directory.
    """

    # Upper bound on scroll passes so that a page which keeps growing
    # cannot trap the scraper in an endless loop.
    MAX_SCROLL_PASSES = 20
    # Number of increments used to walk down the page on each pass.
    SCROLL_STEPS = 5

    def __init__(self):
        """Prepare Chrome options and the XPath table; no browser is started yet."""
        self.base_url = 'https://www.fotmob.com'
        self.options = self._setup_chrome_options()
        self.driver = None
        self.results = []

        # Define XPath selectors. 'content_container' is absolute; the
        # './/'-prefixed entries are evaluated relative to a parent element.
        self.XPATHS = {
            'content_container': '/html/body/div[1]/main/main/div/div/div[1]/div[3]/section/div[2]',
            'match_rows': './/div[contains(@class, "css") and @role="row"]',
            'teams': './/div[contains(@class, "teamName")]',
            'scores': './/div[contains(@class, "score")]',
            'match_time': './/div[contains(@class, "matchTime")]',
            'match_status': './/div[contains(@class, "matchStatus")]',
            'league_name': '//*[@data-testid="competition-name"]',
            'cookie_consent': '//*[@data-testid="cookie-consent-accept"]'
        }

    def _setup_chrome_options(self):
        """Return Chrome ``Options`` with the automation switches disabled."""
        options = Options()
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--start-maximized')
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-notifications')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        return options

    def _random_delay(self, min_sec=1, max_sec=3):
        """Sleep for a random duration between ``min_sec`` and ``max_sec`` seconds."""
        time.sleep(random.uniform(min_sec, max_sec))

    @staticmethod
    def _parse_score(score_text):
        """Split a score cell into ``(home_score, away_score, status)``.

        Text containing ``' - '`` is treated as a score and reported as
        ``'Completed'``; any other text is returned as the status with both
        scores set to ``None``. Only the first separator is used, so extra
        separators in the text cannot raise ``ValueError``.
        """
        if ' - ' not in score_text:
            return None, None, score_text
        home_score, away_score = (
            part.strip() for part in score_text.split(' - ', 1)
        )
        return home_score, away_score, 'Completed'

    @staticmethod
    def _optional_text(root, xpath, default=None):
        """Return the text of the first ``xpath`` match under ``root``, else ``default``.

        Best-effort lookup: any lookup failure yields ``default``, but
        ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed.
        """
        try:
            return root.find_element(By.XPATH, xpath).text
        except Exception:
            return default

    def _dismiss_cookie_banner(self):
        """Click the cookie-consent button if one is present (best effort)."""
        try:
            cookie_button = self.driver.find_element(By.XPATH, self.XPATHS['cookie_consent'])
            cookie_button.click()
            self._random_delay()
        except NoSuchElementException:
            # No banner on the page; nothing to dismiss.
            pass
        except Exception as exc:
            # The banner is optional, so report the problem and carry on.
            print(f"Could not dismiss cookie banner: {exc}")

    def _scroll_to_load_all(self):
        """Scroll down in increments until the page height stops growing."""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        for _ in range(self.MAX_SCROLL_PASSES):
            # Steps run 1..SCROLL_STEPS so the final step lands on the very
            # bottom of the page, which is what triggers lazy loading.
            for step in range(1, self.SCROLL_STEPS + 1):
                target = step * last_height / self.SCROLL_STEPS
                self.driver.execute_script(f"window.scrollTo(0, {target});")
                self._random_delay(0.5, 1)

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def _parse_match(self, match, idx, league_name):
        """Build the result dict for one match row, or ``None`` if it has fewer than two teams."""
        teams = match.find_elements(By.XPATH, self.XPATHS['teams'])
        if len(teams) < 2:
            return None
        home_team = teams[0].text
        away_team = teams[1].text

        try:
            score_text = match.find_element(By.XPATH, self.XPATHS['scores']).text
        except NoSuchElementException:
            home_score = away_score = None
            status = 'Scheduled'
        else:
            home_score, away_score, status = self._parse_score(score_text)

        match_time = self._optional_text(match, self.XPATHS['match_time'])

        return {
            'league': league_name,
            'match_number': idx,
            'home_team': home_team,
            'away_team': away_team,
            'home_score': home_score,
            'away_score': away_score,
            'status': status,
            'time': match_time,
            'timestamp': datetime.now().isoformat()
        }

    def _print_page_preview(self, limit=1000):
        """Print the first ``limit`` characters of the page source for debugging.

        Does nothing when no driver exists, and never raises: a broken
        browser session must not mask the error that is being reported.
        """
        if self.driver is None:
            return
        try:
            preview = self.driver.page_source[:limit]
        except Exception as exc:
            print(f"Could not read page source: {exc}")
            return
        print("Page source preview:")
        print(preview)

    def scrape_matches(self):
        """Collect every match row on the loaded page into ``self.results``.

        Expects ``self.driver`` to have the page loaded already. Errors are
        reported on stdout rather than raised; a failure on one row skips
        only that row.
        """
        try:
            print("Waiting for content to load...")
            wait = WebDriverWait(self.driver, 20)

            # Wait for main container
            container = wait.until(
                EC.presence_of_element_located((By.XPATH, self.XPATHS['content_container']))
            )
            print("Content container found")

            self._dismiss_cookie_banner()

            print("Scrolling to load all matches...")
            self._scroll_to_load_all()

            # Find all match elements using XPath
            matches = container.find_elements(By.XPATH, self.XPATHS['match_rows'])
            print(f"Found {len(matches)} matches")

            league_name = self._optional_text(
                self.driver, self.XPATHS['league_name'], "Unknown League"
            )

            for idx, match in enumerate(matches, 1):
                try:
                    match_data = self._parse_match(match, idx, league_name)
                except Exception as e:
                    print(f"Error processing match {idx}: {str(e)}")
                    continue
                if match_data is None:
                    continue

                self.results.append(match_data)
                print(f"Processed match {idx}: {match_data['home_team']} vs {match_data['away_team']}")

        except Exception as e:
            print(f"Error scraping matches: {str(e)}")
            self._print_page_preview()

    def scrape(self):
        """Run the whole pipeline: start Chrome, load the page, scrape, save, quit."""
        try:
            self.driver = webdriver.Chrome(options=self.options)
            print("Loading FotMob homepage...")
            self.driver.get(self.base_url)
            self._random_delay(8, 10)  # Longer initial wait

            # Scrape matches
            self.scrape_matches()

            # Save results
            self.save_results()

        finally:
            # Always release the browser, even when scraping or saving fails.
            if self.driver:
                self.driver.quit()

    def save_results(self):
        """Write ``self.results`` plus run metadata to ``fotmob_matches.json``."""
        output = {
            'metadata': {
                'url': self.base_url,
                'timestamp': datetime.now().isoformat(),
                'total_matches': len(self.results)
            },
            'matches': self.results
        }

        with open('fotmob_matches.json', 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        print(f"\nSaved {len(self.results)} matches to fotmob_matches.json")

# Build the scraper and run the complete scrape-and-save pipeline
scraper = FotMobXPathScraper()
scraper.scrape()

# Show the first scraped record, or a hint when nothing was collected
if not scraper.results:
    print("No matches were scraped. Please check the XPath selectors.")
else:
    print("\nSample match data:")
    print(json.dumps(scraper.results[0], indent=2))

Loading FotMob homepage...
Waiting for content to load...
Waiting for content to load...
Error scraping matches: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff6a5c9fea5+79173]
	GetHandleVerifier [0x0x7ff6a5c9ff00+79264]
	(No symbol) [0x0x7ff6a5a59e5a]
	(No symbol) [0x0x7ff6a5ab0586]
	(No symbol) [0x0x7ff6a5ab083c]
	(No symbol) [0x0x7ff6a5b04247]
	(No symbol) [0x0x7ff6a5ad89af]
	(No symbol) [0x0x7ff6a5b0100d]
	(No symbol) [0x0x7ff6a5ad8743]
	(No symbol) [0x0x7ff6a5aa14c1]
	(No symbol) [0x0x7ff6a5aa2253]
	GetHandleVerifier [0x0x7ff6a5f6a2dd+3004797]
	GetHandleVerifier [0x0x7ff6a5f6472d+2981325]
	GetHandleVerifier [0x0x7ff6a5f83380+3107360]
	GetHandleVerifier [0x0x7ff6a5cbaa2e+188622]
	GetHandleVerifier [0x0x7ff6a5cc22bf+219487]
	GetHandleVerifier [0x0x7ff6a5ca8df4+115860]
	GetHandleVerifier [0x0x7ff6a5ca8fa9+116297]
	GetHandleVerifier [0x0x7ff6a5c8f558+11256]
	BaseThreadInitThunk [0x0x7ffb2120e8d7+23]
	RtlUserThreadStart [0x0x7ffb2209c5dc+44]

Page source preview:
<html lang="en" data

In [None]:
import pandas as pd
import json

# Flatten every record in `results` into one DataFrame row.
# NOTE(review): `results` is not assigned in this cell; it must already exist
# in the kernel. Each record is indexed as a dict with 'date', 'status' and
# nested 'home_team'/'away_team' dicts holding 'name' and 'score' -- confirm
# this matches whatever produced `results`.
df = pd.DataFrame([
    dict(
        date=record['date'],
        home_team=record['home_team']['name'],
        away_team=record['away_team']['name'],
        home_score=record['home_team']['score'],
        away_score=record['away_team']['score'],
        status=record['status'],
    )
    for record in results
])

# Report what was collected
if len(df) > 0:
    print(f"Total matches scraped: {len(df)}")
    print("\nFirst few matches (DataFrame format):")
    print(df.head())

    print("\nSample JSON output (first 2 matches):")
    print(json.dumps(results[:2], indent=2, ensure_ascii=False))
else:
    print("No data was collected. Please check the selectors and try again.")

# Analyze the scraped data: print counts per competition/status, goal
# statistics for rows that have a home score, and a sample of the rows.
if 'df' in locals() and len(df) > 0:
    # Basic statistics
    print("\nData Analysis:")
    print(f"Total matches: {len(df)}")

    if 'competition' in df.columns:
        print("\nMatches by competition:")
        print(df['competition'].value_counts())

    if 'status' in df.columns:
        print("\nMatches by status:")
        print(df['status'].value_counts())

    if 'home_score' in df.columns and 'away_score' in df.columns:
        # Take an explicit copy: adding a column to a filtered slice of `df`
        # would otherwise trigger pandas' SettingWithCopyWarning.
        completed_matches = df[df['home_score'].notna()].copy()
        if len(completed_matches) > 0:
            print("\nScoring Statistics:")
            completed_matches['total_goals'] = (
                completed_matches['home_score'].astype(float)
                + completed_matches['away_score'].astype(float)
            )
            print(completed_matches['total_goals'].describe())

    print("\nSample matches:")
    display_cols = ['datetime', 'home_team', 'home_score', 'away_score', 'away_team', 'status']
    # Keep only the columns that actually exist, so a missing column
    # (e.g. 'datetime') cannot raise KeyError.
    display_cols = [col for col in display_cols if col in df.columns]
    print(df[display_cols].head())
else:
    print("No data available for analysis")

# Summarise the scraper's results and export them to CSV, when there are any
if 'scraper' not in locals() or not scraper.results:
    print("No data available for analysis")
else:
    df = pd.DataFrame(scraper.results)

    # Headline numbers
    print("\nData Analysis:")
    print(f"Total matches scraped: {len(df)}")

    if 'league' in df.columns:
        print("\nMatches by league:")
        print(df['league'].value_counts())

    if 'status' in df.columns:
        print("\nMatches by status:")
        print(df['status'].value_counts())

    # Preview only the columns that are actually present in the frame
    print("\nSample matches:")
    display_cols = [
        col
        for col in ['league', 'home_team', 'home_score', 'away_score', 'away_team', 'status', 'time']
        if col in df.columns
    ]
    print(df[display_cols].head())

    # A CSV copy is easier to open in a spreadsheet
    df.to_csv('fotmob_matches.csv', index=False)
    print("\nData also saved to fotmob_matches.csv")

No data was collected. Please check the selectors and try again.


In [None]:
# Parse the 'date' column (day.month.year text) and order rows newest first.
# NOTE(review): requires `df` to have a 'date' column -- confirm which earlier
# cell produced `df` before running this one.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = df.sort_values(by='date', ascending=False)

# Lift the row limit so the whole frame is rendered
pd.options.display.max_rows = None
df

# FotMob Premier League Match Scraper

This notebook scrapes match data from FotMob's Premier League page. Because FotMob renders its match data dynamically with JavaScript, a plain HTTP scraper such as Scrapy would not see it, so we drive a real browser with Selenium instead.

In [None]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Initialize Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode (no GUI)
driver = webdriver.Chrome(options=options)

# URL to scrape
url = "https://www.fotmob.com/leagues/47/matches/premier-league?group=by-date"

# List to store match data
match_data = []

# The try/finally guarantees the browser is closed even when the page fails
# to load or the wait below times out; otherwise the headless Chrome process
# would be left running.
try:
    driver.get(url)

    # Wait for the content to load (wait for match elements to be present)
    wait = WebDriverWait(driver, 10)
    matches = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-match-id]")))

    # Extract data from each match; a failure on one row only skips that row
    for match in matches:
        try:
            home_team = match.find_element(By.CSS_SELECTOR, ".home__team").text
            away_team = match.find_element(By.CSS_SELECTOR, ".away__team").text
            match_date = match.find_element(By.CSS_SELECTOR, ".match__date").text
            # One lookup instead of two: an empty list means no score element
            score_elements = match.find_elements(By.CSS_SELECTOR, ".match__score")
            score = score_elements[0].text if score_elements else "Not played"

            match_info = {
                'date': match_date,
                'home_team': home_team,
                'away_team': away_team,
                'score': score
            }
            match_data.append(match_info)
        except Exception as e:
            print(f"Error extracting match data: {e}")
finally:
    # Close the browser
    driver.quit()

# Convert to DataFrame
df = pd.DataFrame(match_data)
df