<a href="https://colab.research.google.com/github/Carlscamt/sofascore-selenium-scraper/blob/main/Sofascore_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install necessary packages
!pip install selenium webdriver-manager
!wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add -
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list
!apt-get update
!apt-get install google-chrome-stable -y

OK
Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://dl.google.com/linux/chrome/deb stable InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Target Packages (main/binary-amd64/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt

In [5]:
import pandas as pd
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from datetime import datetime, timedelta
from selenium.common.exceptions import WebDriverException # Import specific exception

# date_range is already calculated in a previous cell
# TARGET_DATE will be set within the loop

# Set Chrome in headless mode
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Specify the path to the Chrome binary explicitly for Colab environments
chrome_options.binary_location = '/usr/bin/google-chrome'

# Function to initialize the WebDriver
def initialize_driver():
    try:
        service = Service(ChromeDriverManager().install())
        driver_instance = webdriver.Chrome(service=service, options=chrome_options)
        return driver_instance
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        return None

driver = initialize_driver()
if not driver:
    raise Exception("Failed to initialize WebDriver. Cannot proceed.")

print("üöÄ Starting...")

# Initialize a list to accumulate results from all days
all_results = []

# Define date_range for the last 7 days (including today)
current_date = datetime.now().date()
date_range = []
for i in range(7): # Changed to 1 day for testing
    date_range.append((current_date - timedelta(days=i)).strftime('%Y-%m-%d'))

for current_date in date_range:
    TARGET_DATE = current_date
    print(f"\n--- Scraping data for {TARGET_DATE} ---")

    # 1. GET MATCHES
    # Add retry logic for driver.get()
    max_retries = 3
    for attempt in range(max_retries):
        try:
            url_matches = f"https://www.sofascore.com/api/v1/sport/football/scheduled-events/{TARGET_DATE}"
            driver.get(url_matches)
            time.sleep(3)
            # Corrected By.TAG_TAG_NAME to By.TAG_NAME
            json_text = driver.find_element(By.TAG_NAME, "pre").text
            data = json.loads(json_text)
            events = data.get('events', [])
            print(f"‚úÖ Found {len(events)} matches for {TARGET_DATE}")
            break # If successful, break retry loop
        except (WebDriverException, ConnectionError, json.JSONDecodeError) as e:
            print(f"‚ö†Ô∏è Warning: Error getting or parsing matches for {TARGET_DATE} on attempt {attempt+1}/{max_retries}: {e}")
            if driver:
                driver.quit() # Quit the problematic driver
            driver = initialize_driver() # Reinitialize driver
            if not driver: # If reinitialization fails, print error and break from retries
                print(f"‚ùå Failed to reinitialize driver after error. Skipping {TARGET_DATE}.")
                events = []
                break # Break retry loop, effectively skipping this date
            time.sleep(5) # Wait before retrying
        except Exception as e: # Catch other unexpected errors
            print(f"‚ùå Unexpected error while getting matches for {TARGET_DATE}: {e}")
            events = []
            break # Do not retry for unexpected errors
    else: # This block executes if the loop finishes without a 'break' (i.e., all retries failed)
        print(f"‚ùå Failed to get matches for {TARGET_DATE} after {max_retries} attempts. Skipping this date.")
        events = []
        continue # Skip to the next date

    # If events are empty after retries, continue to next date
    if not events:
        continue

    # 2. Process each match
    for i, event in enumerate(events[:10]):  # Limits to 10 matches per day for testing
        event_id = event['id']

        row = {
            'id': event_id,
            'date': TARGET_DATE,
            'league': event['tournament']['name'],
            'home': event['homeTeam']['name'],
            'away': event['awayTeam']['name'],
            'status': event['status']['description'],
            'score_home': event.get('homeScore', {}).get('current'),
            'score_away': event.get('awayScore', {}).get('current'),
            'odd_1': None,
            'odd_X': None,
            'odd_2': None,
            'home_avg_rating': None,
            'home_position': None,
            'home_form': None,
            'away_avg_rating': None,
            'away_position': None,
            'away_form': None,
            'h2h_home_wins': None,
            'h2h_away_wins': None,
            'h2h_draws': None,
            'lineups': [] # Initialize an empty list for lineups
        }

        print(f"   [{i+1}/{min(10, len(events))}] {row['home']} vs {row['away']} ({TARGET_DATE})")

        # Get ODDS
        try:
            url_odds = f"https://www.sofascore.com/api/v1/event/{event_id}/odds/1/all"
            driver.get(url_odds)
            time.sleep(1)
            odds_text = driver.find_element(By.TAG_NAME, "pre").text
            odds_data = json.loads(odds_text)

            # Search Odd Maket "Full time"
            for market in odds_data.get('markets', []):
                if market.get('marketName') == 'Full time':
                    for choice in market.get('choices', []):
                        frac = choice.get('fractionalValue', '')
                        if '/' in str(frac):
                            num, den = map(int, frac.split('/'))
                            decimal_odd = round(1 + (num / den), 2)
                        else:
                            decimal_odd = float(frac) if frac else None

                        if choice['name'] == '1':
                            row['odd_1'] = decimal_odd
                        elif choice['name'] == 'X':
                            row['odd_X'] = decimal_odd
                        elif choice['name'] == '2':
                            row['odd_2'] = decimal_odd
                    break
        except Exception as e:
            print(f"      ‚ùå Error getting odds for {event_id}: {e}")
            pass # Continue processing other data points even if odds fail

        # Get Pregame Form Data
        try:
            url_pregame_form = f"https://www.sofascore.com/api/v1/event/{event_id}/pregame-form"
            driver.get(url_pregame_form)
            time.sleep(1)
            pregame_form_text = driver.find_element(By.TAG_NAME, "pre").text
            pregame_form_data = json.loads(pregame_form_text)

            home_team_form = pregame_form_data.get('homeTeam', {})
            away_team_form = pregame_form_data.get('awayTeam', {})

            # Convert avgRating to float directly
            row['home_avg_rating'] = float(home_team_form.get('avgRating')) if home_team_form.get('avgRating') else None
            row['home_position'] = home_team_form.get('position')
            row['home_form'] = ','.join(home_team_form.get('form', []))

            row['away_avg_rating'] = float(away_team_form.get('avgRating')) if away_team_form.get('avgRating') else None
            row['away_position'] = away_team_form.get('position')
            row['away_form'] = ','.join(away_team_form.get('form', []))

        except Exception as e:
            print(f"      ‚ùå Error getting pregame form for {event_id}: {e}")
            pass

        # Get Head-to-Head (H2H) Data
        try:
            url_h2h = f"https://www.sofascore.com/api/v1/event/{event_id}/h2h"
            driver.get(url_h2h)
            time.sleep(1)
            h2h_text = driver.find_element(By.TAG_NAME, "pre").text
            h2h_data = json.loads(h2h_text)

            # Correctly access 'teamDuel' which contains H2H stats, handling None
            team_duel = h2h_data.get('teamDuel')
            if team_duel:
                row['h2h_home_wins'] = team_duel.get('homeWins')
                row['h2h_away_wins'] = team_duel.get('awayWins')
                row['h2h_draws'] = team_duel.get('draws')

        except Exception as e:
            print(f"      ‚ùå Error getting H2H data for {event_id}: {e}")
            pass

        # Get Lineups Data
        try:
            url_lineups = f"https://www.sofascore.com/api/v1/event/{event_id}/lineups"
            driver.get(url_lineups)
            time.sleep(1)
            lineups_text = driver.find_element(By.TAG_NAME, "pre").text
            lineups_data = json.loads(lineups_text)

            home_players = lineups_data.get('home', {}).get('players', [])
            away_players = lineups_data.get('away', {}).get('players', [])

            for player_data in home_players:
                player_info = {
                    'name': player_data['player']['name'],
                    'player_id': player_data['player']['id'],
                    'position': player_data['position'],
                }
                row['lineups'].append(player_info)

            for player_data in away_players:
                player_info = {
                    'name': player_data['player']['name'],
                    'player_id': player_data['player']['id'],
                    'position': player_data['position'],
                }
                row['lineups'].append(player_info)

        except Exception as e:
            print(f"      ‚ùå Error getting lineups data for {event_id}: {e}")
            pass

        all_results.append(row)
        time.sleep(1)  # Delay between requests

if driver:
    driver.quit()

# 3. Save CSV With the collected data
df = pd.DataFrame(all_results)
output_file = "sofascore_selenium_last_7_days.csv"
df.to_csv(output_file, index=False)

print(f"\n‚úÖ EXPORTED: {output_file}")
print(df.head())

üöÄ Starting...

--- Scraping data for 2025-12-07 ---
‚úÖ Found 551 matches for 2025-12-07
   [1/10] Aston Villa vs Arsenal (2025-12-07)
   [2/10] Bournemouth vs Chelsea (2025-12-07)
   [3/10] Everton vs Nottingham Forest (2025-12-07)
   [4/10] Manchester City vs Sunderland (2025-12-07)
   [5/10] Newcastle United vs Burnley (2025-12-07)
   [6/10] Tottenham Hotspur vs Brentford (2025-12-07)
   [7/10] Leeds United vs Liverpool (2025-12-07)
   [8/10] Brighton & Hove Albion vs West Ham United (2025-12-07)
   [9/10] Fulham vs Crystal Palace (2025-12-07)
   [10/10] Sassuolo vs Fiorentina (2025-12-07)

--- Scraping data for 2025-12-06 ---
‚úÖ Found 447 matches for 2025-12-06
   [1/10] Aston Villa vs Arsenal (2025-12-06)
   [2/10] Bournemouth vs Chelsea (2025-12-06)
   [3/10] Everton vs Nottingham Forest (2025-12-06)
   [4/10] Manchester City vs Sunderland (2025-12-06)
   [5/10] Newcastle United vs Burnley (2025-12-06)
   [6/10] Tottenham Hotspur vs Brentford (2025-12-06)
   [7/10] Leeds Unit