In [None]:
## python code to extract odds and statistics for a given match

In [29]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.set_capability('goog:loggingPrefs', {"performance": "ALL", "browser": "ALL"})
# Adding user agent to look more like a real browser
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)

In [30]:
def get_stats(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        api_url = f"https://www.sofascore.com/api/v1/event/{match_id}/statistics"
        
        # Open the API URL directly
        driver.execute_script(f"window.open('{api_url}', '_blank');")
        time.sleep(2)
        
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        
        # Get the page source which should contain the JSON
        page_source = driver.page_source
        
        # Extract JSON from the page source
        if "application/json" in page_source or "{" in page_source:
            start_idx = page_source.find("{")
            end_idx = page_source.rfind("}") + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = page_source[start_idx:end_idx]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    print("Failed to parse JSON response")
        
        # Close the tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
    # Look for the API call in the logs
    request_ids = []
    for log in logs:
        if log["method"] == "Network.requestWillBeSent":
            request_url = log["params"].get("request", {}).get("url", "")
            if name in request_url:
                print(f"✅ Found API request: {request_url}")
                request_ids.append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    for request_id in request_ids:
        try:
            response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            print("✅ Response Body Retrieved")
            return json.loads(response['body'])
        except Exception as e:
            print(f"❌ Error with request ID {request_id}: {str(e)}")
    
    print("❌ Failed to retrieve data with all request IDs")
    return None

In [31]:
url = "https://www.sofascore.com/football/match/arsenal-manchester-united/KR#id:12437005,tab:statistics"

In [32]:
def get_odds(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        api_url = f"https://www.sofascore.com/api/v1/event/{match_id}/odds/1/featured"
        
        # Open the API URL directly
        driver.execute_script(f"window.open('{api_url}', '_blank');")
        time.sleep(2)
        
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        
        # Get the page source which should contain the JSON
        page_source = driver.page_source
        
        # Extract JSON from the page source
        if "application/json" in page_source or "{" in page_source:
            start_idx = page_source.find("{")
            end_idx = page_source.rfind("}") + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = page_source[start_idx:end_idx]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    print("Failed to parse JSON response")
        
        # Close the tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
    # Look for the API call in the logs
    request_ids = []
    for log in logs:
        if log["method"] == "Network.requestWillBeSent":
            request_url = log["params"].get("request", {}).get("url", "")
            if name in request_url:
                print(f"✅ Found API request: {request_url}")
                request_ids.append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    for request_id in request_ids:
        try:
            response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            print("✅ Response Body Retrieved")
            return json.loads(response['body'])
        except Exception as e:
            print(f"❌ Error with request ID {request_id}: {str(e)}")
    
    print("❌ Failed to retrieve data with all request IDs")
    return None

In [32]:
def get_score(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        api_url = f"https://www.sofascore.com/api/v1/event/{match_id}"
        
        # Open the API URL directly
        driver.execute_script(f"window.open('{api_url}', '_blank');")
        time.sleep(2)
        
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        
        # Get the page source which should contain the JSON
        page_source = driver.page_source
        
        # Extract JSON from the page source
        if "application/json" in page_source or "{" in page_source:
            start_idx = page_source.find("{")
            end_idx = page_source.rfind("}") + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = page_source[start_idx:end_idx]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    print("Failed to parse JSON response")
        
        # Close the tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
    # Look for the API call in the logs
    request_ids = []
    for log in logs:
        if log["method"] == "Network.requestWillBeSent":
            request_url = log["params"].get("request", {}).get("url", "")
            if name in request_url:
                print(f"✅ Found API request: {request_url}")
                request_ids.append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    for request_id in request_ids:
        try:
            response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            print("✅ Response Body Retrieved")
            return json.loads(response['body'])
        except Exception as e:
            print(f"❌ Error with request ID {request_id}: {str(e)}")
    
    print("❌ Failed to retrieve data with all request IDs")
    return None

In [46]:
def parse_stats(stats):
    headers = []
    game_stats = {}
    home = "h"
    away = "a"
    for stat in stats:
        game_stats[home + stat["name"]] = stat["home"]
        game_stats[away + stat["name"]] = stat["away"]
        
    if "aRed Cards" not in game_stats:
        game_stats["aRed Cards"] = '0'
    if "hRed Cards" not in game_stats:
        game_stats["hRed Cards"] = '0'
    return game_stats

In [45]:
def get_team_name (url):
    TEAM_NAMES = {
        "brentford", "liverpool", "nottingham-forest", "wolverhampton", "west-ham-united",
        "manchester-city", "chelsea", "bournemouth", "newcastle-united", "southampton", 
        "everton", "manchester-united", "leicester-city", "tottenham-hotspur", "arsenal",
        "ipswich-town", "fulham", "brighton-and-hove-albion", "crystal-palace", "aston-villa"
    }
    parts = url.split("/")
    if "match" in parts:
        match_index = parts.index("match")
        teams = parts[match_index + 1]
        teams = teams.split("-")

        # get team name
        home_team = None
        away_team = None
        for word in teams:
            if home_team not in TEAM_NAMES:
                home_team = word if not home_team else home_team + "-" + word
            else:
                away_team = word if not away_team else away_team + "-" + word

        return home_team, away_team
    else:
        return "poo", "willy"

In [44]:
def get_game_stats(url):
    stats = get_stats (url,driver)['statistics'][0]['groups'][0]['statisticsItems']
    game_stats = parse_stats(stats)
    
    odds = get_odds (url,driver)
    game_stats['hOdds'] = (odds['featured']['default']['choices'][0]['initialFractionalValue'])
    game_stats['hTeam'],game_stats['aTeam'] = get_team_name (url)
    
    return game_stats


In [27]:
print (get_team_name("https://www.sofascore.com/football/match/aston-villa-leicester-city/GP#id:12436918"))

('aston-villa', 'leicester-city')


In [43]:
game_stats = get_game_stats(url)

with open ("test.csv","w") as f:
    f.write (",".join(game_stats.values()) + "\n")
f.close()

Loading URL: https://www.sofascore.com/football/match/arsenal-manchester-united/KR#id:12437005,tab:statistics


KeyboardInterrupt: 

In [42]:
file = open("match_links.txt", "r")
count = 0
for link in file:
    full_link  = link.strip() + ",tab;statistics"
    game_stats = get_game_stats(full_link)

    with open("test.csv", "a") as f:
        f.write (",".join(game_stats.values()) + "\n")
    count += 1

    if count == 5:
        break
f.close()

Loading URL: https://www.sofascore.com/football/match/brentford-liverpool/Usab#id:12436440,tab;statistics
Extracted match ID: 12436440
Loading URL: https://www.sofascore.com/football/match/brentford-liverpool/Usab#id:12436440,tab;statistics
Extracted match ID: 12436440
Loading URL: https://www.sofascore.com/football/match/liverpool-nottingham-forest/osU#id:12436526,tab;statistics
Extracted match ID: 12436526
Loading URL: https://www.sofascore.com/football/match/liverpool-nottingham-forest/osU#id:12436526,tab;statistics
Extracted match ID: 12436526
Loading URL: https://www.sofascore.com/football/match/nottingham-forest-wolverhampton/dso#id:12436926,tab;statistics
Extracted match ID: 12436926
Loading URL: https://www.sofascore.com/football/match/nottingham-forest-wolverhampton/dso#id:12436926,tab;statistics
Extracted match ID: 12436926
Loading URL: https://www.sofascore.com/football/match/aston-villa-tottenham-hotspur/IP#id:12437059,tab;statistics
Extracted match ID: 12437059
Loading URL