In [None]:
## python code to extract odds and statistics for a given match

In [19]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.set_capability('goog:loggingPrefs', {"performance": "ALL", "browser": "ALL"})
# Adding user agent to look more like a real browser
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)

In [50]:
def get_stats(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        api_url = f"https://www.sofascore.com/api/v1/event/{match_id}/statistics"
        
        # Open the API URL directly
        driver.execute_script(f"window.open('{api_url}', '_blank');")
        time.sleep(2)
        
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        
        # Get the page source which should contain the JSON
        page_source = driver.page_source
        
        # Extract JSON from the page source
        if "application/json" in page_source or "{" in page_source:
            start_idx = page_source.find("{")
            end_idx = page_source.rfind("}") + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = page_source[start_idx:end_idx]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    print("Failed to parse JSON response")
        
        # Close the tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
    # Look for the API call in the logs
    request_ids = []
    for log in logs:
        if log["method"] == "Network.requestWillBeSent":
            request_url = log["params"].get("request", {}).get("url", "")
            if name in request_url:
                print(f"✅ Found API request: {request_url}")
                request_ids.append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    for request_id in request_ids:
        try:
            response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            print("✅ Response Body Retrieved")
            return json.loads(response['body'])
        except Exception as e:
            print(f"❌ Error with request ID {request_id}: {str(e)}")
    
    print("❌ Failed to retrieve data with all request IDs")
    return None

In [4]:
url = "https://www.sofascore.com/football/match/arsenal-manchester-united/KR#id:12437005,tab:statistics"

In [51]:
def get_odds(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        api_url = f"https://www.sofascore.com/api/v1/event/{match_id}/odds/1/featured"
        
        # Open the API URL directly
        driver.execute_script(f"window.open('{api_url}', '_blank');")
        time.sleep(2)
        
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        
        # Get the page source which should contain the JSON
        page_source = driver.page_source
        
        # Extract JSON from the page source
        if "application/json" in page_source or "{" in page_source:
            start_idx = page_source.find("{")
            end_idx = page_source.rfind("}") + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = page_source[start_idx:end_idx]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    print("Failed to parse JSON response")
        
        # Close the tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
    # Look for the API call in the logs
    request_ids = []
    for log in logs:
        if log["method"] == "Network.requestWillBeSent":
            request_url = log["params"].get("request", {}).get("url", "")
            if name in request_url:
                print(f"✅ Found API request: {request_url}")
                request_ids.append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    for request_id in request_ids:
        try:
            response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            print("✅ Response Body Retrieved")
            return json.loads(response['body'])
        except Exception as e:
            print(f"❌ Error with request ID {request_id}: {str(e)}")
    
    print("❌ Failed to retrieve data with all request IDs")
    return None

In [5]:
def get_score(url, driver):
    # Clear existing logs
    driver.get("about:blank")
    logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    
    # Get network logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr['message'])['message'] for lr in logs_raw]
    
    # Direct API approach - try to call the API directly
    
    match_id = url.split("#id:")[1].split(",")[0] if "#id:" in url else None
    if match_id:
        print(f"Extracted match ID: {match_id}")
        urls = []
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/odds/1/featured")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/statistics")
        data = {}
        
        for i in range(0,len(urls)):
            print (i)
            api_url = urls[i]
            # Open the API URL directly
            driver.execute_script(f"window.open('{api_url}', '_blank');")
            time.sleep(2)
            
            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(1)
            
            # Get the page source which should contain the JSON
            page_source = driver.page_source
            
            # Extract JSON from the page source
            if "application/json" in page_source or "{" in page_source:
                start_idx = page_source.find("{")
                end_idx = page_source.rfind("}") + 1
                if start_idx >= 0 and end_idx > start_idx:
                    json_str = page_source[start_idx:end_idx]
                    try:
                        data[i] = json.loads(json_str)
                    except json.JSONDecodeError:
                        print("Failed to parse JSON response")
            
            # Close the tab and switch back
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        
        return data
        """
            # Look for the API call in the logs
            request_ids = {}
            for log in logs:
                if log["method"] == "Network.requestWillBeSent":
                    request_url = log["params"].get("request", {}).get("url", "")
                    if name in request_url:
                        print(f"✅ Found API request: {request_url}")
                        request_ids[i].append(log["params"]["requestId"])
    
    if not request_ids:
        print("❌ No valid request ID found for:", name)
        return None
    
    # Try each request ID
    
    for i in request_ids:
        for request_id in request_ids[i]:
            try:
                response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
                print("✅ Response Body Retrieved")
                data[i] = json.loads(response['body'])
            except Exception as e:
                print(f"❌ Error with request ID {request_id}: {str(e)}")
    if data:
        return data
    else:
        print("❌ Failed to retrieve data with all request IDs")
        return None 
    return None
        """

In [14]:
def parse_stats(stats):
    headers = {"Ball possession","Expected goals","Total shots","Goalkeeper saves","Corner kicks","Fouls","Passes","Tackles","Free kicks","Yellow cards", "Red cards", "Big chances"} ## need to add header name and handling
    game_stats = {}
    home = "h"
    away = "a"
    for stat in stats:
        if stat["name"] in headers:
            game_stats[home + stat["name"]] = stat["home"]
            game_stats[away + stat["name"]] = stat["away"]   
    for header in headers: 
        if home+header not in game_stats:
            game_stats[home+header] = "0"
        if away+header not in game_stats:
            game_stats[away+header] = "0"
    return game_stats

In [7]:
def get_team_name (url):
    TEAM_NAMES = {
        "brentford", "liverpool", "nottingham-forest", "wolverhampton", "west-ham-united",
        "manchester-city", "chelsea", "bournemouth", "newcastle-united", "southampton", 
        "everton", "manchester-united", "leicester-city", "tottenham-hotspur", "arsenal",
        "ipswich-town", "fulham", "brighton-and-hove-albion", "crystal-palace", "aston-villa"
    }
    parts = url.split("/")
    if "match" in parts:
        match_index = parts.index("match")
        teams = parts[match_index + 1]
        teams = teams.split("-")

        # get team name
        home_team = None
        away_team = None
        for word in teams:
            if home_team not in TEAM_NAMES:
                home_team = word if not home_team else home_team + "-" + word
            else:
                away_team = word if not away_team else away_team + "-" + word

        return home_team, away_team
    else:
        return "poo", "willy"

In [8]:
def get_game_stats(url):
    data = get_score(url,driver)
    stats = data[2]['statistics'][0]['groups'][0]['statisticsItems']
    game_stats = parse_stats(stats)
    
    odds = data[1]
    game_stats['hOdds'] = (odds['featured']['default']['choices'][0]['initialFractionalValue'])
    game_stats['hTeam'],game_stats['aTeam'] = get_team_name (url)

    score = data[0]
    game_stats["hScore"] = str(score["event"]["homeScore"]["current"])
    game_stats["aScore"] = str(score["event"]["awayScore"]["current"])
    game_stats["hWin"] = "1" if game_stats["hScore"] > game_stats["aScore"] else "0"
    
    return game_stats


In [55]:
#score = get_score("https://www.sofascore.com/football/match/aston-villa-leicester-city/GP#id:12436918",driver)
print (score["event"]["homeScore"]["current"])
print (score["event"]["awayScore"]["current"])

1
2


In [11]:

## create headers 
game_stats = get_game_stats(url)
print (game_stats)
with open ("test.csv","w") as f:
    f.write (",".join(game_stats.keys()) + "\n")
f.close()

Loading URL: https://www.sofascore.com/football/match/arsenal-manchester-united/KR#id:12437005,tab:statistics
Extracted match ID: 12437005
0
1
2
{'hFouls': '0', 'aFouls': '0', 'hCorner kicks': '0', 'aCorner kicks': '0', 'hBig chances': '0', 'aBig chances': '0', 'hBall possession': '0', 'aBall possession': '0', 'hRed cards': '0', 'aRed cards': '0', 'hTotal shots': '0', 'aTotal shots': '0', 'hTackles': '0', 'aTackles': '0', 'hPasses': '0', 'aPasses': '0', 'hExpected goals': '0', 'aExpected goals': '0', 'hYellow cards': '0', 'aYellow cards': '0', 'hFree kicks': '0', 'aFree kicks': '0', 'hGoalkeeper saves': '0', 'aGoalkeeper saves': '0', 'hOdds': '7/2', 'hTeam': 'arsenal', 'aTeam': 'manchester-united', 'hScore': '1', 'aScore': '1', 'hWin': '0'}


In [20]:
file = open("match_links.txt", "r")
count = 0
for link in file:
    full_link  = link.strip() + ",tab;statistics"
    game_stats = get_game_stats(full_link)

    with open("test.csv", "a") as f:
        f.write (",".join(game_stats.values()) + "\n")
    count += 1

f.close()

Loading URL: https://www.sofascore.com/football/match/bournemouth-newcastle-united/Okb#id:12436885,tab;statistics
Extracted match ID: 12436885
0
1
2
Loading URL: https://www.sofascore.com/football/match/southampton-chelsea/NV#id:12436984,tab;statistics
Extracted match ID: 12436984
0
1
2
Loading URL: https://www.sofascore.com/football/match/everton-nottingham-forest/osY#id:12436517,tab;statistics
Extracted match ID: 12436517
0
1
2
Loading URL: https://www.sofascore.com/football/match/manchester-united-leicester-city/GK#id:12436494,tab;statistics
Extracted match ID: 12436494
0
1
2
Loading URL: https://www.sofascore.com/football/match/bournemouth-tottenham-hotspur/Iskb#id:12437009,tab;statistics
Extracted match ID: 12437009
0
1
2
Loading URL: https://www.sofascore.com/football/match/bournemouth-arsenal/Rkb#id:12437026,tab;statistics
Extracted match ID: 12437026
0
1
2
Loading URL: https://www.sofascore.com/football/match/aston-villa-ipswich-town/HsP#id:12436993,tab;statistics
Extracted mat

KeyError: 'statistics'