In [None]:
## python code to extract odds and statistics for a given match

In [31]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests

options = webdriver.ChromeOptions()
options.set_capability('goog:loggingPrefs', {"performance": "ALL", "browser": "ALL"})
# Adding user agent to look more like a real browser
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)

In [7]:
url = "https://www.sofascore.com/football/match/fulham-aston-villa/PsT#id:12437028"

In [29]:
def get_top_five(lineups):
    home_players = {}
    away_players = {}

    for player in lineups['home']['players']:
        if player['statistics'] and player["substitute"] == False:
            home_players[player['player']['name']] = player['statistics']['rating']
    home_players = dict(sorted(home_players.items(), key=lambda item:item[1]))
    
    for player in lineups['away']['players']:
        if player['statistics'] and player['substitute'] == False:
            away_players[player['player']['name']] = player['statistics']['rating']
    away_players = dict(sorted(home_players.items(), key=lambda item:item[1]))

    print (home_players)
    print (away_players)
    return home_players,away_players



        


In [6]:
def get_game_stats(url,match_id):
    data = get_score(url,match_id,driver)
    stats = data[2]['statistics'][0]['groups'][0]['statisticsItems']
    game_stats = parse_stats(stats)
    
    odds = data[1]
    new_odds = get_odds(match_id)
    
    if odds and odds.get('home') and odds.get('away'):
        try:
            game_stats['hOddsFraction'] = odds['home']['fractionalValue']
            game_stats['aOddsFraction'] = odds['away']['fractionalValue']
        except (KeyError, TypeError):
            pass
    elif new_odds:
        try:
            choices = new_odds['featured']["default"]['choices']
            game_stats['hOddsFraction'] = choices[0]['initialFractionalValue']
            game_stats['aOddsFraction'] = choices[2]['initialFractionalValue']
        except (KeyError, IndexError, TypeError):
            pass  # Fall back to previous/default odds if structure is unexpected
    else:
        game_stats['hOddsFraction'] = "1/2"
        game_stats['aOddsFraction'] = "1/2"
    # If no new odds, try fallback from `odds`
        
    score = data[0]
    game_stats["hScore"] = str(score["event"]["homeScore"]["current"])
    game_stats["aScore"] = str(score["event"]["awayScore"]["current"])
    if game_stats["hScore"] > game_stats["aScore"]:
        game_stats["hWin"] = "1" 
        game_stats["draw"] = "0"
        game_stats["aWin"] = "0"
    elif game_stats["aScore"] > game_stats["hScore"]:
        game_stats["aWin"] = "1"
        game_stats["draw"] = "0"
        game_stats["hWin"] = "0"
    else:
        game_stats["draw"] = "1"
        game_stats["hWin"] = "0"
        game_stats["aWin"] = "0"

    h2h = data[3]
    game_stats['hVsTeamWins'] = str(h2h['teamDuel']['homeWins']) if h2h['teamDuel'] else "0"
    game_stats['aVsTeamWins'] = str(h2h['teamDuel']['awayWins']) if h2h['teamDuel'] else "0"
    game_stats['aVsDraws'] = str(h2h['teamDuel']['draws']) if h2h['teamDuel'] else "0"

    game_stats['hVsManWins'] = str(h2h['managerDuel']['homeWins']) if h2h['managerDuel'] else "0"
    game_stats['aVsManWins'] = str(h2h['managerDuel']['awayWins']) if h2h['managerDuel'] else "0"
    game_stats['aVsDraws'] = str(h2h['managerDuel']['draws']) if h2h['managerDuel'] else "0"


    pregame_form = data[4]
    game_stats['hTeamFormRating'] = pregame_form['homeTeam']['avgRating']
    game_stats['aTeamFormRating'] = pregame_form['awayTeam']['avgRating']

    game_stats['hTeamCurrentPosition'] = pregame_form['homeTeam']['position']
    game_stats['aTeamCurrentPosition'] = pregame_form['awayTeam']['position']

    game_stats['hTeamValue'] = pregame_form['homeTeam']['value']
    game_stats['aTeamValue'] = pregame_form['awayTeam']['value']
    
    lineups = data[5]
    game_stats['hFormation'] = lineups['home']['formation']
    game_stats['aFomation'] = lineups['away']['formation']
    home_players,away_players = get_top_five(lineups)

    for i in range (1,11):
        game_stats[f'h{i}playerName'] = list(home_players.keys())[i]
        game_stats[f'h{i}playerName'] = home_players[list(home_players.keys())[i]]

        game_stats[f'a{i}playerName'] = list(away_players.keys())[i]
        game_stats[f'a{i}playerName'] = away_players[list(home_players.keys())[i]]
        
    
    managers = data[6]
    game_stats['hManagerName'] = managers['homeManager']['name']
    game_stats['aManagerName'] = managers['awayManager']['name']
    
    match_details = data[7]
    game_stats["dateTime"] = match_details["startDate"]
    game_stats["hTeamName"] = match_details["homeTeam"]["name"]
    game_stats["aTeamName"] = match_details["awayTeam"]["name"]
    
    return game_stats

In [10]:
def get_score(url,match_id, driver):
    # Clear existing logs
    #driver.get("about:blank")
    #logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    """
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    """
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(0.1)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.2)

    soup = BeautifulSoup (driver.page_source, "html.parser")

    meta_data = soup.find_all('script', type = 'application/ld+json')

    meta_data = json.loads(meta_data[1].string)
    
    # Direct API approach - try to call the API directly
    
    if match_id:
        print(f"Extracted match ID: {match_id}")
        urls = []
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/provider/1/winning-odds")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/statistics")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/h2h")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/pregame-form")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/lineups")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/managers")
        
        data = {}
        for i in range(0,len(urls)):
            api_url = urls[i]
            # Open the API URL directly
            driver.execute_script(f"window.open('{api_url}', '_blank');")
            #time.sleep(2)
            
            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])
            #time.sleep(1)
            
            # Get the page source which should contain the JSON
            page_source = driver.page_source
            
            # Extract JSON from the page source
            if "application/json" in page_source or "{" in page_source:
                start_idx = page_source.find("{")
                end_idx = page_source.rfind("}") + 1
                if start_idx >= 0 and end_idx > start_idx:
                    json_str = page_source[start_idx:end_idx]
                    try:
                        data[i] = json.loads(json_str)
                    except json.JSONDecodeError:
                        print("Failed to parse JSON response")
            
            # Close the tab and switch back
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        data[len(urls)] = meta_data
        return data


In [32]:
#data = get_score ("https://www.sofascore.com/football/match/fulham-manchester-united/KsT#id:12436899",12436899,driver)
print(get_game_stats ("https://www.sofascore.com/football/match/brentford-everton/Ysab#id:12436968",12436968))

Loading URL: https://www.sofascore.com/football/match/brentford-everton/Ysab#id:12436968
Extracted match ID: 12436968
{'Bryan Mbeumo': 6.4, 'Keane Lewis-Potter': 6.5, 'Ethan Pinnock': 6.8, 'Nathan Collins': 6.9, 'Kristoffer Ajer': 7.1, 'Yehor Yarmolyuk': 7.1, 'Kevin Schade': 7.2, 'Vitaly Janelt': 7.3, 'Mark Flekken': 7.5, 'Mikkel Damsgaard': 7.5, 'Yoane Wissa': 7.7}
{'Bryan Mbeumo': 6.4, 'Keane Lewis-Potter': 6.5, 'Ethan Pinnock': 6.8, 'Nathan Collins': 6.9, 'Kristoffer Ajer': 7.1, 'Yehor Yarmolyuk': 7.1, 'Kevin Schade': 7.2, 'Vitaly Janelt': 7.3, 'Mark Flekken': 7.5, 'Mikkel Damsgaard': 7.5, 'Yoane Wissa': 7.7}
{'hExpected goals': '1.48', 'aExpected goals': '1.37', 'hBig chances': '2', 'aBig chances': '3', 'hBall possession': '52%', 'aBall possession': '48%', 'hFouls': '3', 'aFouls': '6', 'hTackles': '13', 'aTackles': '15', 'hTotal shots': '12', 'aTotal shots': '14', 'hFree kicks': '6', 'aFree kicks': '3', 'hPasses': '458', 'aPasses': '425', 'hYellow cards': '2', 'aYellow cards': '1',

In [12]:
def parse_stats(stats):
    headers = {"Ball possession","Expected goals","Total shots","Goalkeeper saves","Corner kicks","Fouls","Passes","Tackles","Free kicks","Yellow cards", "Red cards", "Big chances"}
    home = "h"
    away = "a"
    game_stats = {}
    for header in headers:
        game_stats[f"{home}{header}"] = "0"
        game_stats[f"{away}{header}"] = "0"
    for stat in stats:
        if stat["name"] in headers:
            game_stats[home + stat["name"]] = stat["home"]
            game_stats[away + stat["name"]] = stat["away"]   
    return game_stats

In [13]:
def get_odds(match_id):
    driver.get(f"https://www.sofascore.com/api/v1/event/{match_id}/odds/1/featured")
    page_source = driver.page_source
    if "application/json" in page_source or "{" in page_source:
                start_idx = page_source.find("{")
                end_idx = page_source.rfind("}") + 1
                if start_idx >= 0 and end_idx > start_idx:
                    json_str = page_source[start_idx:end_idx]
                    try:
                        return json.loads(json_str)
                    except json.JSONDecodeError:
                        return none

In [None]:
#score = get_score("https://www.sofascore.com/football/match/aston-villa-leicester-city/GP#id:12436918",driver)
print (score["event"]["homeScore"]["current"])
print (score["event"]["awayScore"]["current"])

In [105]:

## create headers 
game_stats = get_game_stats(url,12437028)
print (",".join(game_stats.keys()))
with open ("version3.csv","w") as f:
    f.write (",".join(game_stats.keys()) + "\n")
f.close()

Loading URL: https://www.sofascore.com/football/match/fulham-aston-villa/PsT#id:12437028
Extracted match ID: 12437028
hTotal shots,aTotal shots,hFouls,aFouls,hExpected goals,aExpected goals,hFree kicks,aFree kicks,hTackles,aTackles,hYellow cards,aYellow cards,hPasses,aPasses,hBig chances,aBig chances,hRed cards,aRed cards,hGoalkeeper saves,aGoalkeeper saves,hBall possession,aBall possession,hCorner kicks,aCorner kicks,hOddsFraction,aOddsFraction,hScore,aScore,aWin,draw,hWin,hVsTeamWins,aVsTeamWins,aVsDraws,hVsManWins,aVsManWins,dateTime,hTeamName,aTeamName


In [99]:
file = open("../data_files/match_links_7_4.txt", "r")
for link in file:
    print (link)
    link = link.split(",")
    
    url = link[0]
    match_id = link[1].strip()
    full_link  = url + ",tab;statistics"
    game_stats = get_game_stats(full_link,match_id)

    with open("../version3.csv", "a") as f:
        f.write (",".join(game_stats.values()) + "\n")

f.close()

https://www.sofascore.com/football/match/everton-liverpool/UsY#id:12436593,12436593

Loading URL: https://www.sofascore.com/football/match/everton-liverpool/UsY#id:12436593,tab;statistics
Extracted match ID: 12436593
https://www.sofascore.com/football/match/newcastle-united-wolverhampton/dsO#id:12436529,12436529

Loading URL: https://www.sofascore.com/football/match/newcastle-united-wolverhampton/dsO#id:12436529,tab;statistics
Extracted match ID: 12436529
https://www.sofascore.com/football/match/bournemouth-everton/Yskb#id:12436912,12436912

Loading URL: https://www.sofascore.com/football/match/bournemouth-everton/Yskb#id:12436912,tab;statistics
Extracted match ID: 12436912
https://www.sofascore.com/football/match/liverpool-wolverhampton/dsU#id:12437002,12437002

Loading URL: https://www.sofascore.com/football/match/liverpool-wolverhampton/dsU#id:12437002,tab;statistics
Extracted match ID: 12437002
https://www.sofascore.com/football/match/brighton-and-hove-albion-nottingham-forest/osF#

In [88]:
print (get_game_stats("https://www.sofascore.com/football/match/ipswich-town-brighton-and-hove-albion/FsH#id:12436936,tab;statistics",12436936))

Loading URL: https://www.sofascore.com/football/match/ipswich-town-brighton-and-hove-albion/FsH#id:12436936,tab;statistics
Extracted match ID: 12436936
{'hTotal shots': '21', 'aTotal shots': '6', 'hFouls': '14', 'aFouls': '16', 'hExpected goals': '1.66', 'aExpected goals': '0.35', 'hFree kicks': '16', 'aFree kicks': '14', 'hTackles': '19', 'aTackles': '25', 'hYellow cards': '4', 'aYellow cards': '3', 'hPasses': '570', 'aPasses': '261', 'hBig chances': '3', 'aBig chances': '1', 'hRed cards': '0', 'aRed cards': '0', 'hGoalkeeper saves': '1', 'aGoalkeeper saves': '6', 'hBall possession': '69%', 'aBall possession': '31%', 'hCorner kicks': '9', 'aCorner kicks': '2', 'hOddsFraction': '19/50', 'aOddsFraction': '7/1', 'hScore': '0', 'aScore': '0', 'draw': '1', 'hVsTeamWins': '3', 'aVsTeamWins': '4', 'aVsDraws': '1', 'hVsManWins': '1', 'aVsManWins': '0', 'dateTime': '2024-09-14T14:00:00.000Z', 'hTeamName': 'Brighton &amp; Hove Albion', 'aTeamName': 'Ipswich Town'}
