In [None]:
## python code to extract odds and statistics for a given match

In [70]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests

options = webdriver.ChromeOptions()
options.set_capability('goog:loggingPrefs', {"performance": "ALL", "browser": "ALL"})
# Adding user agent to look more like a real browser
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)

In [60]:
url = "https://www.sofascore.com/football/match/fulham-aston-villa/PsT#id:12437028"

In [58]:
def get_top_five(lineups):
    home_players = {}
    away_players = {}

    for player in lineups['home']['players']:
        
        if player['statistics'] and player["substitute"] == False:
            if 'rating' in player['statistics']:
                home_players[player['player']['name']] = player['statistics']['rating']
            else:
                home_players[player['player']['name']] = 4
    home_players = dict(sorted(home_players.items(), key=lambda item:item[1], reverse = True))
    
    for player in lineups['away']['players']:
        if 'rating' in player['statistics']:
            if player['statistics'] and player['substitute'] == False:
                away_players[player['player']['name']] = player['statistics']['rating']
    away_players = dict(sorted(away_players.items(), key=lambda item:item[1], reverse = True))

    
    return home_players,away_players



        


In [31]:
def get_game_stats(url,match_id):
    data = get_score(url,match_id,driver)
    stats = data[2]['statistics'][0]['groups'][0]['statisticsItems']
    game_stats = parse_stats(stats)
    
    odds = data[1]
    new_odds = get_odds(match_id)
    
    if odds and odds.get('home') and odds.get('away'):
        try:
            game_stats['hOddsFraction'] = odds['home']['fractionalValue']
            game_stats['aOddsFraction'] = odds['away']['fractionalValue']
        except (KeyError, TypeError):
            pass
    elif new_odds:
        try:
            choices = new_odds['featured']["default"]['choices']
            game_stats['hOddsFraction'] = choices[0]['initialFractionalValue']
            game_stats['aOddsFraction'] = choices[2]['initialFractionalValue']
        except (KeyError, IndexError, TypeError):
            pass  # Fall back to previous/default odds if structure is unexpected
    else:
        game_stats['hOddsFraction'] = "1/2"
        game_stats['aOddsFraction'] = "1/2"
    # If no new odds, try fallback from `odds`
        
    score = data[0]
    game_stats["hScore"] = str(score["event"]["homeScore"]["current"])
    game_stats["aScore"] = str(score["event"]["awayScore"]["current"])
    if game_stats["hScore"] > game_stats["aScore"]:
        game_stats["hWin"] = "1" 
        game_stats["draw"] = "0"
        game_stats["aWin"] = "0"
    elif game_stats["aScore"] > game_stats["hScore"]:
        game_stats["aWin"] = "1"
        game_stats["draw"] = "0"
        game_stats["hWin"] = "0"
    else:
        game_stats["draw"] = "1"
        game_stats["hWin"] = "0"
        game_stats["aWin"] = "0"

    h2h = data[3]
    game_stats['hVsTeamWins'] = str(h2h['teamDuel']['homeWins']) if h2h['teamDuel'] else "0"
    game_stats['aVsTeamWins'] = str(h2h['teamDuel']['awayWins']) if h2h['teamDuel'] else "0"
    game_stats['aVsDraws'] = str(h2h['teamDuel']['draws']) if h2h['teamDuel'] else "0"

    game_stats['hVsManWins'] = str(h2h['managerDuel']['homeWins']) if h2h['managerDuel'] else "0"
    game_stats['aVsManWins'] = str(h2h['managerDuel']['awayWins']) if h2h['managerDuel'] else "0"
    game_stats['aVsDraws'] = str(h2h['managerDuel']['draws']) if h2h['managerDuel'] else "0"


    pregame_form = data[4]
    game_stats['hTeamFormRating'] = pregame_form['homeTeam']['avgRating'] if 'homeTeam' in pregame_form else "-1"
    game_stats['aTeamFormRating'] = pregame_form['awayTeam']['avgRating'] if 'awayTeam' in pregame_form else "-1"

    game_stats['hTeamCurrentPosition'] = str(pregame_form['homeTeam']['position']) if 'homeTeam' in pregame_form else "-1"
    game_stats['aTeamCurrentPosition'] = str(pregame_form['awayTeam']['position']) if 'awayTeam' in pregame_form else "-1"

    game_stats['hTeamValue'] = pregame_form['homeTeam']['value'] if 'homeTeam' in pregame_form else "-1"
    game_stats['aTeamValue'] = pregame_form['awayTeam']['value'] if 'awayTeam' in pregame_form else "-1"
    
    lineups = data[5]
    game_stats['hFormation'] = lineups['home']['formation']
    game_stats['aFomation'] = lineups['away']['formation']
    home_players,away_players = get_top_five(lineups)

    for i in range (0,11):
        game_stats[f'h{i}playerName'] = list(home_players.keys())[i]
        game_stats[f'h{i}playerRating'] = str(home_players[list(home_players.keys())[i]])

        game_stats[f'a{i}playerName'] = list(away_players.keys())[i]
        game_stats[f'a{i}playerRating'] = str(away_players[list(away_players.keys())[i]])
        
    
    managers = data[6]
    game_stats['hManagerName'] = managers['homeManager']['name']
    game_stats['aManagerName'] = managers['awayManager']['name']
    
    match_details = data[7]
    game_stats["dateTime"] = match_details["startDate"]
    game_stats["hTeamName"] = match_details["homeTeam"]["name"]
    game_stats["aTeamName"] = match_details["awayTeam"]["name"]
    
    return game_stats

In [4]:
def get_score(url,match_id, driver):
    # Clear existing logs
    #driver.get("about:blank")
    #logs = driver.get_log("performance")
    
    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)
    """
    # Wait for the page to load properly
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except:
        print("Timeout waiting for page to load")
    """
    
    # Interact with the page to trigger API calls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(0.1)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.2)

    soup = BeautifulSoup (driver.page_source, "html.parser")

    meta_data = soup.find_all('script', type = 'application/ld+json')

    meta_data = json.loads(meta_data[1].string)
    
    # Direct API approach - try to call the API directly
    
    if match_id:
        print(f"Extracted match ID: {match_id}")
        urls = []
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/provider/1/winning-odds")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/statistics")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/h2h")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/pregame-form")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/lineups")
        urls.append(f"https://www.sofascore.com/api/v1/event/{match_id}/managers")
        
        data = {}
        for i in range(0,len(urls)):
            api_url = urls[i]
            # Open the API URL directly
            driver.execute_script(f"window.open('{api_url}', '_blank');")
            #time.sleep(2)
            
            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])
            #time.sleep(1)
            
            # Get the page source which should contain the JSON
            page_source = driver.page_source
            
            # Extract JSON from the page source
            if "application/json" in page_source or "{" in page_source:
                start_idx = page_source.find("{")
                end_idx = page_source.rfind("}") + 1
                if start_idx >= 0 and end_idx > start_idx:
                    json_str = page_source[start_idx:end_idx]
                    try:
                        data[i] = json.loads(json_str)
                    except json.JSONDecodeError:
                        print("Failed to parse JSON response")
            
            # Close the tab and switch back
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        data[len(urls)] = meta_data
        return data


In [37]:
#data = get_score ("https://www.sofascore.com/football/match/fulham-manchester-united/KsT#id:12436899",12436899,driver)
print(get_game_stats ("https://www.sofascore.com/football/match/luton-town-aston-villa/Psxb#id:11352545,tab;statistics",11352545))

Loading URL: https://www.sofascore.com/football/match/luton-town-aston-villa/Psxb#id:11352545,tab;statistics
Extracted match ID: 11352545
{'player': {'name': 'Thomas Kaminski', 'slug': 'thomas-kaminski', 'shortName': 'T. Kaminski', 'position': 'G', 'jerseyNumber': '24', 'height': 190, 'userCount': 719, 'id': 45959, 'country': {'alpha2': 'BE', 'alpha3': 'BEL', 'name': 'Belgium', 'slug': 'belgium'}, 'marketValueCurrency': 'EUR', 'dateOfBirthTimestamp': 719798400, 'proposedMarketValueRaw': {'value': 2400000, 'currency': 'EUR'}, 'fieldTranslations': {'nameTranslation': {'ar': 'توماس كامينسكي', 'hi': 'थॉमस कामिन्स्की', 'bn': 'টমাস কামিনস্কি'}, 'shortNameTranslation': {'ar': 'ت. كامينسكي', 'hi': 'टी. कामिन्स्की', 'bn': 'টি. কামিনস্কি'}}}, 'teamId': 72, 'shirtNumber': 24, 'jerseyNumber': '24', 'position': 'G', 'substitute': False, 'statistics': {'totalPass': 22, 'accuratePass': 14, 'totalLongBalls': 10, 'accurateLongBalls': 2, 'goodHighClaim': 1, 'savedShotsFromInsideTheBox': 5, 'saves': 5, '

KeyError: 'rating'

In [5]:
def parse_stats(stats):
    headers = {"Ball possession","Expected goals","Total shots","Goalkeeper saves","Corner kicks","Fouls","Passes","Tackles","Free kicks","Yellow cards", "Red cards", "Big chances"}
    home = "h"
    away = "a"
    game_stats = {}
    for header in headers:
        game_stats[f"{home}{header}"] = "0"
        game_stats[f"{away}{header}"] = "0"
    for stat in stats:
        if stat["name"] in headers:
            game_stats[home + stat["name"]] = stat["home"]
            game_stats[away + stat["name"]] = stat["away"]   
    return game_stats

In [15]:
def get_odds(match_id):
    driver.get(f"https://www.sofascore.com/api/v1/event/{match_id}/odds/1/featured")
    page_source = driver.page_source
    if "application/json" in page_source or "{" in page_source:
        start_idx = page_source.find("{")
        end_idx = page_source.rfind("}") + 1
        if start_idx >= 0 and end_idx > start_idx:
            json_str = page_source[start_idx:end_idx]
            try:
                return json.loads(json_str)
            except json.JSONDecodeError:
                return none

In [None]:
#score = get_score("https://www.sofascore.com/football/match/aston-villa-leicester-city/GP#id:12436918",driver)
print (score["event"]["homeScore"]["current"])
print (score["event"]["awayScore"]["current"])

In [50]:

## create headers 
game_stats = get_game_stats(url,12437028)
print (",".join(game_stats.keys()))
with open ("version4.csv","a") as f:
    f.write (",".join(game_stats.keys()) + "\n")
f.close()

Loading URL: https://www.sofascore.com/football/match/fulham-aston-villa/PsT#id:12437028
Extracted match ID: 12437028
hFree kicks,aFree kicks,hCorner kicks,aCorner kicks,hPasses,aPasses,hYellow cards,aYellow cards,hGoalkeeper saves,aGoalkeeper saves,hRed cards,aRed cards,hFouls,aFouls,hTotal shots,aTotal shots,hBig chances,aBig chances,hTackles,aTackles,hBall possession,aBall possession,hExpected goals,aExpected goals,hOddsFraction,aOddsFraction,hScore,aScore,aWin,draw,hWin,hVsTeamWins,aVsTeamWins,aVsDraws,hVsManWins,aVsManWins,hTeamFormRating,aTeamFormRating,hTeamCurrentPosition,aTeamCurrentPosition,hTeamValue,aTeamValue,hFormation,aFomation,h0playerName,h0playerRating,a0playerName,a0playerRating,h1playerName,h1playerRating,a1playerName,a1playerRating,h2playerName,h2playerRating,a2playerName,a2playerRating,h3playerName,h3playerRating,a3playerName,a3playerRating,h4playerName,h4playerRating,a4playerName,a4playerRating,h5playerName,h5playerRating,a5playerName,a5playerRating,h6playerName,

In [46]:
file = open("../data_files/match_links_23_24.txt", "r")
for link in file:
    match_id = link.split(":")[2].strip()
    full_link  = link.strip() + ",tab;statistics"
    game_stats = get_game_stats(full_link,match_id)

    with open("../version4.csv", "a") as f:
        f.write (",".join(game_stats.values()) + "\n")

f.close()

Loading URL: https://www.sofascore.com/football/match/brentford-wolverhampton/dsab#id:11352446,tab;statistics
Extracted match ID: 11352446
Loading URL: https://www.sofascore.com/football/match/bournemouth-liverpool/Uskb#id:11352308,tab;statistics
Extracted match ID: 11352308
Loading URL: https://www.sofascore.com/football/match/fulham-chelsea/NsT#id:11352321,tab;statistics
Extracted match ID: 11352321
Loading URL: https://www.sofascore.com/football/match/liverpool-crystal-palace/hsU#id:11352403,tab;statistics
Extracted match ID: 11352403
Loading URL: https://www.sofascore.com/football/match/arsenal-manchester-united/KR#id:11352532,tab;statistics
Extracted match ID: 11352532
Loading URL: https://www.sofascore.com/football/match/bournemouth-brentford/abskb#id:11352519,tab;statistics
Extracted match ID: 11352519
Loading URL: https://www.sofascore.com/football/match/arsenal-burnley/gsR#id:11352453,tab;statistics
Extracted match ID: 11352453
Loading URL: https://www.sofascore.com/football/m

In [88]:
print (get_game_stats("https://www.sofascore.com/football/match/ipswich-town-brighton-and-hove-albion/FsH#id:12436936,tab;statistics",12436936))

Loading URL: https://www.sofascore.com/football/match/ipswich-town-brighton-and-hove-albion/FsH#id:12436936,tab;statistics
Extracted match ID: 12436936
{'hTotal shots': '21', 'aTotal shots': '6', 'hFouls': '14', 'aFouls': '16', 'hExpected goals': '1.66', 'aExpected goals': '0.35', 'hFree kicks': '16', 'aFree kicks': '14', 'hTackles': '19', 'aTackles': '25', 'hYellow cards': '4', 'aYellow cards': '3', 'hPasses': '570', 'aPasses': '261', 'hBig chances': '3', 'aBig chances': '1', 'hRed cards': '0', 'aRed cards': '0', 'hGoalkeeper saves': '1', 'aGoalkeeper saves': '6', 'hBall possession': '69%', 'aBall possession': '31%', 'hCorner kicks': '9', 'aCorner kicks': '2', 'hOddsFraction': '19/50', 'aOddsFraction': '7/1', 'hScore': '0', 'aScore': '0', 'draw': '1', 'hVsTeamWins': '3', 'aVsTeamWins': '4', 'aVsDraws': '1', 'hVsManWins': '1', 'aVsManWins': '0', 'dateTime': '2024-09-14T14:00:00.000Z', 'hTeamName': 'Brighton &amp; Hove Albion', 'aTeamName': 'Ipswich Town'}


In [75]:
"""

file = open("../data_files/match_links_23_24.txt", "r")
for link in file:
    
    match_id = link.split(":")[2].strip()
    ## get stuff 
    api_link = f"https://www.sofascore.com/api/v1/event/{match_id}/lineups"
    print ("link")
    print (api_link)
    driver.get(api_link)
    page_source = driver.page_source
      
            # Extract JSON from the page source
    if "application/json" in page_source or "{" in page_source:
        start_idx = page_source.find("{")
        end_idx = page_source.rfind("}") + 1
        if start_idx >= 0 and end_idx > start_idx:
            json_str = page_source[start_idx:end_idx]
            try:
                data = json.loads(json_str)
                home, away = get_top_five(data)
                with open("../version4.csv", "r") as f, open("../version4,1.csv","a") as b:
                    text = f.readline()
                    text = text.split(",")
                    count = 0 
                    for i in range(0,len(text)-1):
                        if text[i] == text[i+2] and text[i+1] == text[i+3]:
                            text[i+1] = away.items()[count]
                            count +=1
                            text[i+2] = away.items()[count]
                            count+=1

                    print (text)
                
            except json.JSONDecodeError:
                pass
    
     
    

f.close()
"""

link
https://www.sofascore.com/api/v1/event/11352446/lineups


TypeError: 'dict_items' object is not subscriptable

In [66]:
headers = "hFree kicks,aFree kicks,hCorner kicks,aCorner kicks,hPasses,aPasses,hYellow cards,aYellow cards,hGoalkeeper saves,aGoalkeeper saves,hRed cards,aRed cards,hFouls,aFouls,hTotal shots,aTotal shots,hBig chances,aBig chances,hTackles,aTackles,hBall possession,aBall possession,hExpected goals,aExpected goals,hOddsFraction,aOddsFraction,hScore,aScore,aWin,draw,hWin,hVsTeamWins,aVsTeamWins,aVsDraws,hVsManWins,aVsManWins,hTeamFormRating,aTeamFormRating,hTeamCurrentPosition,aTeamCurrentPosition,hTeamValue,aTeamValue,hFormation,aFomation,h0playerName,h0playerRating,a0playerName,a0playerRating,h1playerName,h1playerRating,a1playerName,a1playerRating,h2playerName,h2playerRating,a2playerName,a2playerRating,h3playerName,h3playerRating,a3playerName,a3playerRating,h4playerName,h4playerRating,a4playerName,a4playerRating,h5playerName,h5playerRating,a5playerName,a5playerRating,h6playerName,h6playerRating,a6playerName,a6playerRating,h7playerName,h7playerRating,a7playerName,a7playerRating,h8playerName,h8playerRating,a8playerName,a8playerRating,h9playerName,h9playerRating,a9playerName,a9playerRating,h10playerName,h10playerRating,a10playerName,a10playerRating,hManagerName,aManagerName,dateTime,hTeamName,aTeamName"