In [1]:
import json
import httpx
import pandas as pd
import asyncio
import nest_asyncio
import numpy as np
from PIL import Image
from io import BytesIO
import janitor


In [28]:
class Spider:
    def __init__(self):
        self.comments =  "https://www.sofascore.com/api/v1/event/11352568/comments"
        self.shotmap =  "https://www.sofascore.com/api/v1/event/11352568/shotmap"
        self.event = "https://www.sofascore.com/api/v1/event/11352568"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Referer': 'https://www.sofascore.com/',
            'Origin': 'https://www.sofascore.com'
        }
    
    async def get_comments(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.comments)  
            return pd.json_normalize(response.json().get('comments'))
    
    async def get_shotmap(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.shotmap) 
            return pd.json_normalize(response.json().get('shotmap'))
    
    async def get_event(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.event)  
            return pd.json_normalize(response.json().get('event'))
            

In [29]:
nest_asyncio.apply()

spider = Spider()
comments_data = await spider.get_comments()
shotmap_data = await spider.get_shotmap()
event_data = await spider.get_event()
    

In [31]:
comments_data.loc[0, 'time'] = comments_data.loc[1, 'time']

In [32]:
shots = ['shotOffTarget', 'shotBlocked', 'scoreChange', 'post', 'penaltyScored', 'shotSaved']
comments_data[comments_data['type'].isin(shots)].head()

Unnamed: 0,text,type,id,time,periodName,isHome,player.name,player.firstName,player.lastName,player.slug,...,assist1.firstName,assist1.lastName,assist1.slug,assist1.shortName,assist1.position,assist1.jerseyNumber,assist1.userCount,assist1.id,assist1.fieldTranslations.nameTranslation.ar,assist1.fieldTranslations.shortNameTranslation.ar
2,Attempt missed. Bobby De Cordova-Reid (Fulham)...,shotOffTarget,23924481,93,2ND,False,Bobby Decordova-Reid,,,bobby-decordova-reid,...,,,,,,,,,,
11,Attempt saved. Cauley Woodrow (Luton Town) rig...,shotSaved,23924353,88,2ND,True,Cauley Woodrow,,,cauley-woodrow,...,,,,,,,,,,
15,Attempt saved. Alex Iwobi (Fulham) right foote...,shotSaved,23924234,85,2ND,False,Alex Iwobi,,,alex-iwobi,...,,,,,,,,,,
16,Attempt missed. Bobby De Cordova-Reid (Fulham)...,shotOffTarget,23924235,85,2ND,False,Bobby Decordova-Reid,,,bobby-decordova-reid,...,,,,,,,,,,
19,Attempt missed. Bobby De Cordova-Reid (Fulham)...,shotOffTarget,23924161,83,2ND,False,Bobby Decordova-Reid,,,bobby-decordova-reid,...,,,,,,,,,,


In [33]:
shotmap_data['time_min'] = (shotmap_data['timeSeconds'] / 60).apply(np.floor).astype(int)
shotmap_data['player.id'] = shotmap_data['player.id'].astype(float)

In [34]:
shotmap_data = shotmap_data.sort_values(by='time_min')

In [35]:
cols = ['shotType', 'situation', 'bodyPart', 'goalMouthLocation', 'xg','addedTime', 'timeSeconds', 'reversedPeriodTime', 
       'reversedPeriodTimeSeconds', 'incidentType', 'player.id','playerCoordinates.x', 'playerCoordinates.y', 'playerCoordinates.z',
       'goalMouthCoordinates.x', 'goalMouthCoordinates.y',
       'goalMouthCoordinates.z', 'draw.start.x', 'draw.start.y', 'draw.end.x',
       'draw.end.y', 'draw.goal.x', 'draw.goal.y', 'xgot',
       'blockCoordinates.x', 'blockCoordinates.y', 'blockCoordinates.z',
       'draw.block.x', 'draw.block.y', 'goalType', 'time_min']

shotmap_data = shotmap_data[cols]



In [36]:
comments_data = comments_data.sort_values(by = 'time')

In [37]:
# Some times may be off by 1 so this merge takes care of this by allotting an error of 1
# note: For larger scale I must specify the game ID for the merge
merged_df = pd.merge_asof(comments_data, shotmap_data, left_on='time', right_on = 'time_min', by='player.id', tolerance=1)


In [38]:
merged_df['game_year'] = event_data['season.name'].iloc[0]
merged_df['home_team'] = event_data['homeTeam.name'].iloc[0] # Assuming every match will be 1v1
merged_df['away_team'] = event_data['awayTeam.name'].iloc[0]
merged_df['home_team_id'] = event_data['homeTeam.id'].iloc[0]
merged_df['away_team_id'] = event_data['awayTeam.id'].iloc[0]
merged_df['posteam'] = np.where(merged_df['isHome'] == True, merged_df['home_team'], merged_df['away_team']) # fix this
merged_df['defteam'] = np.where(merged_df['isHome'] == False, merged_df['home_team'], merged_df['away_team']) # fix this



In [39]:
# make sure you drop is home
merged_df = janitor.clean_names(merged_df)

merged_df



Unnamed: 0,text,type,id,time,periodname,ishome,player_name,player_firstname,player_lastname,player_slug,...,draw_block_y,goaltype,time_min,game_year,home_team,away_team,home_team_id,away_team_id,posteam,defteam
0,First Half begins.,matchStarted,23921401,0,1ST,,,,,,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Fulham
1,Tim Ream (Fulham) wins a free kick in the defe...,freeKickWon,23921468,2,1ST,False,Tim Ream,,,tim-ream,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Luton Town
2,Foul by Elijah Adebayo (Luton Town).,freeKickLost,23921469,2,1ST,True,Elijah Adebayo,,,adebayo-elijah,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Luton Town,Fulham
3,Attempt blocked. João Palhinha (Fulham) right ...,shotBlocked,23921634,6,1ST,False,João Palhinha,,,joao-palhinha,...,20.8,,6.0,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Luton Town
4,"Corner, Fulham. Conceded by Daiki Hashioka.",cornerKick,23921683,7,1ST,True,Daiki Hashioka,,,daiki-hashioka,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Luton Town,Fulham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Foul by Fodé Ballo-Touré (Fulham).,freeKickLost,23924435,92,2ND,False,Fodé Ballo-Touré,,,fode-ballo-toure,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Luton Town
139,Andros Townsend (Luton Town) wins a free kick ...,freeKickWon,23924436,92,2ND,True,Andros Townsend,,,andros-townsend,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Luton Town,Fulham
140,Attempt missed. Bobby De Cordova-Reid (Fulham)...,shotOffTarget,23924481,93,2ND,False,Bobby Decordova-Reid,,,bobby-decordova-reid,...,,,93.0,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Luton Town
141,"Second Half ends, Luton Town 2, Fulham 4.",endSecondHalf,23924492,95,2ND,,,,,,...,,,,Premier League 23/24,Luton Town,Fulham,72,43,Fulham,Fulham


In [80]:
#list(event_data.columns) # may use hasXg, 'hasXg','hasEventPlayerStatistics','hasEventPlayerHeatMap', as try catch or something



In [4]:
url = 'https://www.sofascore.com/api/v1/player/934386/image'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Referer': 'https://www.sofascore.com/',
    'Origin': 'https://www.sofascore.com'
}

with httpx.Client(headers=headers) as client:
    response = client.get(url)
    i = Image.open(BytesIO(response.content))



If it's a large request, use tqdm to provide user with download time:

```
import tempfile

import httpx
from tqdm import tqdm

with tempfile.NamedTemporaryFile() as download_file:
    url = "https://speed.hetzner.de/100MB.bin"
    with httpx.stream("GET", url) as response:
        total = int(response.headers["Content-Length"])

        with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress:
            num_bytes_downloaded = response.num_bytes_downloaded
            for chunk in response.iter_bytes():
                download_file.write(chunk)
                progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
                num_bytes_downloaded = response.num_bytes_downloaded
```

Note: Don't literally code this, use it as a reference. It's just opening a temp file then running a request to website


Making multiple requests at once:

```
import asyncio
import httpx

async def do_tasks():
    async with httpx.AsyncClient() as client:
        tasks = [client.get(f"http://my-api/{url_param}") for url_param in parameters]
        result = await asyncio.gather(*tasks)

```

In [11]:
game.columns

Index(['text', 'type', 'id', 'time', 'periodName', 'isHome', 'player.name',
       'player.slug', 'player.shortName', 'player.position',
       'player.jerseyNumber', 'player.userCount', 'player.id',
       'player.fieldTranslations.nameTranslation.ar',
       'player.fieldTranslations.shortNameTranslation.ar', 'player.firstName',
       'player.lastName', 'assist1.name', 'assist1.slug', 'assist1.shortName',
       'assist1.position', 'assist1.jerseyNumber', 'assist1.userCount',
       'assist1.id', 'assist1.fieldTranslations.nameTranslation.ar',
       'assist1.fieldTranslations.shortNameTranslation.ar', 'playerIn.name',
       'playerIn.slug', 'playerIn.shortName', 'playerIn.position',
       'playerIn.jerseyNumber', 'playerIn.userCount', 'playerIn.id',
       'playerOut.name', 'playerOut.slug', 'playerOut.shortName',
       'playerOut.position', 'playerOut.jerseyNumber', 'playerOut.userCount',
       'playerOut.id', 'playerOut.fieldTranslations.nameTranslation.ar',
       'playerOut

In [53]:
stats = pd.json_normalize(df.explode('statisticsItems')['statisticsItems'])

In [80]:
stats.head()

Unnamed: 0,name,home,away,compareCode,statisticsType,valueType,homeValue,awayValue,renderType,key,homeTotal,awayTotal
0,Ball possession,74%,26%,1,positive,event,74.0,26.0,2,ballPossession,,
1,Expected goals,4.32,0.51,1,positive,event,4.32,0.51,1,expectedGoals,,
2,Big chances,5,2,1,positive,event,5.0,2.0,1,bigChanceCreated,,
3,Total shots,37,4,1,positive,event,37.0,4.0,1,totalShotsOnGoal,,
4,Goalkeeper saves,1,9,2,positive,event,1.0,9.0,1,goalkeeperSaves,,


In [None]:
stats.set_index("name", inplace=True)



In [90]:
# Transpose the DataFrame
stats.T.iloc[:2, :].reset_index()

name,index,Ball possession,Expected goals,Big chances,Total shots,Goalkeeper saves,Corner kicks,Fouls,Passes,Tackles,...,Aerial duels,Dribbles,Tackles won,Total tackles,Interceptions,Recoveries,Clearances,Total saves,Goals prevented,Goal kicks
0,home,74%,4.32,5,37,1,12,7,763,20,...,9/23 (39%),10/18 (56%),65%,20,5,52,3,1,0.23,1
1,away,26%,0.51,2,4,9,1,6,276,13,...,14/23 (61%),7/11 (64%),54%,13,8,44,28,9,-1.18,16


In [19]:
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

from webdriver_manager.chrome import ChromeDriverManager

In [20]:
options = webdriver.ChromeOptions()
options.set_capability(
    "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
)


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(10)

try:
    driver.get("https://www.sofascore.com/luton-town-fulham/Tsxb#id:11352568")
except:
    pass


driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [21]:
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

In [None]:
for x in logs:
    path = x['params'].get('headers', {}).get(':path', '') # Extract the ':path' value from the headers, defaulting to an empty string if not found
    if '/api/' in path:
        print(path)

In [29]:
import requests

response = requests.get('https://www.sofascore.com/api/v1/event/11352568/shotmap').json()

