In [1]:
import json
import httpx
import pandas as pd
import asyncio
import nest_asyncio
import numpy as np
from PIL import Image
from io import BytesIO
import janitor


In [12]:
class Spider:
    def __init__(self):
        self.comments =  "https://www.sofascore.com/api/v1/event/12173502/comments"
        self.shotmap =  "https://www.sofascore.com/api/v1/event/12173502/shotmap"
        self.event = "https://www.sofascore.com/api/v1/event/12173502"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Referer': 'https://www.sofascore.com/',
            'Origin': 'https://www.sofascore.com'
        }
    
    async def get_comments(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.comments)  
            df = pd.json_normalize(response.json().get('comments'))
            df['game_id'] = 12173502 # hard keyed for now
            return df
    
    async def get_shotmap(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.shotmap) 
            df = pd.json_normalize(response.json().get('shotmap'))
            df['game_id'] = 12173502
            return df
    
    async def get_event(self):
        async with httpx.AsyncClient(headers=self.headers) as client:
            response = await client.get(self.event)  
            return pd.json_normalize(response.json().get('event'))
            

In [13]:
nest_asyncio.apply()

spider = Spider()
comments_data = await spider.get_comments()
shotmap_data = await spider.get_shotmap()
event_data = await spider.get_event()
    

In [14]:
comments_data.loc[0, 'time'] = comments_data.loc[1, 'time']

In [15]:
shots = ['shotOffTarget', 'shotBlocked', 'scoreChange', 'post', 'penaltyScored', 'shotSaved']


In [16]:
shotmap_data['time_min'] = (shotmap_data['timeSeconds'] / 60).apply(np.floor).astype(int)
shotmap_data['player.id'] = shotmap_data['player.id'].astype(float)

In [17]:
shotmap_data = shotmap_data.sort_values(by='time_min')

In [18]:
cols = ['shotType', 'situation', 'bodyPart', 'goalMouthLocation', 'xg','addedTime', 'timeSeconds', 'reversedPeriodTime', 
       'reversedPeriodTimeSeconds', 'incidentType', 'player.id','playerCoordinates.x', 'playerCoordinates.y', 'playerCoordinates.z',
       'goalMouthCoordinates.x', 'goalMouthCoordinates.y',
       'goalMouthCoordinates.z', 'draw.start.x', 'draw.start.y', 'draw.end.x',
       'draw.end.y', 'draw.goal.x', 'draw.goal.y', 'xgot',
       'blockCoordinates.x', 'blockCoordinates.y', 'blockCoordinates.z',
       'draw.block.x', 'draw.block.y', 'goalType', 'time_min']

shotmap_data = shotmap_data[cols]



In [19]:
comments_data = comments_data.sort_values(by = 'time')

In [20]:
# Some times may be off by 1 so this merge takes care of this by allotting an error of 1
# note: For larger scale I must specify the game ID for the merge
merged_df = pd.merge_asof(comments_data, shotmap_data, left_on='time', right_on = 'time_min', by=['game_id','player.id'], tolerance=1)


In [23]:
event_data

Unnamed: 0,customId,winnerCode,aggregatedWinnerCode,hasGlobalHighlights,hasXg,hasEventPlayerStatistics,hasEventPlayerHeatMap,detailId,crowdsourcingDataDisplayEnabled,id,...,awayScore.overtime,awayScore.penalties,awayScore.aggregated,time.injuryTime1,time.injuryTime2,time.injuryTime3,time.injuryTime4,time.currentPeriodStartTimestamp,changes.changes,changes.changeTimestamp
0,rsEgb,2,2,False,True,True,True,1,False,12173502,...,0,4,4,2,4,2,1,1713388737,"[status.code, status.description]",1713390476


In [24]:
event_data = event_data.groupby('id').agg(
    home_team = ('homeTeam.name', 'first'),
    away_team = ('awayTeam.name', 'first'),
    home_team_id = ('homeTeam.id', 'first'),
    away_team_id = ('awayTeam.id', 'first')
)

merged_df = merged_df.merge(event_data, left_on = 'game_id', right_on = 'id', how = 'left')

In [None]:
time_min, ishome

In [26]:
merged_df['posteam'] = np.where(merged_df['isHome'] == True, merged_df['home_team'], merged_df['away_team']) # fix this
merged_df['defteam'] = np.where(merged_df['isHome'] == False, merged_df['home_team'], merged_df['away_team']) # fix this


In [35]:
# make sure you drop is home
merged_df = janitor.clean_names(merged_df)


merged_df.drop(['time_min', 'ishome', 'player_firstname', 'player_lastname', 'playerin_firstname', 'playerin_lastname', 'assist1_firstname', 'assist1_lastname'], inplace = True, axis =1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.drop(['time_min', 'ishome', 'player_firstname', 'player_lastname', 'playerin_firstname', 'playerin_lastname', 'assist1_firstname', 'assist1_lastname'], inplace = True, axis =1)


In [38]:
def map_scores(df):
    df['home_score'] = 0
    df['away_score'] = 0
    
    for game_id, game_df in df.groupby('game_id'):
        home_score = 0
        away_score = 0
        for index, row in game_df.iterrows():
            if row['type'] in ['scoreChange', 'penaltyScored']:
                if row['posteam'] == row['home_team']:
                    home_score += 1
                else:
                    away_score += 1
            df.at[index, 'home_score'] = home_score
            df.at[index, 'away_score'] = away_score
    
    return df


In [39]:
merged_df = map_scores(merged_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['home_score'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['away_score'] = 0


In [42]:
merged_df

Unnamed: 0,text,type,id,time,periodname,player_name,player_slug,player_shortname,player_position,player_jerseynumber,...,draw_block_y,goaltype,home_team,away_team,home_team_id,away_team_id,posteam,defteam,home_score,away_score
0,First Half begins.,matchStarted,23550683,0,1ST,,,,,,...,,,Manchester City,Real Madrid,17,2829,Real Madrid,Real Madrid,0,0
1,Josko Gvardiol (Manchester City) wins a free k...,freeKickWon,23550940,5,1ST,Joško Gvardiol,josko-gvardiol,J. Gvardiol,D,24,...,,,Manchester City,Real Madrid,17,2829,Manchester City,Real Madrid,0,0
2,Foul by Federico Valverde (Real Madrid).,freeKickLost,23550941,5,1ST,Federico Valverde,federico-valverde,F. Valverde,M,15,...,,,Manchester City,Real Madrid,17,2829,Real Madrid,Manchester City,0,0
3,Attempt saved. Eduardo Camavinga (Real Madrid)...,shotSaved,23550982,10,1ST,Eduardo Camavinga,camavinga-eduardo,E. Camavinga,M,12,...,2.2,,Manchester City,Real Madrid,17,2829,Real Madrid,Manchester City,0,0
4,Attempt saved. Rodrygo (Real Madrid) right foo...,shotSaved,23550998,11,1ST,Rodrygo,rodrygo,Rodrygo,F,11,...,,regular,Manchester City,Real Madrid,17,2829,Real Madrid,Manchester City,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,"Goal! Manchester City 1(2), Real Madrid 1(2)....",penaltyScored,23552343,125,PEN,Phil Foden,phil-foden,P. Foden,M,47,...,,penalty,Manchester City,Real Madrid,17,2829,Manchester City,Real Madrid,3,3
149,"Goal! Manchester City 1(2), Real Madrid 1(3)....",penaltyScored,23552347,125,PEN,Nacho Fernández,nacho-fernandez,N. Fernández,D,6,...,,penalty,Manchester City,Real Madrid,17,2829,Real Madrid,Manchester City,3,4
150,"Goal! Manchester City 1(3), Real Madrid 1(3)....",penaltyScored,23552348,126,PEN,Ederson,ederson,Ederson,G,31,...,,penalty,Manchester City,Real Madrid,17,2829,Manchester City,Real Madrid,4,4
151,"Goal! Manchester City 1(3), Real Madrid 1(4)....",penaltyScored,23552350,127,PEN,Antonio Rüdiger,antonio-rudiger,A. Rüdiger,D,22,...,,penalty,Manchester City,Real Madrid,17,2829,Real Madrid,Manchester City,4,5


In [80]:
#list(event_data.columns) # may use hasXg, 'hasXg','hasEventPlayerStatistics','hasEventPlayerHeatMap', as try catch or something



In [4]:
url = 'https://www.sofascore.com/api/v1/player/934386/image'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Referer': 'https://www.sofascore.com/',
    'Origin': 'https://www.sofascore.com'
}

with httpx.Client(headers=headers) as client:
    response = client.get(url)
    i = Image.open(BytesIO(response.content))



If it's a large request, use tqdm to provide user with download time:

```
import tempfile

import httpx
from tqdm import tqdm

with tempfile.NamedTemporaryFile() as download_file:
    url = "https://speed.hetzner.de/100MB.bin"
    with httpx.stream("GET", url) as response:
        total = int(response.headers["Content-Length"])

        with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress:
            num_bytes_downloaded = response.num_bytes_downloaded
            for chunk in response.iter_bytes():
                download_file.write(chunk)
                progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
                num_bytes_downloaded = response.num_bytes_downloaded
```

Note: Don't literally code this, use it as a reference. It's just opening a temp file then running a request to website


Making multiple requests at once:

```
import asyncio
import httpx

async def do_tasks():
    async with httpx.AsyncClient() as client:
        tasks = [client.get(f"http://my-api/{url_param}") for url_param in parameters]
        result = await asyncio.gather(*tasks)

```

In [11]:
game.columns

Index(['text', 'type', 'id', 'time', 'periodName', 'isHome', 'player.name',
       'player.slug', 'player.shortName', 'player.position',
       'player.jerseyNumber', 'player.userCount', 'player.id',
       'player.fieldTranslations.nameTranslation.ar',
       'player.fieldTranslations.shortNameTranslation.ar', 'player.firstName',
       'player.lastName', 'assist1.name', 'assist1.slug', 'assist1.shortName',
       'assist1.position', 'assist1.jerseyNumber', 'assist1.userCount',
       'assist1.id', 'assist1.fieldTranslations.nameTranslation.ar',
       'assist1.fieldTranslations.shortNameTranslation.ar', 'playerIn.name',
       'playerIn.slug', 'playerIn.shortName', 'playerIn.position',
       'playerIn.jerseyNumber', 'playerIn.userCount', 'playerIn.id',
       'playerOut.name', 'playerOut.slug', 'playerOut.shortName',
       'playerOut.position', 'playerOut.jerseyNumber', 'playerOut.userCount',
       'playerOut.id', 'playerOut.fieldTranslations.nameTranslation.ar',
       'playerOut

In [53]:
stats = pd.json_normalize(df.explode('statisticsItems')['statisticsItems'])

In [80]:
stats.head()

Unnamed: 0,name,home,away,compareCode,statisticsType,valueType,homeValue,awayValue,renderType,key,homeTotal,awayTotal
0,Ball possession,74%,26%,1,positive,event,74.0,26.0,2,ballPossession,,
1,Expected goals,4.32,0.51,1,positive,event,4.32,0.51,1,expectedGoals,,
2,Big chances,5,2,1,positive,event,5.0,2.0,1,bigChanceCreated,,
3,Total shots,37,4,1,positive,event,37.0,4.0,1,totalShotsOnGoal,,
4,Goalkeeper saves,1,9,2,positive,event,1.0,9.0,1,goalkeeperSaves,,


In [None]:
stats.set_index("name", inplace=True)



In [90]:
# Transpose the DataFrame
stats.T.iloc[:2, :].reset_index()

name,index,Ball possession,Expected goals,Big chances,Total shots,Goalkeeper saves,Corner kicks,Fouls,Passes,Tackles,...,Aerial duels,Dribbles,Tackles won,Total tackles,Interceptions,Recoveries,Clearances,Total saves,Goals prevented,Goal kicks
0,home,74%,4.32,5,37,1,12,7,763,20,...,9/23 (39%),10/18 (56%),65%,20,5,52,3,1,0.23,1
1,away,26%,0.51,2,4,9,1,6,276,13,...,14/23 (61%),7/11 (64%),54%,13,8,44,28,9,-1.18,16


In [19]:
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

from webdriver_manager.chrome import ChromeDriverManager

In [20]:
options = webdriver.ChromeOptions()
options.set_capability(
    "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
)


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(10)

try:
    driver.get("https://www.sofascore.com/luton-town-fulham/Tsxb#id:11352568")
except:
    pass


driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [21]:
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

In [None]:
for x in logs:
    path = x['params'].get('headers', {}).get(':path', '') # Extract the ':path' value from the headers, defaulting to an empty string if not found
    if '/api/' in path:
        print(path)

In [29]:
import requests

response = requests.get('https://www.sofascore.com/api/v1/event/11352568/shotmap').json()

