In [None]:
from playwright.async_api import async_playwright
from collections import defaultdict
import pandas as pd

async def get_sofascore_events(match_id: int):
    """
    Fetch the Sofascore ``attacks`` payload for one match using Playwright.

    The request is issued with ``fetch()`` from inside a headless Chromium
    page that has first loaded the Sofascore homepage (per the original
    author's note, this is meant to avoid a 403 response).

    Args:
        match_id: Sofascore event id, interpolated into the API URL.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
    """
    url = f"https://api.sofascore.com/api/v1/event/{match_id}/attacks"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1 — Load homepage to establish session/cookies
            await page.goto("https://www.sofascore.com/", wait_until="networkidle")

            # Step 2 — Use page.evaluate to execute fetch() inside the browser.
            # Non-OK responses come back as {status, error} instead of throwing,
            # so the Python side can build a readable error message.
            js = """
        async (url) => {
            const resp = await fetch(url, {
                method: 'GET',
                headers: {
                    'Accept': 'application/json, text/plain, */*'
                }
            });
            if (!resp.ok) {
                const text = await resp.text();
                return { status: resp.status, error: text };
            }
            const data = await resp.json();
            return { status: resp.status, data };
        }
        """

            result = await page.evaluate(js, url)
        finally:
            # Close the browser even when navigation or the fetch raises.
            await browser.close()

    if result["status"] != 200:
        raise RuntimeError(
            f"Sofascore error {result['status']}: {result.get('error')}"
        )

    return result["data"]

In [2]:
def parse_events(raw_json):
    """
    Flatten the ``attacks`` list of a raw payload into a list of plain dicts.

    Nested objects (``team``, ``time``, ``position``, ``endPosition``) that are
    missing or null are treated as empty, so the corresponding output fields
    become ``None`` instead of raising.

    Args:
        raw_json: Mapping that may contain an ``"attacks"`` list of event dicts.
            ``None`` or a mapping without that key yields an empty result.

    Returns:
        A list with one dict per event, with the keys ``team_id``,
        ``event_type``, ``minute``, ``x``, ``y``, ``end_x``, ``end_y``,
        ``is_accurate``, ``is_goal``, ``is_own_goal`` and ``card``.
    """

    def _nested(ev, key):
        # A key that is present with a null value must behave like a missing key.
        value = ev.get(key)
        return value if isinstance(value, dict) else {}

    events = (raw_json or {}).get("attacks") or []
    parsed = []

    for ev in events:
        team = _nested(ev, "team")
        time = _nested(ev, "time")
        start = _nested(ev, "position")
        end = _nested(ev, "endPosition")

        parsed.append({
            "team_id": team.get("id"),
            "event_type": ev.get("type"),
            "minute": time.get("minute"),
            "x": start.get("x"),
            "y": start.get("y"),
            "end_x": end.get("x"),
            "end_y": end.get("y"),
            "is_accurate": ev.get("isAccurate", False),
            "is_goal": ev.get("isGoal", False),
            "is_own_goal": ev.get("isOwnGoal", False),
            "card": ev.get("card", None)   # per the original note: yellow, red, null
        })
    return parsed

In [4]:
import numpy as np
# Placeholder xT surface: uniform random values in [0, 1), not a fitted model.
# The generator is seeded so that re-running this cell writes the same grid.
xt_grid = np.random.default_rng(42).random((8, 12))
np.save("xt_grid.npy", xt_grid)

In [5]:
# Load the xT grid that was saved to xt_grid.npy.
xt_grid = np.load("xt_grid.npy")    # shape (8, 12)

In [6]:
def xt_for_pass(ev, xt_grid):
    """
    Return the xT difference between the end cell and start cell of a pass.

    Coordinates are read as percentages (0-100) and bucketed into the grid,
    which is indexed as ``xt_grid[row_from_y][col_from_x]``. The number of
    rows and columns is taken from the grid itself, so an (8, 12) grid
    behaves exactly as the previous hard-coded version did.

    Args:
        ev: Parsed event dict with the keys ``event_type``, ``is_accurate``,
            ``x``, ``y``, ``end_x`` and ``end_y``.
        xt_grid: 2-D grid of xT values (rows follow y, columns follow x).

    Returns:
        ``xt_grid[end] - xt_grid[start]`` as a float, or ``0.0`` when the event
        is not a pass, is not accurate, or lacks any of its four coordinates.
    """
    if ev["event_type"] != "pass" or not ev["is_accurate"]:
        return 0.0

    coords = (ev["x"], ev["y"], ev["end_x"], ev["end_y"])
    if any(c is None for c in coords):
        # Events without a full start/end position carry no xT information.
        return 0.0

    n_rows = len(xt_grid)
    n_cols = len(xt_grid[0])

    def _cell(value, n_cells):
        # convert percentage → grid index, clamped so 100% (or stray
        # out-of-range values) lands in the nearest edge cell
        idx = int(value // (100 / n_cells))
        return min(max(idx, 0), n_cells - 1)

    x1_idx = _cell(ev["x"], n_cols)
    y1_idx = _cell(ev["y"], n_rows)
    x2_idx = _cell(ev["end_x"], n_cols)
    y2_idx = _cell(ev["end_y"], n_rows)

    return float(xt_grid[y2_idx][x2_idx] - xt_grid[y1_idx][x1_idx])

In [None]:


def _run_awaitable_sync(awaitable):
    """Run an awaitable to completion from synchronous code and return its result."""
    import asyncio
    import concurrent.futures

    async def _await_it():
        return await awaitable

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run can own one.
        return asyncio.run(_await_it())

    # A loop is already running in this thread and asyncio.run() would raise,
    # so drive the awaitable on a worker thread with its own loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(lambda: asyncio.run(_await_it())).result()


def build_features(match_id, home_team_id, away_team_id, raw=None):
    """
    Build a per-minute, per-team cumulative feature table for one match.

    Events are parsed with ``parse_events`` and scored with ``xt_for_pass``
    against the module-level ``xt_grid``. Counts are bucketed by minute into
    0..100 and then accumulated, producing one row per (minute, team).

    Args:
        match_id: Event id; passed to ``get_sofascore_events`` and copied into
            every output row.
        home_team_id: Team id treated as the home side.
        away_team_id: Team id treated as the away side.
        raw: Optional already-fetched payload. When omitted, the payload is
            fetched with ``get_sofascore_events(match_id)``; if that call
            returns an awaitable it is run to completion first.

    Returns:
        pandas.DataFrame with the columns ``match_id``, ``team_id``,
        ``is_home_team``, ``time_interval``, ``goals_scored``,
        ``player_differential``, ``own_yellow_cards``,
        ``opposition_yellow_cards``, ``avg_team_xt``, ``avg_opp_xt``,
        ``running_xt_differential`` and ``score_differential``.
    """
    import inspect
    from itertools import accumulate

    if raw is None:
        raw = get_sofascore_events(match_id)
        if inspect.isawaitable(raw):
            # get_sofascore_events may be a coroutine function; without this
            # the un-awaited coroutine object would be handed to parse_events.
            raw = _run_awaitable_sync(raw)
    parsed = parse_events(raw)

    team_ids = [home_team_id, away_team_id]
    max_minute = 100

    def empty_series():
        return {t: [0] * (max_minute + 1) for t in team_ids}

    goals = empty_series()
    yellows = empty_series()
    reds = empty_series()
    xt_vals = empty_series()
    passes_count = empty_series()

    for ev in parsed:
        t = ev["team_id"]
        if t not in team_ids:
            continue

        m = ev["minute"]
        if m is None:
            continue
        try:
            m = int(m)
        except (TypeError, ValueError):
            continue
        # Clamp into the tracked range: minutes past max_minute would raise
        # IndexError and negative minutes would silently wrap to the end.
        m = min(max(m, 0), max_minute)

        other = away_team_id if t == home_team_id else home_team_id

        # goals — an own goal is credited to the opposing team
        if ev["is_goal"]:
            goals[other if ev["is_own_goal"] else t][m] += 1

        # cards
        if ev["card"] == "yellow":
            yellows[t][m] += 1
        if ev["card"] == "red":
            reds[t][m] += 1

        # xT
        # NOTE(review): a pass whose xT delta is exactly 0.0 is not counted
        # towards passes_count; confirm that this is the intended average.
        xt = xt_for_pass(ev, xt_grid)
        if xt != 0.0:
            xt_vals[t][m] += xt
            passes_count[t][m] += 1

    # Cumulative totals, computed once per team rather than once per row.
    def running_totals(per_minute):
        return {t: list(accumulate(per_minute[t])) for t in team_ids}

    cum_goals = running_totals(goals)
    cum_yellows = running_totals(yellows)
    cum_reds = running_totals(reds)
    cum_xt = running_totals(xt_vals)
    cum_passes = running_totals(passes_count)

    data = []
    for minute in range(max_minute + 1):
        for t in team_ids:
            opp = away_team_id if t == home_team_id else home_team_id

            g_for = cum_goals[t][minute]
            g_against = cum_goals[opp][minute]

            # Each side starts with 11 players and loses one per red card.
            p_for = 11 - cum_reds[t][minute]
            p_against = 11 - cum_reds[opp][minute]

            xt_for = cum_xt[t][minute]
            xt_against = cum_xt[opp][minute]

            # max(1, ...) avoids division by zero before the first counted pass.
            avg_team_xt = xt_for / max(1, cum_passes[t][minute])
            avg_opp_xt = xt_against / max(1, cum_passes[opp][minute])

            data.append({
                "match_id": match_id,
                "team_id": t,
                "is_home_team": int(t == home_team_id),
                "time_interval": minute,
                "goals_scored": g_for,
                "player_differential": p_for - p_against,
                "own_yellow_cards": cum_yellows[t][minute],
                "opposition_yellow_cards": cum_yellows[opp][minute],
                "avg_team_xt": avg_team_xt,
                "avg_opp_xt": avg_opp_xt,
                "running_xt_differential": xt_for - xt_against,
                "score_differential": g_for - g_against,
            })

    return pd.DataFrame(data)

In [16]:
import requests

def sofascore_get(url):
    """
    GET a Sofascore API URL with ``requests`` and return the decoded JSON.

    The Sofascore homepage is requested first on the same session so that any
    cookies it sets are sent with the API call. Both requests use a 10 second
    timeout.

    Note: a coroutine with the same name is defined later in this notebook and
    replaces this function once that cell has run.

    Args:
        url: Full API URL to request.

    Returns:
        The parsed JSON body of the API response.

    Raises:
        requests.HTTPError: If the API response has an error status.
    """
    # The context manager closes the session's connections on every exit
    # path, including when raise_for_status() raises.
    with requests.Session() as session:
        # Step 1: load homepage to get cookies
        session.get(
            "https://www.sofascore.com/",
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
            },
            timeout=10,
        )

        # Step 2: call API with cookies + headers
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/121.0.0.0 Safari/537.36"
            ),
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.sofascore.com/",
            "Origin": "https://www.sofascore.com",
        }

        r = session.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.json()


def get_today_matches():
    """Return the payload of the live football events endpoint via ``sofascore_get``."""
    live_events_url = "https://api.sofascore.com/api/v1/sport/football/events/live"
    return sofascore_get(live_events_url)

In [25]:
import asyncio
from playwright.async_api import async_playwright

async def sofascore_get(url: str):
    """
    Async version using Playwright.
    Call this with:  data = await sofascore_get(url)

    The Sofascore homepage is loaded in a headless Chromium page first, then
    ``url`` is requested with ``fetch()`` from inside that page.

    Args:
        url: Full API URL to request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the response status is anything other than 200.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1: load homepage to establish cookies/session
            await page.goto("https://www.sofascore.com/", wait_until="networkidle")

            # Step 2: use fetch inside the page context to call the API.
            # Non-OK responses are returned as {status, error} rather than
            # thrown, so the Python side can report the response body.
            js = """
        async (url) => {
            const resp = await fetch(url, {
                method: 'GET',
                headers: {
                    'Accept': 'application/json, text/plain, */*'
                }
            });
            if (!resp.ok) {
                const text = await resp.text();
                return { status: resp.status, error: text };
            }
            const data = await resp.json();
            return { status: resp.status, data };
        }
        """

            result = await page.evaluate(js, url)
        finally:
            # Close the browser even when navigation or the fetch raises.
            await browser.close()

    if result["status"] != 200:
        raise RuntimeError(f"Sofascore error {result['status']}: {result.get('error')}")
    return result["data"]

In [23]:
async def get_today_matches():
    """Await ``sofascore_get`` on the live football events endpoint and return its result."""
    return await sofascore_get(
        "https://api.sofascore.com/api/v1/sport/football/events/live"
    )

In [26]:
async def get_today_matches():
    """Await ``sofascore_get`` on the live football events endpoint and return its result."""
    return await sofascore_get(
        "https://api.sofascore.com/api/v1/sport/football/events/live"
    )

In [36]:
import asyncio

async def main():
    """Fetch the live football events payload and print it."""
    print(await get_today_matches())


    
# Top-level await: fetch the live-events payload into `matches`.
matches = await get_today_matches()

  matches = await get_today_matches()


In [37]:
# Display the payload returned by get_today_matches().
matches

{'events': [{'tournament': {'name': 'Copa Trinche Rushbet Antioquia - Vacacional La Floresta, Grupo A',
    'slug': 'copa-trinche-rushbet-antioquia-vacacional-la-floresta-grupo-a',
    'category': {'name': 'Colombia Amateur',
     'slug': 'colombia-amateur',
     'sport': {'name': 'Football', 'slug': 'football', 'id': 1},
     'id': 1919,
     'country': {'alpha2': 'CO',
      'alpha3': 'COL',
      'name': 'Colombia',
      'slug': 'colombia'},
     'flag': 'colombia',
     'alpha2': 'CO',
     'fieldTranslations': {'nameTranslation': {'ar': 'كولومبيا هواة'},
      'shortNameTranslation': {}}},
    'uniqueTournament': {'name': 'Copa Trinche Rushbet Antioquia - Vacacional La Floresta',
     'slug': 'copa-trinche-rushbet-antioquia-vacacional-la-floresta',
     'category': {'name': 'Colombia Amateur',
      'slug': 'colombia-amateur',
      'sport': {'name': 'Football', 'slug': 'football', 'id': 1},
      'id': 1919,
      'country': {'alpha2': 'CO',
       'alpha3': 'COL',
       'name'

In [38]:
# Number of entries in the payload's 'events' list.
len(matches['events'])

7

In [39]:
# Print one line per event: its id, then home team name vs away team name.
# Note: after the loop, `event` stays bound to the last entry of the list.
for event in matches["events"]:
    print(event["id"], event["homeTeam"]["name"], "vs", event["awayTeam"]["name"])

15123764 ITM vs Corcasin FC
14970607 LD Alajuelense vs CS Cartaginés
15029788 Club Tijuana vs FC Juárez
15114573 Tepatitlán FC vs CDS Tampico Madero
15119248 Club América vs Tigres UANL
15117913 Saint Louis Billikens vs Kentucky Wildcats
15117915 Oregon State Beavers vs Washington Huskies


In [40]:
# Select the match explicitly rather than relying on the `event` variable
# left over from a previous loop. The last entry is the one that loop left bound.
selected_event = matches["events"][-1]
match_id = selected_event["id"]
home_team_id = selected_event["homeTeam"]["id"]
away_team_id = selected_event["awayTeam"]["id"]

In [44]:
# Fetch the attacks payload for event 15029788 (printed in the listing as
# Club Tijuana vs FC Juárez) and print it.
events = await get_sofascore_events(15029788)
print(events)

RuntimeError: Sofascore error 404: {"error":{"code":404,"message":"Not Found"}}

In [41]:
# Build the feature table for the match identified by match_id, using the
# home/away team ids assigned alongside it.
features_df = build_features(
    match_id=match_id,
    home_team_id=home_team_id,
    away_team_id=away_team_id
)

HTTPError: 403 Client Error: Forbidden for url: https://api.sofascore.com/api/v1/event/15117915/attacks