The baseline expected-goals (xG) estimate is simply the overall scoring percentage: the fraction of all shots that result in a goal.

In [1]:
import pathlib
import requests

from dataclasses import dataclass
import json
from kloppy import statsbomb
from kloppy.domain import EventFactory, create_event, ShotEvent


from soccer_analytics.data.statsbomb import get_metadata

In [2]:
@dataclass(repr=False)
class StatsBombShotEvent(ShotEvent):
    """Shot event extended with two StatsBomb-specific fields."""

    # Filled from the raw event's shot["statsbomb_xg"] entry by the event
    # factory in this notebook; stays None when not supplied.
    statsbomb_xg: float = None
    # Set by the event factory to True when the raw event's shot type name
    # is "Penalty".
    is_penalty: bool = False
    
    
class StatsBombEventFactory(EventFactory):
    """Event factory that builds shots as ``StatsBombShotEvent`` instances."""

    def build_shot(self, **kwargs) -> ShotEvent:
        """Create a shot event enriched with fields from the raw record.

        Reads ``kwargs["raw_event"]["shot"]`` and adds ``statsbomb_xg`` and
        ``is_penalty`` to the keyword arguments before handing them to
        ``create_event``.

        Raises:
            KeyError: If the raw event lacks any of the keys read here.
        """
        shot_details = kwargs["raw_event"]["shot"]
        kwargs.update(
            statsbomb_xg=shot_details["statsbomb_xg"],
            is_penalty=shot_details["type"]["name"] == "Penalty",
        )
        return create_event(StatsBombShotEvent, **kwargs)


# Single factory instance passed to the kloppy loaders in this notebook.
event_factory = StatsBombEventFactory()

In [3]:
# Competition metadata from the project's StatsBomb helper. The download loop
# further down iterates it and reads each competition's name and seasons.
competitions = get_metadata()

In [6]:
# Local cache layout: one JSON file per match, for events and for lineups.
p = pathlib.Path("../../data/statsbomb/")
lineup_dir = p / "lineups"
event_dir = p / "events"
lineup_dir.mkdir(parents=True, exist_ok=True)
event_dir.mkdir(parents=True, exist_ok=True)

OPEN_DATA_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
DOWNLOAD_TIMEOUT = 60  # seconds to wait on the server before giving up


def _download_if_missing(url, destination, timeout=DOWNLOAD_TIMEOUT):
    """Cache the JSON document at ``url`` in ``destination`` unless it exists.

    Args:
        url: Address of the JSON document to fetch.
        destination: ``pathlib.Path`` of the local cache file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    if destination.is_file():
        return
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Decode BEFORE opening the file: if decoding fails, no empty file is left
    # behind to be mistaken for a finished download on the next run.
    payload = response.json()
    with open(destination, "w") as f:
        json.dump(payload, f)


# shot_percentage[competition name][season name] -> list of
# (num_shots, num_goals) tuples, one per successfully parsed match.
shot_percentage = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    shot_percentage[competition.name] = {}
    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        season_totals = []
        shot_percentage[competition.name][season.name] = season_totals
        for i, match in enumerate(season.matches):
            event_file = event_dir / f"{match.match_id}.json"
            lineup_file = lineup_dir / f"{match.match_id}.json"
            _download_if_missing(
                f"{OPEN_DATA_URL}/events/{match.match_id}.json", event_file
            )
            _download_if_missing(
                f"{OPEN_DATA_URL}/lineups/{match.match_id}.json", lineup_file
            )
            try:
                events = statsbomb.load(
                    event_file, lineup_file,
                    event_types=["shot"], coordinates="statsbomb", event_factory=event_factory
                )
            except json.JSONDecodeError:
                print(f"Parse error for match_id {match.match_id}")
                # Skip this match. Falling through would either raise a
                # NameError (first match) or silently record the previous
                # match's shots a second time.
                continue
            num_shots = len(events)
            num_goals = sum(1 for event in events if event.result.is_success)
            listed_goals = match.home_score + match.away_score
            if i % 20 == 0:
                # Progress line for every 20th match of the season.
                print(
                    f"        {i+1}/{len(season.matches)} {match.home_team} — {match.away_team}:"
                    f"Listed goals = {listed_goals}; Calculated goals = {num_goals}"
                )
            if num_goals > listed_goals:  # < is ok because of own goals
                print(f"            MISMATCH: {match.match_id}")
            season_totals.append((num_shots, num_goals))
                
        

1. Bundesliga 1
    2015/2016: 306
        1/306 Hoffenheim — Schalke 04:Listed goals = 5; Calculated goals = 4
        21/306 FSV Mainz 05 — Hertha Berlin:Listed goals = 0; Calculated goals = 0
        41/306 Augsburg — FC Köln:Listed goals = 0; Calculated goals = 0
        61/306 Schalke 04 — Borussia Dortmund:Listed goals = 4; Calculated goals = 4
        81/306 FC Köln — Bayern Munich:Listed goals = 1; Calculated goals = 1
        101/306 Wolfsburg — Borussia Mönchengladbach:Listed goals = 3; Calculated goals = 3
        121/306 Schalke 04 — VfB Stuttgart:Listed goals = 2; Calculated goals = 2
        141/306 Hertha Berlin — Borussia Dortmund:Listed goals = 0; Calculated goals = 0
        161/306 Hoffenheim — Bayer Leverkusen:Listed goals = 2; Calculated goals = 2


KeyboardInterrupt: 

In [None]:
# Read the same raw file in two shapes: as a list of lines and as one string.
source_path = pathlib.Path("3845506.json")
with source_path.open("r") as handle:
    raw_data_list = list(handle)
raw_data = source_path.read_text()

In [None]:
# Keep only the items of test_data whose result is flagged as a success.
# NOTE(review): test_data is not defined in any cell shown here — confirm its origin.
list(filter(lambda shot: shot.result.is_success, test_data))

In [None]:
# Show the raw record attached to the last item of test_data.
# NOTE(review): test_data is not defined in any cell shown here — confirm its origin.
test_data[-1].raw_event

In [None]:


# Load only the shot events of match 3794691 through kloppy's open-data
# loader, with StatsBomb coordinates and the custom event factory.
dataset = statsbomb.load_open_data(
    3794691, event_types=["shot"], coordinates="statsbomb", event_factory=event_factory
)

In [None]:
# Attribute names of the last item of dataset, leaving out the "__"-prefixed ones.
[name for name in dir(dataset[-1]) if name[:2] != "__"]

In [None]:
# Attribute names of the last item of test_data, leaving out the "__"-prefixed ones.
# NOTE(review): test_data is not defined in any cell shown here — confirm its origin.
[name for name in dir(test_data[-1]) if name[:2] != "__"]

In [None]:
# Load match 3890561 from the local cache files (events file first, then the
# lineups file); no event filter, coordinate system or factory is passed.
data = statsbomb.load(
    "../../data/statsbomb/events/3890561.json",
    "../../data/statsbomb/lineups/3890561.json"
)

In [None]:
# Fetch the events file for match 3890561 directly from the open-data repository.
# The URL has no placeholders, so it is a plain string rather than an f-string.
test = requests.get(
    "https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/3890561.json",
    timeout=60,  # never hang indefinitely on an unresponsive server
)
# Surface an HTTP error here rather than as a confusing JSON decode error later.
test.raise_for_status()

In [None]:
# Decode the response before opening the output file, so a body that is not
# valid JSON cannot leave an empty test.json behind.
payload = test.json()
with open("test.json", "w") as f:
    json.dump(payload, f)
# Uncomment to display the decoded payload in the notebook:
# test.json()