In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, roc_auc_score

from soccer_analytics.data.statsbomb import get_metadata, get_events
from soccer_analytics.data_split import split_by_time

In [2]:
# Load the competition metadata. Later cells iterate over this collection and
# read `.name` and `.seasons` from each entry (and `.matches` from each season).
competitions = get_metadata()

In [3]:
# Whitelist of competition name -> season names to load shot events for.
# Names must match `competition.name` / `season.name` exactly (they are compared
# with `in` when the matches are collected).
# NOTE(review): judging by the match counts printed when loading (e.g. La Liga
# 2015/2016 has 380 matches vs ~30 for other seasons), these appear to be the
# seasons with complete match coverage -- confirm against the data provider.
full_competitions = {
    "1. Bundesliga": ["2015/2016"],
    "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
    "FIFA World Cup": ["2018", "2022"],
    "Indian Super league": ["2021/2022"],
    "La Liga": ["2015/2016"],
    "Ligue 1": ["2015/2016"],
    "Premier League": ["2015/2016"],
    "Serie A": ["2015/2016"],
    "UEFA Euro": ["2020"],
    "UEFA Women's Euro": ["2022"],
    "Women's World Cup": ["2019", "2023"]
}

In [4]:
# Gather the shot events of every whitelisted competition/season pair, keyed by
# competition name. Every competition is printed with its season count; the
# per-season match counts are printed only for whitelisted competitions.
matches_by_competition = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    if competition.name not in full_competitions:
        continue

    wanted_seasons = full_competitions[competition.name]
    # The entry is created even when no season ends up matching, so it may stay empty.
    competition_matches = []
    matches_by_competition[competition.name] = competition_matches
    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        if season.name not in wanted_seasons:
            continue
        # Only shot events are requested; other event types are not loaded.
        season_matches = get_events(season, event_types=["shot"])
        competition_matches.extend(season_matches)

1. Bundesliga 1
    2015/2016: 306
Champions League 18
Copa del Rey 3
FA Women's Super League 3
    2020/2021: 131
    2019/2020: 87
    2018/2019: 108
FIFA U20 World Cup 1
FIFA World Cup 8
    2022: 64
    2018: 64
    1990: 1
    1986: 3
    1974: 6
    1970: 6
    1962: 1
    1958: 2
Indian Super league 1
    2021/2022: 115
La Liga 18
    2020/2021: 35
    2019/2020: 33
    2018/2019: 34
    2017/2018: 36
    2016/2017: 34
    2015/2016: 380
    2014/2015: 38
    2013/2014: 31
    2012/2013: 32
    2011/2012: 37
    2010/2011: 33
    2009/2010: 35
    2008/2009: 31
    2007/2008: 28
    2006/2007: 26
    2005/2006: 17
    2004/2005: 7
    1973/1974: 1
Liga Profesional 2
Ligue 1 3
    2022/2023: 32
    2021/2022: 26
    2015/2016: 377
Major League Soccer 1
North American League 1
NWSL 1
Premier League 2
    2015/2016: 380
    2003/2004: 38
Serie A 2
    2015/2016: 380
    1986/1987: 1
UEFA Euro 1
    2020: 51
UEFA Europa League 1
UEFA Women's Euro 1
    2022: 31
Women's World Cup 2
 

In [5]:
# Split the matches into train / test / validation sets. The split is done per
# competition, so each competition with data contributes to all three sets.
train_matches, test_matches, validation_matches = [], [], []
for competition_name, matches in matches_by_competition.items():
    if len(matches) == 0:
        # Whitelisted competition for which nothing was loaded.
        continue

    # Out-of-time validation: the last 20% of the competition's matches by time.
    in_time_matches, out_of_time_matches = split_by_time(matches, test_frac=0.2)

    # In-time test: a random 20% of the matches from the first 80% of time.
    # The seed is fixed so the split is reproducible across runs.
    fit_matches, holdout_matches = train_test_split(
        in_time_matches, test_size=0.2, random_state=235
    )

    train_matches.extend(fit_matches)
    test_matches.extend(holdout_matches)
    validation_matches.extend(out_of_time_matches)

# Display the size of each split as (train, test, validation).
len(train_matches), len(test_matches), len(validation_matches)

(1650, 417, 523)

In [6]:
# Build two parallel lists over the test-set shots: the observed outcome
# (1 = goal, 0 = no goal) and the xG value StatsBomb assigned to the shot.
are_goals, statsbomb_xg = [], []
for match in test_matches:
    for shot in match.events:
        # Leave out penalties and shots that carry no freeze frame.
        if not shot.freeze_frame or shot.is_penalty:
            continue
        scored = int(shot.result.is_success)
        are_goals.append(scored)
        statsbomb_xg.append(shot.statsbomb_xg)

# Both lengths are displayed; they should be equal.
len(are_goals), len(statsbomb_xg)

(10330, 10330)

In [7]:
# Baseline: score StatsBomb's own xG against the actual outcomes of the
# in-time test shots. Displayed as (Brier score, ROC AUC).
statsbomb_brier = brier_score_loss(are_goals, statsbomb_xg)
statsbomb_auc = roc_auc_score(are_goals, statsbomb_xg)
statsbomb_brier, statsbomb_auc

(0.07336164226558287, 0.8095481597250973)