In [1]:
import pandas as pd
import dill

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from types import MethodType

from soccer_analytics.data.statsbomb import get_metadata, get_events
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, AngleNormalizer

In [2]:
# Load competition metadata via the project's `statsbomb` data module.
# Later cells iterate over the result and read each entry's `.name` and `.seasons`.
competitions = get_metadata()

In [3]:
# Competition name -> names of the seasons to include.
# Only (competition, season) pairs listed here have their shot events loaded.
full_competitions = {
    "1. Bundesliga": ["2015/2016"],
    "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
    "FIFA World Cup": ["2018", "2022"],
    "Indian Super league": ["2021/2022"],
    "La Liga": ["2015/2016"],
    "Ligue 1": ["2015/2016"],
    "Premier League": ["2015/2016"],
    "Serie A": ["2015/2016"],
    "UEFA Euro": ["2020"],
    "UEFA Women's Euro": ["2022"],
    "Women's World Cup": ["2019", "2023"],
}

In [4]:
# Gather shot events per competition, limited to the seasons listed in `full_competitions`.
# Every competition and (for selected competitions) every season is printed along the way.
matches_by_competition = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    if competition.name not in full_competitions:
        continue

    wanted_seasons = full_competitions[competition.name]
    # Register the competition up front, so it has an (empty) entry even if no season matches.
    competition_matches = matches_by_competition[competition.name] = []
    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        if season.name not in wanted_seasons:
            continue
        competition_matches.extend(get_events(season, event_types=["shot"]))

1. Bundesliga 1
    2015/2016: 306
Champions League 18
Copa del Rey 3
FA Women's Super League 3
    2020/2021: 131
    2019/2020: 87
    2018/2019: 108
FIFA U20 World Cup 1
FIFA World Cup 8
    2022: 64
    2018: 64
    1990: 1
    1986: 3
    1974: 6
    1970: 6
    1962: 1
    1958: 2
Indian Super league 1
    2021/2022: 115
La Liga 18
    2020/2021: 35
    2019/2020: 33
    2018/2019: 34
    2017/2018: 36
    2016/2017: 34
    2015/2016: 380
    2014/2015: 38
    2013/2014: 31
    2012/2013: 32
    2011/2012: 37
    2010/2011: 33
    2009/2010: 35
    2008/2009: 31
    2007/2008: 28
    2006/2007: 26
    2005/2006: 17
    2004/2005: 7
    1973/1974: 1
Liga Profesional 2
Ligue 1 3
    2022/2023: 32
    2021/2022: 26
    2015/2016: 377
Major League Soccer 1
North American League 1
NWSL 1
Premier League 2
    2015/2016: 380
    2003/2004: 38
Serie A 2
    2015/2016: 380
    1986/1987: 1
UEFA Euro 1
    2020: 51
UEFA Europa League 1
UEFA Women's Euro 1
    2022: 31
Women's World Cup 2
 

In [5]:
# Build three match lists, splitting each competition separately so that every
# competition contributes to train, test and validation.
train_matches = []
test_matches = []
validation_matches = []
for competition_name, matches in matches_by_competition.items():
    if not matches:
        # Nothing was loaded for this competition, so there is nothing to split.
        continue

    # Out-of-time validation: last 20% of matches by time in the competition.
    in_time_matches, out_of_time_matches = split_by_time(matches, test_frac=0.2)

    # In-time test: random 20% of matches from the first 80% of time in the competition.
    in_time_train, in_time_test = train_test_split(
        in_time_matches, test_size=0.2, random_state=235
    )

    train_matches.extend(in_time_train)
    test_matches.extend(in_time_test)
    validation_matches.extend(out_of_time_matches)

# Sizes of the (train, test, validation) match lists.
tuple(map(len, (train_matches, test_matches, validation_matches)))

(1650, 417, 523)

In [6]:
def _without_penalties(shots):
    """Return only the rows of ``shots`` whose ``is_penalty`` value equals False."""
    return shots.loc[shots["is_penalty"].eq(False)]


# Convert the match lists to DataFrames and keep non-penalty rows only.
train_df = _without_penalties(match_list_to_df(train_matches))
test_df = _without_penalties(match_list_to_df(test_matches))

# Feature preparation steps. `is_goal` is among the dropped columns because it is the
# label used when fitting the model, not an input feature.
data_prep_steps = [
    (
        "drop_unused_columns",
        DropFeatures(
            ["statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal", "technique", "position"]
        ),
    ),
    ("drop_constant_columns", DropConstantFeatures()),
    (
        "create_angle_from_goal",
        AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal"),
    ),
    # Final no-op step, kept so the pipeline layout stays the same.
    ("passthrough", "passthrough"),
]
data_prep_pipeline = Pipeline(data_prep_steps)

# Fit on the training rows only, then apply the fitted pipeline to the test rows.
train_df_transformed = data_prep_pipeline.fit_transform(train_df)
test_df_transformed = data_prep_pipeline.transform(test_df)
train_df_transformed.head()

Unnamed: 0,is_first_time,distance_to_goal,is_body_part_right_foot,is_body_part_left_foot,is_body_part_head,is_body_part_other,angle_from_goal
0,False,23.640326,False,True,False,False,12.831123
1,True,25.948892,True,False,False,False,50.943628
2,False,18.1087,True,False,False,False,46.566291
3,False,32.041926,False,True,False,False,39.427802
4,False,10.08489,False,False,True,False,63.816068


In [7]:
# Fit a logistic regression without a penalty term on the prepared training features,
# using `is_goal` as the label.
train_labels = train_df["is_goal"]
model = LogisticRegression(penalty=None, max_iter=10000, random_state=315).fit(
    train_df_transformed, train_labels
)

# predict_proba returns one column per class; column 1 is kept as the predicted probability.
test_class_probs = model.predict_proba(test_df_transformed)
test_probs = test_class_probs[:, 1]
len(test_probs)

10330

In [8]:
# Score the in-time test predictions against the observed `is_goal` labels.
test_goal_labels = test_df["is_goal"]
brier = brier_score_loss(test_goal_labels, test_probs)
auroc = roc_auc_score(test_goal_labels, test_probs)
(brier, auroc)

(0.0804275739307427, 0.7652322596671359)

In [9]:
# One row per input feature, pairing the column name with its fitted coefficient.
# model.coef_[0] is the single coefficient row; the intercept is not included.
coefficient_info = pd.DataFrame(
    {
        "coef_name": list(train_df_transformed.columns),
        "coef_value": model.coef_[0],
    }
)
coefficient_info

Unnamed: 0,coef_name,coef_value
0,is_first_time,-0.304899
1,distance_to_goal,-0.160741
2,is_body_part_right_foot,0.59708
3,is_body_part_left_foot,0.501881
4,is_body_part_head,-0.781629
5,is_body_part_other,0.128582
6,angle_from_goal,-0.009846
