In [1]:
import pandas as pd
from kloppy.domain.services.transformers.attribute import BodyPartTransformer, AngleToGoalTransformer, DistanceToGoalTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from types import MethodType

from soccer_analytics.data.statsbomb import get_metadata, get_events
from soccer_analytics.data_split import split_by_time

In [2]:
# Load the competition metadata; the cells below read `.name` and `.seasons` from each entry.
competitions = get_metadata()

In [3]:
# Competition name -> names of the seasons whose shot events get loaded.
# Both keys and season names are compared verbatim against the metadata, so the
# spelling (e.g. "Indian Super league") must stay exactly as written.
full_competitions = {
    "1. Bundesliga": ["2015/2016"],
    "FA Women's Super League": [
        "2018/2019",
        "2019/2020",
        "2020/2021",
    ],
    "FIFA World Cup": [
        "2018",
        "2022",
    ],
    "Indian Super league": ["2021/2022"],
    "La Liga": ["2015/2016"],
    "Ligue 1": ["2015/2016"],
    "Premier League": ["2015/2016"],
    "Serie A": ["2015/2016"],
    "UEFA Euro": ["2020"],
    "UEFA Women's Euro": ["2022"],
    "Women's World Cup": [
        "2019",
        "2023",
    ],
}

In [4]:
# Gather the shot events of every selected competition/season, printing the
# number of seasons per competition and of matches per season along the way.
matches_by_competition = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    if competition.name not in full_competitions:
        continue

    wanted_seasons = full_competitions[competition.name]
    # Register the (initially empty) list first so the competition has an entry
    # even when none of its seasons is selected.
    collected = []
    matches_by_competition[competition.name] = collected
    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        if season.name not in wanted_seasons:
            continue
        collected.extend(get_events(season, event_types=["shot"]))

1. Bundesliga 1
    2015/2016: 306
Champions League 18
Copa del Rey 3
FA Women's Super League 3
    2020/2021: 131
    2019/2020: 87
    2018/2019: 108
FIFA U20 World Cup 1
FIFA World Cup 8
    2022: 64
    2018: 64
    1990: 1
    1986: 3
    1974: 6
    1970: 6
    1962: 1
    1958: 2
Indian Super league 1
    2021/2022: 115
La Liga 18
    2020/2021: 35
    2019/2020: 33
    2018/2019: 34
    2017/2018: 36
    2016/2017: 34
    2015/2016: 380
    2014/2015: 38
    2013/2014: 31
    2012/2013: 32
    2011/2012: 37
    2010/2011: 33
    2009/2010: 35
    2008/2009: 31
    2007/2008: 28
    2006/2007: 26
    2005/2006: 17
    2004/2005: 7
    1973/1974: 1
Liga Profesional 2
Ligue 1 1
    2015/2016: 377
North American League 1
NWSL 1
Premier League 2
    2015/2016: 380
    2003/2004: 38
Serie A 2
    2015/2016: 380
    1986/1987: 1
UEFA Euro 1
    2020: 51
UEFA Europa League 1
UEFA Women's Euro 1
    2022: 31
Women's World Cup 2
    2023: 64
    2019: 52


In [5]:
def match_list_to_df(match_list):
    """Build one DataFrame of shot features from a list of matches.

    Each element of ``match_list`` must expose ``to_df(...)``. For every match
    the requested columns are ``is_penalty``, ``is_first_time``,
    ``statsbomb_xg``, ``technique``, the ``coordinates*`` columns, the outputs
    of the angle-to-goal, distance-to-goal and body-part transformers, plus two
    derived columns:

    * ``position`` -- the shooter's position name, or ``"Unknown"`` when the
      player has no position recorded;
    * ``is_goal`` -- ``1`` when the event result is a success, otherwise ``0``.

    Parameters
    ----------
    match_list : iterable
        Matches to convert (presumably the per-match event datasets returned
        by ``get_events`` -- confirm against that helper).

    Returns
    -------
    pandas.DataFrame
        All per-match frames stacked row-wise with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``match_list`` is empty.
    """
    shot_frames = [
        match.to_df(
            "is_penalty",
            "is_first_time",
            "statsbomb_xg",
            "technique",
            "coordinates*",
            AngleToGoalTransformer(),
            DistanceToGoalTransformer(),
            BodyPartTransformer(),
            position=lambda event: (
                "Unknown" if event.player.position is None else event.player.position.name
            ),
            is_goal=lambda event: int(event.result.is_success),
        )
        for match in match_list
    ]
    # Renumber the rows so labels are unique across matches instead of repeating
    # whatever index each per-match frame carries; duplicated labels make
    # label-based lookups and index-aligned assignment ambiguous.
    return pd.concat(shot_frames, ignore_index=True)


class AngleNormalizer(BaseEstimator, TransformerMixin):
    """Replace an angle column by its absolute distance from 90.

    The value written to ``new_variable`` is ``| |angle| - 90 |``; the source
    column ``variable`` is removed from the output. The transformer is
    stateless: ``fit`` learns nothing.
    """

    def __init__(self, variable: str, new_variable: str):
        # Stored under the constructor argument names, unchanged.
        self.variable = variable
        self.new_variable = new_variable

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``variable`` swapped for ``new_variable``."""
        folded_angle = (X[self.variable].abs() - 90).abs()
        # `drop` hands back a new frame, so the caller's DataFrame is not modified.
        result = X.drop(columns=[self.variable])
        result[self.new_variable] = folded_angle
        return result

In [8]:
# Fit one logistic-regression shot model per competition and score it next to
# the StatsBomb xG values on the same evaluation rows.
# Records are accumulated in separate lists so that `loss_info` and
# `coefficient_info` are only ever bound to finished DataFrames.
loss_records = []
coefficient_records = []
for competition_name, matches in matches_by_competition.items():
    # Only the first part returned by split_by_time is used in this cell; the
    # second part is discarded here.
    train_test_matches, _ = split_by_time(matches, test_frac=0.2)
    train_matches, test_matches = train_test_split(
        train_test_matches, test_size=0.2, random_state=235
    )
    train_df = match_list_to_df(train_matches)
    test_df = match_list_to_df(test_matches)

    # `Pipeline` documents `steps` as a list of (name, transformer) tuples.
    data_prep_pipeline = Pipeline([
        ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=True)),
        ("drop_unused_columns", DropFeatures(["statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal"])),
        ("drop_constant_columns", DropConstantFeatures()),
        ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
        ("passthrough", "passthrough"),
    ])
    model = LogisticRegression(penalty=None, max_iter=10000, random_state=315)

    # The preparation steps are fitted on the training rows only and then
    # re-applied unchanged to the evaluation rows.
    train_df_transformed = data_prep_pipeline.fit_transform(train_df)
    test_df_transformed = data_prep_pipeline.transform(test_df)
    # Targets are read from the untransformed frames because the pipeline
    # drops "is_goal" from the feature matrix.
    model.fit(train_df_transformed, train_df["is_goal"])
    test_probs = model.predict_proba(test_df_transformed)[:, 1]

    mae = mean_absolute_error(test_df["is_goal"], test_probs)
    mae_statsbomb = mean_absolute_error(test_df["is_goal"], test_df["statsbomb_xg"])
    brier = brier_score_loss(test_df["is_goal"], test_probs)
    brier_statsbomb = brier_score_loss(test_df["is_goal"], test_df["statsbomb_xg"])
    loss_records.append({
        "competition": competition_name,
        "mae": mae,
        "mae_statsbomb": mae_statsbomb,
        "brier": brier,
        "brier_statsbomb": brier_statsbomb,
    })
    # One (competition, feature, coefficient) row per column of the fitted design matrix.
    coefficient_records.extend(
        (competition_name, coef_name, coef_value)
        for coef_name, coef_value in zip(train_df_transformed.columns, model.coef_[0])
    )
loss_info = pd.DataFrame(loss_records)
coefficient_info = pd.DataFrame(
    coefficient_records, columns=["competition", "coef_name", "coef_value"]
)

In [9]:
# Per-competition MAE and Brier score of the fitted model, next to the same metrics for statsbomb_xg.
loss_info

Unnamed: 0,competition,mae,mae_statsbomb,brier,brier_statsbomb
0,1. Bundesliga,0.159071,0.144773,0.0803,0.069194
1,FA Women's Super League,0.18021,0.165271,0.092913,0.088185
2,FIFA World Cup,0.145026,0.134978,0.069972,0.064855
3,Indian Super league,0.183942,0.151895,0.094218,0.081117
4,La Liga,0.16644,0.151398,0.08579,0.076159
5,Ligue 1,0.162668,0.145112,0.084874,0.077408
6,Premier League,0.152728,0.142464,0.074902,0.070332
7,Serie A,0.140389,0.134125,0.070098,0.069914
8,UEFA Euro,0.176905,0.17228,0.099426,0.092162
9,UEFA Women's Euro,0.157715,0.121463,0.093504,0.05007


In [10]:
# One row per (competition, feature) pair holding the fitted logistic-regression coefficient.
coefficient_info

Unnamed: 0,competition,coef_name,coef_value
0,1. Bundesliga,is_penalty,1.767217
1,1. Bundesliga,is_first_time,-0.271448
2,1. Bundesliga,distance_to_goal,-0.204031
3,1. Bundesliga,is_body_part_right_foot,1.846472
4,1. Bundesliga,is_body_part_left_foot,1.618581
...,...,...,...
391,Women's World Cup,position_Center Back,-1.915062
392,Women's World Cup,position_Left Attacking Midfield,-1.246888
393,Women's World Cup,position_Left Wing Back,-0.415338
394,Women's World Cup,position_Right Wing Back,-8.225015


In [12]:
# The "is_first_time" coefficient of every competition, smallest value first.
(
    coefficient_info
    .loc[coefficient_info["coef_name"] == "is_first_time"]
    .sort_values("coef_value")
)

Unnamed: 0,competition,coef_name,coef_value
114,Indian Super league,is_first_time,-0.385813
328,UEFA Women's Euro,is_first_time,-0.33275
149,La Liga,is_first_time,-0.329625
1,1. Bundesliga,is_first_time,-0.271448
183,Ligue 1,is_first_time,-0.132208
361,Women's World Cup,is_first_time,-0.092106
256,Serie A,is_first_time,-0.076054
75,FIFA World Cup,is_first_time,-0.059197
219,Premier League,is_first_time,-0.014923
37,FA Women's Super League,is_first_time,0.042539
