In [96]:
import pandas as pd
from kloppy.domain.services.transformers.attribute import BodyPartTransformer, AngleToGoalTransformer, DistanceToGoalTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from types import MethodType

from soccer_analytics.data.statsbomb import get_metadata, get_events
from soccer_analytics.data_split import split_by_time

In [2]:
# Load the competition metadata. The result is iterated further down, where
# each entry is read through its `.name` and `.seasons` attributes.
competitions = get_metadata()

In [3]:
# Selection of data to load: competition name -> list of season names.
# Only the uncommented entries are used; the commented-out entries are other
# competition/season combinations that can be switched on by uncommenting them.
full_competitions = {
   "1. Bundesliga": ["2015/2016"],
#     "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
#     "FIFA World Cup": ["2018", "2022"],
#     "Indian Super league": ["2021/2022"],
#     "La Liga": ["2015/2016"],
#     "Ligue 1": ["2015/2016"],
#     "Premier League": ["2015/2016"],
#     "Serie A": ["2015/2016"],
#     "UEFA Euro": ["2020"],
#     "UEFA Women's Euro": ["2022"],
#     "Women's World Cup": ["2019", "2023"]
}

In [4]:
# Walk every available competition, print an overview of what exists, and
# gather the shot events of each season selected in `full_competitions`.
# Result: competition name -> flat list of whatever `get_events` yields.
matches_by_competition = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    if competition.name not in full_competitions:
        continue

    selected_seasons = full_competitions[competition.name]
    competition_matches = []
    matches_by_competition[competition.name] = competition_matches

    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        if season.name not in selected_seasons:
            continue
        # Only shot events are requested from the loader.
        competition_matches.extend(get_events(season, event_types=["shot"]))

1. Bundesliga 1
    2015/2016: 306
Champions League 18
Copa del Rey 3
FA Women's Super League 3
FIFA U20 World Cup 1
FIFA World Cup 8
Indian Super league 1
La Liga 18
Liga Profesional 2
Ligue 1 1
North American League 1
NWSL 1
Premier League 2
Serie A 2
UEFA Euro 1
UEFA Europa League 1
UEFA Women's Euro 1
Women's World Cup 2


In [108]:
class MatchListToDf(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens a list of matches into one DataFrame.

    Every element of the input must offer a ``to_df`` method; this class calls
    it with a fixed column specification and stacks the per-match frames.
    Nothing is learned in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the transformer has no state)."""
        return self

    @staticmethod
    def _match_to_frame(match):
        """Export one match to a DataFrame with the columns used by the model."""
        return match.to_df(
            "is_penalty",
            "is_first_time",
            "statsbomb_xg",
            "technique",
            "coordinates*",
            AngleToGoalTransformer(),
            DistanceToGoalTransformer(),
            BodyPartTransformer(),
            # Events whose player has no position are labelled "Unknown".
            position=lambda event: "Unknown" if event.player.position is None else event.player.position.name,
            # Binary target: 1 when the event result is a success, else 0.
            is_goal=lambda event: int(event.result.is_success)
        )

    def transform(self, match_list, y=None):
        """Convert every match to a frame and concatenate them row-wise.

        Parameters
        ----------
        match_list : iterable
            Objects exposing ``to_df`` (see class docstring).
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            All rows of all matches with a fresh, unique 0..n-1 index.

        Raises
        ------
        ValueError
            Raised by ``pd.concat`` when ``match_list`` is empty.
        """
        frames = [self._match_to_frame(match) for match in match_list]
        # Each per-match frame brings its own row labels (presumably 0..k-1 for
        # every match -- TODO confirm). Renumbering avoids duplicate index
        # labels, which break index-aligned operations in later steps.
        return pd.concat(frames, ignore_index=True)

class AngleNormalizer(BaseEstimator, TransformerMixin):
    """Swap an angle column for its folded counterpart ``| |angle| - 90 |``.

    Parameters
    ----------
    variable : str
        Name of the existing angle column; it is removed from the output.
    new_variable : str
        Name of the column that receives the folded value.

    The constant 90 suggests the angle is expressed in degrees -- TODO confirm
    against the producer of ``variable``.
    """

    def __init__(self, variable: str, new_variable: str):
        self.variable = variable
        self.new_variable = new_variable

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without ``variable`` and with ``new_variable`` added."""
        # `drop` hands back a new frame, so the caller's X is left untouched.
        result = X.drop(columns=self.variable)
        result[self.new_variable] = X[self.variable].abs().sub(90).abs()
        return result

    
class FitModel(BaseEstimator, TransformerMixin):
    """Wrap a model so it can consume a DataFrame that still holds the target column.

    Attribute access that this wrapper cannot satisfy itself is forwarded to
    ``model_instance``. Bound methods of the model come back wrapped so that a
    leading DataFrame argument is split into features and target before the
    real method runs; any other attribute (for example ``coef_``) is returned
    unchanged.

    Parameters
    ----------
    model_instance : object
        The model every unknown attribute lookup is delegated to.
    target_variable : str
        Name of the target column inside incoming DataFrames.

    Attributes
    ----------
    columns_ : pandas.Index
        Feature columns of the most recent two-argument call (e.g. ``fit``).
    """

    def __init__(self, model_instance, target_variable: str):
        self.model_instance = model_instance
        self.target_variable = target_variable

    def __getattr__(self, name: str):
        """Delegate ``name`` to the wrapped model, wrapping bound methods.

        Raises
        ------
        AttributeError
            If the wrapped model has no such attribute, or if
            ``model_instance`` itself has not been set on this object.
        """
        # __getattr__ only runs after normal lookup failed. If `model_instance`
        # is the missing name, reading it below would re-enter this method and
        # recurse without end, so fail fast instead.
        if name == "model_instance":
            raise AttributeError(name)

        obj = getattr(self.model_instance, name)
        # `MethodType` is imported directly (`from types import MethodType`);
        # the module name `types` is not bound in this notebook.
        if not isinstance(obj, MethodType):
            return obj

        def extraction_wrapper(*args, **kwargs):
            if not args or not isinstance(args[0], pd.DataFrame):
                return obj(*args, **kwargs)

            frame = args[0]
            if len(args) == 2:
                # (X, y) call style: the target is always taken from the
                # frame; the positional y that was passed in is ignored.
                y = frame[self.target_variable]
                X = frame.drop(self.target_variable, axis=1)
                self.columns_ = X.columns
                return obj(X, y, **kwargs)
            if len(args) == 1 and self.target_variable in frame.columns:
                # X-only call style (e.g. predict): the model was fitted
                # without the target column, so it must not see it here.
                return obj(frame.drop(self.target_variable, axis=1), **kwargs)
            return obj(*args, **kwargs)

        return extraction_wrapper
        
        

# Data split: `split_by_time` separates a validation set from the rest, and
# the rest is then split randomly into train (80%) and test (20%).
all_matches = matches_by_competition["1. Bundesliga"]
train_test_matches, validation_matches = split_by_time(all_matches)
train_matches, test_matches = train_test_split(
    train_test_matches, test_size=0.2, random_state=235
)

# Model pipeline. `steps` is passed as a list of (name, estimator) tuples,
# which is the form documented by scikit-learn.
pipeline = Pipeline([
    # Match objects -> one DataFrame (features plus the `is_goal` target).
    ("convert_to_df", MatchListToDf()),
    ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=True)),
    ("drop_unused_columns", DropFeatures(["statsbomb_xg", "coordinates_x", "coordinates_y"])),
    ("drop_constant_columns", DropConstantFeatures()),
    ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
    # Disabled steps, kept for reference:
    #("scale", SklearnTransformerWrapper(transformer=StandardScaler(), variables=["angle_from_goal", "distance_to_goal"])),
    #("passthrough", "passthrough"),
    # Final step: splits `is_goal` off the frame and fits an unpenalised
    # logistic regression on the remaining columns.
    ("fit_model", FitModel(LogisticRegression(penalty=None, max_iter=10000, random_state=315), "is_goal")),
])

In [109]:
#pipeline.fit_transform(train_matches).head()

In [116]:
# Fit on the training matches, then show the learned coefficients (labelled
# with the feature columns recorded during fit) together with the intercept.
test = pipeline.fit(train_matches)
final_estimator = test.steps[-1][1]
(
    pd.Series(final_estimator.coef_[0], index=final_estimator.columns_),
    final_estimator.intercept_,
)

(is_penalty                            1.767217
 is_first_time                        -0.271448
 distance_to_goal                     -0.204031
 is_body_part_right_foot               1.846472
 is_body_part_left_foot                1.618581
 is_body_part_head                    -0.333182
 is_body_part_other                   -0.460701
 technique_Normal                     -0.823287
 technique_Half Volley                -1.525522
 technique_Volley                     -1.681089
 technique_Overhead Kick              -2.636350
 technique_Diving Header               0.752705
 technique_Backheel                   -3.234471
 position_Center Attacking Midfield   -1.329818
 position_Right Wing                  -1.152341
 position_Right Defensive Midfield    -1.360647
 position_Left Center Back            -2.019789
 position_Left Back                   -1.201519
 position_Left Wing                   -1.175909
 position_Center Forward              -1.477697
 position_Right Center Back           -1