In [1]:
import pandas as pd
import dill
import warnings

from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from xgboost import XGBClassifier

from soccer_analytics.data.statsbomb import get_metadata, get_events, load_competition_seasons
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, AngleNormalizer

In [2]:
competitions = get_metadata()

In [3]:
full_competitions = {
#     "1. Bundesliga": ["2015/2016"],
#     "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
#     "FIFA World Cup": ["2018", "2022"],
#     "Indian Super league": ["2021/2022"],
#     "La Liga": ["2015/2016"],
#     "Ligue 1": ["2015/2016"],
#     "Premier League": ["2015/2016"],
#     "Serie A": ["2015/2016"],
#     "UEFA Euro": ["2020"],
    "UEFA Women's Euro": ["2022"],
#     "Women's World Cup": ["2019", "2023"]
}

In [4]:
matches_by_competition = {}
for competition_name, seasons in full_competitions.items():
    print(competition_name, len(seasons))
    matches_by_competition[competition_name] = load_competition_seasons(
        competition_name, seasons, event_types=["shot"]
    )

UEFA Women's Euro 1


In [5]:
euro_matches = matches_by_competition["UEFA Women's Euro"]
euro_df = match_list_to_df(euro_matches)
euro_df.head()

Unnamed: 0,is_penalty,is_first_time,statsbomb_xg,technique,coordinates_x,coordinates_y,angle_to_goal,distance_to_goal,is_body_part_right_foot,is_body_part_left_foot,...,is_body_part_both_hands,is_body_part_chest,is_body_part_left_hand,is_body_part_right_hand,is_body_part_drop_kick,is_body_part_keeper_arm,is_body_part_other,is_body_part_no_touch,position,is_goal
0,False,False,0.050663,Normal,105.65,31.85,60.40584,16.502879,False,False,...,False,False,False,False,False,False,False,False,Left Wing,0
1,False,False,0.054719,Normal,103.25,46.25,110.462272,17.878059,True,False,...,False,False,False,False,False,False,False,False,Center Attacking Midfield,0
2,False,True,0.065366,Half Volley,104.35,45.25,108.544711,16.50712,True,False,...,False,False,False,False,False,False,False,False,Left Defensive Midfield,0
3,False,False,0.105761,Normal,111.25,48.55,134.337651,12.233765,False,False,...,False,False,False,False,False,False,False,False,Center Forward,0
4,False,False,0.077067,Normal,108.65,30.45,49.922418,14.83324,False,True,...,False,False,False,False,False,False,False,False,Left Wing,0


In [11]:
euro_df.groupby("technique").size()

technique
Backheel           5
Diving Header      2
Half Volley      133
Lob                5
Normal           672
Overhead Kick      1
Volley            63
dtype: int64

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
class ConsolidateBodyParts(BaseEstimator, TransformerMixin):
    def __init__(self, body_parts_to_keep=["is_body_part_right_foot", "is_body_part_left_foot", "is_body_part_head"]):
        self.body_parts_to_keep = body_parts_to_keep
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        assert "is_body_part_other" in X.columns
        body_parts_to_drop = []
        for column in X.columns:
            if (
                column.startswith("is_body_part") and 
                column not in self.body_parts_to_keep and 
                column != "is_body_part_other"
            ):
                X["is_body_part_other"] = X["is_body_part_other"] | X[column]
                body_parts_to_drop.append(column)
                
        X = X.drop(body_parts_to_drop, axis=1)
        return X
    
class ConsolidateCategories(BaseEstimator, TransformerMixin):
    def __init__(self, column_name, categories_to_keep, other_category_name="other"):
        self.column_name = column_name
        self.categories_to_keep = categories_to_keep
        self.other_category_name = other_category_name
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        other_records = (X[self.column_name].isin(self.categories_to_keep) == False)
        X.loc[other_records, self.column_name] = self.other_category_name
        return X

In [19]:
loss_info = []
model_info = {}
train_data = {}
test_data = {}
for competition_name, matches in matches_by_competition.items():
    print(competition_name)
    train_test_matches, _ = split_by_time(matches, test_frac=0.2)
    train_matches, test_matches = train_test_split(
        train_test_matches, test_size=0.2, random_state=235
    )
    train_df = match_list_to_df(train_matches)
    test_df = match_list_to_df(test_matches)
    train_data[competition_name] = train_df.copy()
    test_data[competition_name] = test_df.copy()
    data_prep_pipeline = Pipeline((
        ("consolidate_techniques", ConsolidateCategories("technique", ["Normal", "Half Volley", "Volley"])),
        ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=False)),
        ("consolidate_body_parts", ConsolidateBodyParts()),
        ("drop_unused_columns", DropFeatures(["statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal"])),
        ("drop_constant_columns", DropConstantFeatures()),
        ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
        ("passthrough", "passthrough"),
    ))
    # See https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst for parameter
    # definitions
    model = XGBClassifier(
        max_depth=5,
        learning_rate=0.3,
        gamma=0,
        min_child_weight=1,
        subsample=1,
        reg_lambda=1,
        reg_alpha=0,
        eval_metric="mae",
        early_stopping_rounds=5, 
        objective="reg:squarederror",
        monotone_constraints=None,
        random_state=2309
    )
    
    param_grid = {
        "max_depth": [3, 4, 5, 6, 7],
        "learning_rate": [0.3, 0.6],
        "min_child_weight": [1, 5, 10]
    }
    #param_grid = {"max_depth": [5], "learning_rate": [0.3], "min_child_weight": [1]}
    
    model = GridSearchCV(model, param_grid)
    train_df_transformed = data_prep_pipeline.fit_transform(train_df)
    test_df_transformed = data_prep_pipeline.transform(test_df)
    
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        model.fit(
            train_df_transformed, train_df["is_goal"], 
            eval_set=[(test_df_transformed, test_df["is_goal"])],
            verbose=False
        )
        test_probs = model.predict_proba(test_df_transformed)[:,1]
    mae = mean_absolute_error(test_df["is_goal"], test_probs)
    mae_statsbomb = mean_absolute_error(test_df["is_goal"], test_df["statsbomb_xg"])
    mse = mean_squared_error(test_df["is_goal"], test_probs)
    mse_statsbomb = mean_squared_error(test_df["is_goal"], test_df["statsbomb_xg"])
    auroc = roc_auc_score(test_df["is_goal"], test_probs)
    auroc_statsbomb = roc_auc_score(test_df["is_goal"], test_df["statsbomb_xg"])
    loss_info.append({
        "competition": competition_name, 
        "mae": mae,
        "mae_statsbomb": mae_statsbomb,
        "mse": mse,
        "mse_statsbomb": mse_statsbomb,
        "auc": auroc,
        "auc_statsbomb": auroc_statsbomb
    })
    model_info[competition_name] = {"data_prep_pipeline": data_prep_pipeline, "model": model}
    
with open("gbm_feature_engineered.dill", "wb") as f:
    dill.dump(model_info, f)
loss_info = pd.DataFrame(loss_info)

UEFA Women's Euro


In [20]:
loss_info

Unnamed: 0,competition,mae,mae_statsbomb,mse,mse_statsbomb,auc,auc_statsbomb
0,UEFA Women's Euro,0.13359,0.121463,0.056977,0.05007,0.810429,0.844055
