In [2]:
import pandas as pd
import dill
import warnings

from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from xgboost import XGBClassifier

from soccer_analytics.data.statsbomb import get_metadata, get_events, load_competition_seasons
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, AngleNormalizer, BlockScore, uniform_block_score

In [3]:
# Load metadata through the project's StatsBomb helper.
# NOTE(review): presumably the competition/season listing — confirm against
# `soccer_analytics.data.statsbomb.get_metadata`. `competitions` is not
# referenced again in the visible cells; verify it is still needed.
competitions = get_metadata()

In [56]:
# Competitions to load, as {competition name: [season names]}.
# Only the 2015/2016 Bundesliga season is active; the commented-out entries
# are other competition/season pairs that are currently disabled.
full_competitions = {
    "1. Bundesliga": ["2015/2016"],
#     "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
#     "FIFA World Cup": ["2022"], #["2018", "2022"],
#     "Indian Super league": ["2021/2022"],
#     "La Liga": ["2015/2016"],
#     "Ligue 1": ["2015/2016"],
#     "Premier League": ["2015/2016"],
#     "Serie A": ["2015/2016"],
#     "UEFA Euro": ["2020"],
#    "UEFA Women's Euro": ["2022"],
#     "Women's World Cup": ["2019", "2023"]
}

In [57]:
# Load the shot events of every configured competition, keyed by competition
# name. The competition name and its number of seasons are printed as a
# simple progress indicator.
matches_by_competition = {}
for competition_name, seasons in full_competitions.items():
    print(competition_name, len(seasons))
    season_matches = load_competition_seasons(competition_name, seasons, event_types=["shot"])
    matches_by_competition[competition_name] = season_matches

1. Bundesliga 1


In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
class ConsolidateBodyParts(BaseEstimator, TransformerMixin):
    """Fold unlisted ``is_body_part_*`` indicator columns into ``is_body_part_other``.

    Every column whose name starts with ``is_body_part`` and that is neither
    listed in ``body_parts_to_keep`` nor ``is_body_part_other`` itself is
    OR-ed into ``is_body_part_other`` and then dropped.

    Parameters
    ----------
    body_parts_to_keep : list of str
        Names of the body-part indicator columns that are left untouched.
    """
    def __init__(self, body_parts_to_keep=["is_body_part_right_foot", "is_body_part_left_foot", "is_body_part_head"]):
        # The default list is only read, never modified, by this class.
        self.body_parts_to_keep = body_parts_to_keep

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the extra body-part columns merged and removed.

        The input frame is not modified.

        Raises
        ------
        AssertionError
            If ``X`` has no ``is_body_part_other`` column.
        """
        assert "is_body_part_other" in X.columns, "X must contain an 'is_body_part_other' column"
        # Work on a copy: the column assignment below would otherwise write
        # into the caller's DataFrame as a hidden side effect.
        X = X.copy()
        body_parts_to_drop = [
            column
            for column in X.columns
            if column.startswith("is_body_part")
            and column not in self.body_parts_to_keep
            and column != "is_body_part_other"
        ]
        for column in body_parts_to_drop:
            X["is_body_part_other"] = X["is_body_part_other"] | X[column]

        return X.drop(body_parts_to_drop, axis=1)
    
class ConsolidateCategories(BaseEstimator, TransformerMixin):
    """Replace every value of one column that is not in an allow-list.

    Parameters
    ----------
    column_name : str
        Column whose values are consolidated.
    categories_to_keep : list
        Values that are left unchanged.
    other_category_name : str, default "other"
        Replacement written for every value not in ``categories_to_keep``.
    """
    def __init__(self, column_name, categories_to_keep, other_category_name="other"):
        self.column_name = column_name
        self.categories_to_keep = categories_to_keep
        self.other_category_name = other_category_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with non-kept values of the column replaced.

        The input frame is not modified.
        """
        # Work on a copy: the .loc assignment below would otherwise overwrite
        # the caller's column in place.
        X = X.copy()
        other_records = ~X[self.column_name].isin(self.categories_to_keep)
        X.loc[other_records, self.column_name] = self.other_category_name
        return X
    
class ConsolidatePositions(BaseEstimator, TransformerMixin):
    """
    Collapse detailed position names into coarse position groups.

    Based on Appendix 1: Tactical Positions Guide of the Statsbomb Open Data Specification.

    Mapping applied to the values of ``column_name``:

    * ends with "Back", or equals "Goalkeeper"      -> "Defense"
    * ends with "Defensive Midfield"                -> "Defensive Midfield"
    * ends with "Center Midfield", or equals
      "Right Midfield" / "Left Midfield"            -> "Central Midfield"
    * ends with "Attacking Midfield"                -> "Attacking Midfield"
    * ends with "Wing", "Striker" or "Forward"      -> "Forward"

    Values equal to ``other`` are accepted as-is. Any remaining value
    (including missing values) is printed as an unknown position and left
    unchanged.

    Parameters
    ----------
    column_name : str, default "position"
        Column holding the position names.
    other : str, default "Unknown"
        Placeholder value that is treated as a known position.
    """
    def __init__(self, column_name="position", other="Unknown"):
        self.column_name = column_name
        self.other = other

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the position column consolidated.

        The input frame is not modified.
        """
        # Work on a copy: the .loc assignments below would otherwise overwrite
        # the caller's column in place.
        X = X.copy()
        positions = X[self.column_name]

        def ends_with(suffix):
            # na=False keeps the mask strictly boolean when a position is
            # missing; a NaN in the mask cannot be used for .loc indexing.
            return positions.str.endswith(suffix, na=False)

        # All masks are computed from the original values before anything is
        # overwritten, so the order of the assignments below does not matter.
        is_defense = ends_with("Back") | (positions == "Goalkeeper")
        is_defensive_midfield = ends_with("Defensive Midfield")
        is_central_midfield = (
            ends_with("Center Midfield") |
            positions.isin(["Right Midfield", "Left Midfield"])
        )
        is_attacking_midfield = ends_with("Attacking Midfield")
        is_forward = ends_with("Wing") | ends_with("Striker") | ends_with("Forward")
        is_unknown = positions == self.other
        found_positions = (
            is_defense |
            is_defensive_midfield | is_central_midfield | is_attacking_midfield |
            is_forward |
            is_unknown
        )
        unknown_positions = X.loc[~found_positions, self.column_name]
        if len(unknown_positions) > 0:
            print("Unknown positions:")
            print(unknown_positions)
        X.loc[is_defense, self.column_name] = "Defense"
        X.loc[is_defensive_midfield, self.column_name] = "Defensive Midfield"
        X.loc[is_central_midfield, self.column_name] = "Central Midfield"
        X.loc[is_attacking_midfield, self.column_name] = "Attacking Midfield"
        X.loc[is_forward, self.column_name] = "Forward"
        return X

In [82]:
# Train and evaluate one model per competition.
#
# For each competition this cell: splits the matches, converts them to
# DataFrames, fits the feature-preparation pipeline, grid-searches an XGBoost
# model, scores it next to the `statsbomb_xg` column, and stores the fitted
# pipeline and model. All fitted objects are written to
# "gbm_feature_engineered.dill" at the end.
loss_info = []   # one dict of metrics per competition; turned into a DataFrame at the end
model_info = {}  # competition name -> {"data_prep_pipeline": ..., "model": ...}
train_data = {}  # competition name -> copy of the training frame taken before the pipeline runs
test_data = {}   # competition name -> copy of the test frame taken before the pipeline runs
for competition_name, matches in matches_by_competition.items():
    print(competition_name)
    # The second return value of split_by_time is discarded here.
    # NOTE(review): presumably that is the most recent 20% of matches kept as
    # a hold-out — confirm against `soccer_analytics.data_split.split_by_time`.
    train_test_matches, _ = split_by_time(matches, test_frac=0.2)
    # Random 80/20 split of the remaining matches, with a fixed seed.
    train_matches, test_matches = train_test_split(
        train_test_matches, test_size=0.2, random_state=235
    )
    train_df = match_list_to_df(train_matches)
    test_df = match_list_to_df(test_matches)
    # Keep copies of the frames as they are before the pipeline runs; later
    # cells read them from these dicts.
    train_data[competition_name] = train_df.copy()
    test_data[competition_name] = test_df.copy()
    data_prep_pipeline = Pipeline((
        ("consolidate_techniques", ConsolidateCategories("technique", ["Normal", "Half Volley", "Volley"])),
        ("consolidate_positions", ConsolidatePositions()),
        ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=False)),
        ("consolidate_body_parts", ConsolidateBodyParts()),
        ("calculate_block_score", BlockScore(
            "freeze_frame", "coordinates_x", "coordinates_y",
            uniform_block_score(max_distance=0.5, block_score=0.6),
            overlap_strategy="compound"
        )), 
        # The dropped columns include the target ("is_goal") and the reference
        # prediction ("statsbomb_xg"), so neither is passed to the model.
        ("drop_unused_columns", DropFeatures([
            "statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal", "is_blocked", "timestamp", "freeze_frame",
            "closest_defender_distance",
            "situation",
            #"block_score"
        ])),
        ("drop_constant_columns", DropConstantFeatures()),
        ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
        ("passthrough", "passthrough"),
    ))
    # See https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst for parameter
    # definitions
    # NOTE(review): a regression objective ("reg:squarederror") and the "mae"
    # eval metric are configured on a classifier, and `predict_proba` is used
    # below. Confirm this is intentional and that the resulting values are
    # valid probabilities.
    model = XGBClassifier(
        max_depth=5,
        learning_rate=0.3,
        gamma=0,
        min_child_weight=1,
        subsample=1,
        reg_lambda=1,
        reg_alpha=0,
        eval_metric="mae",
        early_stopping_rounds=5, 
        objective="reg:squarederror",
        monotone_constraints=None,
        random_state=2309
    )
    
    # Hyper-parameter values tried by the grid search; they override the
    # matching constructor arguments above.
    param_grid = {
        "max_depth": [3, 4, 5, 6, 7],
        "learning_rate": [0.2, 0.3, 0.4],
        "subsample": [0.8, 0.9, 1.0],
        "min_child_weight": [1, 2, 3]
    }
    #param_grid = {"max_depth": [5], "learning_rate": [0.3], "min_child_weight": [1]}
    
    # `model` is rebound to the grid-search wrapper. No `cv` or `scoring` is
    # passed, so GridSearchCV uses its defaults.
    model = GridSearchCV(model, param_grid)
    # The pipeline is fitted on the training frame only and then applied to
    # the test frame.
    train_df_transformed = data_prep_pipeline.fit_transform(train_df)
    test_df_transformed = data_prep_pipeline.transform(test_df)
    
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        # NOTE(review): the test frame passed as `eval_set` (for early
        # stopping) is the same frame that is scored below, so the reported
        # metrics may be optimistic.
        model.fit(
            train_df_transformed, train_df["is_goal"], 
            eval_set=[(test_df_transformed, test_df["is_goal"])],
            verbose=False
        )
        # Column 1 of predict_proba is taken as the predicted goal probability.
        test_probs = model.predict_proba(test_df_transformed)[:,1]
    print(model.best_params_)
    # Each metric is computed twice against the same target: once for this
    # model's predictions and once for the `statsbomb_xg` column.
    mae = mean_absolute_error(test_df["is_goal"], test_probs)
    mae_statsbomb = mean_absolute_error(test_df["is_goal"], test_df["statsbomb_xg"])
    mse = mean_squared_error(test_df["is_goal"], test_probs)
    mse_statsbomb = mean_squared_error(test_df["is_goal"], test_df["statsbomb_xg"])
    auroc = roc_auc_score(test_df["is_goal"], test_probs)
    auroc_statsbomb = roc_auc_score(test_df["is_goal"], test_df["statsbomb_xg"])
    loss_info.append({
        "competition": competition_name, 
        "mae": mae,
        "mae_statsbomb": mae_statsbomb,
        "mse": mse,
        "mse_statsbomb": mse_statsbomb,
        "auc": auroc,
        "auc_statsbomb": auroc_statsbomb
    })
    model_info[competition_name] = {"data_prep_pipeline": data_prep_pipeline, "model": model}
    
# Persist the fitted pipelines and models of all competitions in one file.
with open("gbm_feature_engineered.dill", "wb") as f:
    dill.dump(model_info, f)
# `loss_info` changes type here: list of dicts -> DataFrame, one row per competition.
loss_info = pd.DataFrame(loss_info)

1. Bundesliga
{'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.9}


In [60]:
# Show the per-competition metrics of this model next to the `*_statsbomb`
# columns computed from `statsbomb_xg`.
loss_info

Unnamed: 0,competition,mae,mae_statsbomb,mse,mse_statsbomb,auc,auc_statsbomb
0,1. Bundesliga,0.153506,0.144773,0.076627,0.069194,0.810504,0.859355


In [61]:
# Re-create the model inputs of a single competition from the stored raw
# frames, using the pipeline fitted in the training cell.
competition = "1. Bundesliga"
prep_pipeline = model_info[competition]["data_prep_pipeline"]
raw_train = train_data[competition]
raw_test = test_data[competition]

# The pipeline receives copies so the stored raw frames stay as they are.
transformed_train_data = prep_pipeline.transform(raw_train.copy())
train_target = raw_train["is_goal"]
test_target = raw_test["is_goal"]
transformed_test_data = prep_pipeline.transform(raw_test.copy())

# First five transformed rows, transposed so each feature is one row.
transformed_train_data.head().T

Unnamed: 0,0,1,2,3,4
is_penalty,False,False,False,False,False
is_first_time,False,True,False,False,False
distance_to_goal,23.640326,25.948892,18.1087,32.041926,10.08489
is_body_part_right_foot,False,True,True,False,False
is_body_part_left_foot,True,False,False,True,False
is_body_part_head,False,False,False,False,True
is_body_part_other,False,False,False,False,False
technique_Normal,1,0,1,1,1
technique_Half Volley,0,1,0,0,0
technique_Volley,0,0,0,0,0


In [81]:
# Best hyper-parameters found by the grid search for the selected competition.
# NOTE(review): the recorded output of this cell has no "subsample" key
# although the grid in the training cell contains one, so it appears to come
# from an earlier run — re-execute the notebook top to bottom to refresh it.
model_info[competition]["model"].best_params_

{'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 1}

In [62]:
# Number of rows in the training target for the selected competition.
len(train_target)

4984

In [79]:
# Experiment: fit a single XGBoost model on a small, hand-picked feature
# subset and report its mean absolute error on the test frame.
columns = ["block_score", "distance_to_goal", "angle_from_goal", "is_penalty"]

# NOTE(review): a regression objective and the "mae" eval metric are
# configured on a classifier whose `predict_proba` is used below — confirm
# this is intentional.
xgb_settings = dict(
    max_depth=9,
    n_estimators=10000,
    learning_rate=0.3,
    gamma=1,
    min_child_weight=3,
    subsample=0.95,
    reg_lambda=1,
    reg_alpha=0,
    eval_metric="mae",
    early_stopping_rounds=100,
    objective="reg:squarederror",
    monotone_constraints=None,
    random_state=2309,
)
model = XGBClassifier(**xgb_settings)

train_features = transformed_train_data[columns]
test_features = transformed_test_data[columns]

# NOTE(review): the test frame is used both as the early-stopping eval set
# and for the score below, so the reported error may be optimistic.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    model.fit(train_features, train_target, eval_set=[(test_features, test_target)])

# Column 1 of predict_proba is taken as the predicted goal probability.
test_probs = model.predict_proba(test_features)[:, 1]
mean_absolute_error(test_target, test_probs)

[0]	validation_0-mae:0.39730
[1]	validation_0-mae:0.32450
[2]	validation_0-mae:0.27488
[3]	validation_0-mae:0.24033
[4]	validation_0-mae:0.21620
[5]	validation_0-mae:0.19884
[6]	validation_0-mae:0.18653
[7]	validation_0-mae:0.17889
[8]	validation_0-mae:0.17323
[9]	validation_0-mae:0.16930
[10]	validation_0-mae:0.16597
[11]	validation_0-mae:0.16355
[12]	validation_0-mae:0.16211
[13]	validation_0-mae:0.16126
[14]	validation_0-mae:0.16070
[15]	validation_0-mae:0.16018
[16]	validation_0-mae:0.15948
[17]	validation_0-mae:0.15934
[18]	validation_0-mae:0.15899
[19]	validation_0-mae:0.15896
[20]	validation_0-mae:0.15901
[21]	validation_0-mae:0.15905
[22]	validation_0-mae:0.15931
[23]	validation_0-mae:0.15918
[24]	validation_0-mae:0.15917
[25]	validation_0-mae:0.15903
[26]	validation_0-mae:0.15909
[27]	validation_0-mae:0.15900
[28]	validation_0-mae:0.15919
[29]	validation_0-mae:0.15927
[30]	validation_0-mae:0.15911
[31]	validation_0-mae:0.15924
[32]	validation_0-mae:0.15846
[33]	validation_0-ma

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.15738736454606217

In [None]:
# Build a small frame describing one shot: the shooter plus every player in
# the shot's freeze frame, each with x/y coordinates and a role label.
# NOTE(review): `phantom_blocks` is not defined in the visible part of the
# notebook — confirm where it comes from before running this cell.
row = 1
record = phantom_blocks.iloc[row].to_dict()


def _freeze_frame_role(player):
    """Label a freeze-frame player as "goalkeeper", "defender" or "teammate".

    The goalkeeper check comes first, so a goalkeeper is labelled
    "goalkeeper" whatever the "teammate" flag says.
    """
    if player["position"]["name"] == "Goalkeeper":
        return "goalkeeper"
    # Equality (not identity) comparison with False is kept on purpose so the
    # labelling matches exactly what the flag compares equal to.
    if player["teammate"] == False:
        return "defender"
    return "teammate"


freeze_frame = record["freeze_frame"]
# The shooter is always the first row, followed by the freeze-frame players
# in their original order.
shot_data = pd.DataFrame({
    "x": [record["coordinates_x"]] + [player["location"][0] for player in freeze_frame],
    "y": [record["coordinates_y"]] + [player["location"][1] for player in freeze_frame],
    "role": ["shooter"] + [_freeze_frame_role(player) for player in freeze_frame],
})

In [None]:
# Plot the shooter and the freeze-frame players, plus two straight lines
# running from just above/below the shot location to x = 120.
import plotly.express as px
import plotly.graph_objects as go
# Constant column used as the size of the vertical error bar of every point.
shot_data["error_y"] = 1
fig1 = px.scatter(shot_data, x="x", y="y", color="role", error_y="error_y")

# The upper line passes through (shot_x, shot_y + offset) and (120, 44); the
# lower line passes through (shot_x, shot_y - offset) and (120, 36).
# NOTE(review): presumably (120, 36) and (120, 44) are the goal posts in
# StatsBomb pitch coordinates — confirm against the data specification.
# NOTE(review): both slopes divide by (shot_x - 120), which raises
# ZeroDivisionError for a shot taken at x == 120.
offset = 1
slope_upper = (record["coordinates_y"] + offset - 44) / (record["coordinates_x"] - 120)
slope_lower = (record["coordinates_y"] - offset - 36) / (record["coordinates_x"] - 120)
b_upper = (record["coordinates_y"] + offset) - slope_upper * record["coordinates_x"]
b_lower = (record["coordinates_y"] - offset) - slope_lower * record["coordinates_x"]

# Both lines are evaluated at their two end points only: the shot's x and x = 120.
triangle = pd.DataFrame({
    "x": [record["coordinates_x"], 120],
})
triangle["y1"] = slope_upper * triangle["x"] + b_upper
triangle["y2"] = slope_lower * triangle["x"] + b_lower
fig2 = px.line(triangle, x="x", y="y1")
fig3 = px.line(triangle, x="x", y="y2")
# Merge the traces of the scatter plot and the two lines into a single figure.
fig = go.Figure(fig1.data + fig2.data + fig3.data)
# Axis ranges are fixed; the x range is listed from 120 down to 90.
fig.update_xaxes(range=[120, 90])
fig.update_yaxes(range=[18, 62]) 
fig.show()

In [None]:
# Timestamp of the selected shot divided by 60.
# NOTE(review): presumably converts seconds to minutes — confirm the unit of "timestamp".
record["timestamp"] / 60

In [None]:
# Inspect the raw freeze-frame entries of the selected shot.
record["freeze_frame"]