In [1]:
import pandas as pd
import dill
import warnings

from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from xgboost import XGBClassifier

from soccer_analytics.data.statsbomb import get_metadata, get_events, load_competition_seasons
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, AngleNormalizer

In [2]:
# Fetch metadata through the project's StatsBomb helper.
# NOTE(review): `competitions` is not referenced by any later cell visible here —
# confirm it is still needed (e.g. for browsing the available competitions).
competitions = get_metadata()

In [3]:
# Competition name -> list of seasons to load. Only the uncommented entries are
# iterated by the loading cell below; the commented-out entries are kept as a
# menu of alternatives that can be switched on.
full_competitions = {
#     "1. Bundesliga": ["2015/2016"],
#     "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
    "FIFA World Cup": ["2022"], #["2018", "2022"],
#     "Indian Super league": ["2021/2022"],
#     "La Liga": ["2015/2016"],
#     "Ligue 1": ["2015/2016"],
#     "Premier League": ["2015/2016"],
#     "Serie A": ["2015/2016"],
#     "UEFA Euro": ["2020"],
#    "UEFA Women's Euro": ["2022"],
#     "Women's World Cup": ["2019", "2023"]
}

In [4]:
# Load the shot events of every selected competition, keyed by competition name.
matches_by_competition = {}
for competition_name in full_competitions:
    seasons = full_competitions[competition_name]
    # Progress line: competition name followed by the number of seasons requested.
    print(f"{competition_name} {len(seasons)}")
    loaded_matches = load_competition_seasons(
        competition_name, seasons, event_types=["shot"]
    )
    matches_by_competition[competition_name] = loaded_matches

FIFA World Cup 1


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
class ConsolidateBodyParts(BaseEstimator, TransformerMixin):
    """Fold every non-retained ``is_body_part_*`` indicator into ``is_body_part_other``.

    Parameters
    ----------
    body_parts_to_keep : sequence of str
        Names of the ``is_body_part_*`` columns that are left untouched. Every
        other column whose name starts with ``is_body_part`` (except
        ``is_body_part_other`` itself) is OR-ed into ``is_body_part_other`` and
        then dropped.
    """
    # The default is a tuple so that instances never share one mutable list.
    def __init__(self, body_parts_to_keep=("is_body_part_right_foot", "is_body_part_left_foot", "is_body_part_head")):
        self.body_parts_to_keep = body_parts_to_keep

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the rare body-part indicators merged.

        The input frame is left unmodified.

        Raises
        ------
        AssertionError
            If ``X`` has no ``is_body_part_other`` column to merge into.
        """
        assert "is_body_part_other" in X.columns, "X must contain an 'is_body_part_other' column"
        body_parts_to_drop = [
            column
            for column in X.columns
            if column.startswith("is_body_part")
            and column not in self.body_parts_to_keep
            and column != "is_body_part_other"
        ]

        # Accumulate on a separate Series so the caller's frame is never written to.
        merged_other = X["is_body_part_other"]
        for column in body_parts_to_drop:
            merged_other = merged_other | X[column]

        # ``drop`` returns a new frame; overwriting the existing column keeps its position.
        result = X.drop(body_parts_to_drop, axis=1)
        result["is_body_part_other"] = merged_other
        return result
    
class ConsolidateCategories(BaseEstimator, TransformerMixin):
    """Replace every value of one column that is not on an allow-list with a catch-all label.

    Parameters
    ----------
    column_name : str
        Column whose values are consolidated.
    categories_to_keep : list-like
        Values that are kept as they are.
    other_category_name : str
        Label written over every value that is not in ``categories_to_keep``.
    """
    def __init__(self, column_name, categories_to_keep, other_category_name="other"):
        self.column_name = column_name
        self.categories_to_keep = categories_to_keep
        self.other_category_name = other_category_name

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the rare categories relabelled.

        The input frame is left unmodified.
        """
        # Work on a copy so frames the caller still holds are not silently rewritten.
        X = X.copy()
        other_records = ~X[self.column_name].isin(self.categories_to_keep)
        X.loc[other_records, self.column_name] = self.other_category_name
        return X
    
class ConsolidatePositions(BaseEstimator, TransformerMixin):
    """
    Collapse detailed tactical position names into five coarse groups:
    "Defense", "Defensive Midfield", "Central Midfield", "Attacking Midfield"
    and "Forward".

    Based on Appendix 1: Tactical Positions Guide of the Statsbomb Open Data Specification.

    Parameters
    ----------
    column_name : str
        Column holding the position name of each row.
    other : str
        Placeholder value that is left as-is and is not reported as unrecognised.
    """
    def __init__(self, column_name="position", other="Unknown"):
        self.column_name = column_name
        self.other = other

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the position column mapped to the coarse groups.

        Values matching none of the groups (and not equal to ``other``) are
        printed and left unchanged. The input frame is left unmodified.
        """
        # Work on a copy so frames the caller still holds are not silently rewritten.
        X = X.copy()
        positions = X[self.column_name]

        def ends_with(suffix):
            # na=False keeps the mask strictly boolean: missing positions count as
            # non-matches instead of putting NaN into a mask used for indexing.
            return positions.str.endswith(suffix, na=False)

        is_defense = ends_with("Back") | (positions == "Goalkeeper")
        is_defensive_midfield = ends_with("Defensive Midfield")
        is_central_midfield = (
            ends_with("Center Midfield") |
            positions.isin(["Right Midfield", "Left Midfield"])
        )
        is_attacking_midfield = ends_with("Attacking Midfield")
        is_forward = ends_with("Wing") | ends_with("Striker") | ends_with("Forward")
        is_unknown = positions == self.other

        found_positions = (
            is_defense |
            is_defensive_midfield | is_central_midfield | is_attacking_midfield |
            is_forward |
            is_unknown
        )
        # Surface anything the rules above do not cover so the mapping can be extended.
        unknown_positions = X.loc[~found_positions, self.column_name]
        if len(unknown_positions) > 0:
            print("Unknown positions:")
            print(unknown_positions)

        # All masks were computed before any assignment, so the order below is irrelevant.
        X.loc[is_defense, self.column_name] = "Defense"
        X.loc[is_defensive_midfield, self.column_name] = "Defensive Midfield"
        X.loc[is_central_midfield, self.column_name] = "Central Midfield"
        X.loc[is_attacking_midfield, self.column_name] = "Attacking Midfield"
        X.loc[is_forward, self.column_name] = "Forward"
        return X

In [6]:
# Train one shot model per competition, score it against the `statsbomb_xg`
# column on the same rows, and persist the fitted pipelines and models.
loss_info = []
model_info = {}
train_data = {}
test_data = {}

# Hyper-parameter grid shared by every competition (loop-invariant, so built once).
param_grid = {
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.3, 0.6],
    "min_child_weight": [1, 5, 10]
}
# Single-candidate grid for quick runs:
# param_grid = {"max_depth": [5], "learning_rate": [0.3], "min_child_weight": [1]}

for competition_name, matches in matches_by_competition.items():
    print(competition_name)
    # The second value returned by split_by_time is discarded in this notebook.
    # NOTE(review): presumably it is the time-based hold-out reserved for a final
    # evaluation elsewhere — confirm.
    train_test_matches, _ = split_by_time(matches, test_frac=0.2)
    train_matches, test_matches = train_test_split(
        train_test_matches, test_size=0.2, random_state=235
    )
    train_df = match_list_to_df(train_matches)
    test_df = match_list_to_df(test_matches)
    # Snapshot the frames before preprocessing so later cells can inspect the raw columns.
    train_data[competition_name] = train_df.copy()
    test_data[competition_name] = test_df.copy()

    # Steps are passed as a list, the container type the Pipeline API documents.
    data_prep_pipeline = Pipeline([
        ("consolidate_techniques", ConsolidateCategories("technique", ["Normal", "Half Volley", "Volley"])),
        ("consolidate_positions", ConsolidatePositions()),
        ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=False)),
        ("consolidate_body_parts", ConsolidateBodyParts()),
        ("drop_unused_columns", DropFeatures([
            "statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal", "is_blocked", "timestamp", "freeze_frame",
            "closest_defender_distance",
            #"num_blockers",
        ])),
        ("drop_constant_columns", DropConstantFeatures()),
        ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
        ("passthrough", "passthrough"),
    ])

    # See https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst for parameter
    # definitions
    # NOTE(review): a regression objective ("reg:squarederror") is used on a
    # classifier, so the values returned by predict_proba may not be confined to
    # [0, 1] — confirm this is intended rather than "binary:logistic".
    xgb_model = XGBClassifier(
        max_depth=5,
        learning_rate=0.3,
        gamma=0,
        min_child_weight=1,
        subsample=1,
        reg_lambda=1,
        reg_alpha=0,
        eval_metric="mae",
        early_stopping_rounds=5,
        objective="reg:squarederror",
        monotone_constraints=None,
        random_state=2309
    )

    # NOTE(review): no `scoring` is passed, so candidates are ranked with the
    # estimator's default score — confirm that is the right criterion for a model
    # that is evaluated on MAE / MSE / AUC below.
    model = GridSearchCV(xgb_model, param_grid)
    train_df_transformed = data_prep_pipeline.fit_transform(train_df)
    test_df_transformed = data_prep_pipeline.transform(test_df)

    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        # NOTE(review): the split used for early stopping is the same split that is
        # scored below, so the reported metrics are likely optimistic — consider a
        # separate validation split.
        model.fit(
            train_df_transformed, train_df["is_goal"],
            eval_set=[(test_df_transformed, test_df["is_goal"])],
            verbose=False
        )
        test_probs = model.predict_proba(test_df_transformed)[:,1]

    # Score the model and the `statsbomb_xg` column against the same outcomes.
    mae = mean_absolute_error(test_df["is_goal"], test_probs)
    mae_statsbomb = mean_absolute_error(test_df["is_goal"], test_df["statsbomb_xg"])
    mse = mean_squared_error(test_df["is_goal"], test_probs)
    mse_statsbomb = mean_squared_error(test_df["is_goal"], test_df["statsbomb_xg"])
    auroc = roc_auc_score(test_df["is_goal"], test_probs)
    auroc_statsbomb = roc_auc_score(test_df["is_goal"], test_df["statsbomb_xg"])
    loss_info.append({
        "competition": competition_name,
        "mae": mae,
        "mae_statsbomb": mae_statsbomb,
        "mse": mse,
        "mse_statsbomb": mse_statsbomb,
        "auc": auroc,
        "auc_statsbomb": auroc_statsbomb
    })
    model_info[competition_name] = {"data_prep_pipeline": data_prep_pipeline, "model": model}

# Persist every fitted pipeline/model pair in a single dill file.
with open("gbm_feature_engineered.dill", "wb") as f:
    dill.dump(model_info, f)
loss_info = pd.DataFrame(loss_info)

FIFA World Cup


In [7]:
# Display the per-competition metrics; the *_statsbomb columns score the
# `statsbomb_xg` column against the same outcomes.
loss_info

Unnamed: 0,competition,mae,mae_statsbomb,mse,mse_statsbomb,auc,auc_statsbomb
0,FIFA World Cup,0.159667,0.128149,0.066436,0.057668,0.761628,0.769686


In [8]:
# Training shots that are flagged as blocked although fewer than one blocker was counted.
df = train_data["FIFA World Cup"]
flagged_blocked = df["is_blocked"] == 1
without_blockers = df["num_blockers"] < 1
phantom_blocks = df[flagged_blocked & without_blockers]

In [27]:
def _freeze_frame_role(player):
    """Label one freeze-frame entry as "goalkeeper", "defender" or "teammate"."""
    if player["position"]["name"] == "Goalkeeper":
        return "goalkeeper"
    # Equality (not truthiness) is used on purpose so only an explicit False counts.
    if player["teammate"] == False:
        return "defender"
    return "teammate"


# Pick one phantom-block shot and lay out the shooter plus every freeze-frame
# player as one row each: x, y and a role label.
row = 1
record = phantom_blocks.iloc[row].to_dict()

shot_rows = [
    {"x": record["coordinates_x"], "y": record["coordinates_y"], "role": "shooter"}
]
for player in record["freeze_frame"]:
    player_x = player["location"][0]
    player_y = player["location"][1]
    role = _freeze_frame_role(player)
    shot_rows.append({"x": player_x, "y": player_y, "role": role})

shot_data = pd.DataFrame(shot_rows)

In [34]:
import plotly.express as px
import plotly.graph_objects as go

# Geometry of the cone drawn from the shooter to the goal.
# NOTE(review): these look like StatsBomb pitch coordinates (goal line at x=120,
# posts at y=36 and y=44) — confirm against the data specification.
GOAL_LINE_X = 120
GOAL_POST_LOWER_Y = 36
GOAL_POST_UPPER_Y = 44

# Add the error-bar column on a derived frame so `shot_data` from the previous
# cell is not modified by plotting.
shot_plot_data = shot_data.assign(error_y=1)
fig1 = px.scatter(shot_plot_data, x="x", y="y", color="role", error_y="error_y")

# Each cone edge is a straight segment from the shooter (shifted up/down by
# `offset`) to one goal post. Only the two end points are plotted, so they are
# written down directly; deriving them through a slope and an intercept would
# divide by zero for a shot taken on the goal line (coordinates_x == 120).
offset = 1
triangle = pd.DataFrame({
    "x": [record["coordinates_x"], GOAL_LINE_X],
    "y1": [record["coordinates_y"] + offset, GOAL_POST_UPPER_Y],
    "y2": [record["coordinates_y"] - offset, GOAL_POST_LOWER_Y],
})
fig2 = px.line(triangle, x="x", y="y1")
fig3 = px.line(triangle, x="x", y="y2")

# Overlay the players and both cone edges in one figure.
fig = go.Figure(fig1.data + fig2.data + fig3.data)
fig.update_xaxes(range=[120, 90])  # reversed range: x=120 is drawn on the left
fig.update_yaxes(range=[18, 62])
fig.show()

In [29]:
# NOTE(review): presumably `timestamp` is in seconds, making this the time of the
# shot in minutes — confirm the unit.
record["timestamp"] / 60

4.19825

In [30]:
# Inspect the raw freeze-frame entries (location, player, position, teammate flag)
# of the selected shot.
record["freeze_frame"]

[{'location': [118.3, 33.1],
  'player': {'id': 22293, 'name': 'Harry Souttar'},
  'position': {'id': 3, 'name': 'Right Center Back'},
  'teammate': False},
 {'location': [106.6, 38.2],
  'player': {'id': 5490, 'name': 'Jackson Irvine'},
  'position': {'id': 13, 'name': 'Right Center Midfield'},
  'teammate': False},
 {'location': [115.2, 44.5],
  'player': {'id': 5479, 'name': 'Aziz Eraltay Behich'},
  'position': {'id': 6, 'name': 'Left Back'},
  'teammate': False},
 {'location': [112.4, 35.7],
  'player': {'id': 33572, 'name': 'Nathaniel Atkinson'},
  'position': {'id': 2, 'name': 'Right Back'},
  'teammate': False},
 {'location': [110.1, 40.1],
  'player': {'id': 33495, 'name': 'Kye Rowles'},
  'position': {'id': 5, 'name': 'Left Center Back'},
  'teammate': False},
 {'location': [108.4, 39.2],
  'player': {'id': 3281, 'name': 'Aaron Mooy'},
  'position': {'id': 10, 'name': 'Center Defensive Midfield'},
  'teammate': False},
 {'location': [95.3, 45.1],
  'player': {'id': 8346, 'nam