In [1]:
import pandas as pd
import dill

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropConstantFeatures, DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from types import MethodType

from soccer_analytics.data.statsbomb import get_metadata, get_events
from soccer_analytics.data_split import split_by_time
from soccer_analytics.preprocessing import match_list_to_df, AngleNormalizer

In [2]:
# Fetch the competition metadata. The loading cell below iterates over this
# object and reads `name` / `seasons` on each competition and `name` / `matches`
# on each season.
competitions = get_metadata()

In [3]:
# Whitelist of data to load: competition name -> season names to keep.
# Keys are compared against `competition.name` and the listed values against
# `season.name` when the shot events are fetched.
full_competitions = {
    "1. Bundesliga": ["2015/2016"],
    "FA Women's Super League": ["2018/2019", "2019/2020", "2020/2021"],
    "FIFA World Cup": ["2018", "2022"],
    "Indian Super league": ["2021/2022"],
    "La Liga": ["2015/2016"],
    "Ligue 1": ["2015/2016"],
    "Premier League": ["2015/2016"],
    "Serie A": ["2015/2016"],
    "UEFA Euro": ["2020"],
    "UEFA Women's Euro": ["2022"],
    "Women's World Cup": ["2019", "2023"],
}

In [None]:
# Collect, per whitelisted competition, the matches returned by `get_events`
# (requested with event_types=["shot"]) for every whitelisted season.
# Every competition is printed with its season count; seasons are only listed
# for competitions that appear in `full_competitions`.
matches_by_competition = {}
for competition in competitions:
    print(competition.name, len(competition.seasons))
    wanted_seasons = full_competitions.get(competition.name)
    if wanted_seasons is None:
        # Competition is not whitelisted: nothing to load.
        continue

    # Register the (initially empty) list first so the competition gets an
    # entry even when none of its seasons are selected.
    competition_matches = []
    matches_by_competition[competition.name] = competition_matches
    for season in competition.seasons:
        print(f"    {season.name}: {len(season.matches)}")
        if season.name not in wanted_seasons:
            continue
        competition_matches.extend(get_events(season, event_types=["shot"]))

1. Bundesliga 1
    2015/2016: 306
Champions League 18
Copa del Rey 3
FA Women's Super League 3
    2020/2021: 131
    2019/2020: 87
    2018/2019: 108


In [None]:
# Fit one goal-probability model (logistic regression on `is_goal`) per
# competition, score it against the `statsbomb_xg` column on a held-back test
# split, and persist the fitted pipelines/models to "simple_model.dill".
#
# Names left in the namespace for the cells below:
#   loss_info        - DataFrame, one row per competition with MAE / MSE / AUC
#                      for this model and for the `statsbomb_xg` baseline
#   coefficient_info - DataFrame with columns competition, coef_name, coef_value
#   model_info       - {competition: {"data_prep_pipeline": ..., "model": ...}}
#   train_data / test_data - {competition: DataFrame} of the untransformed splits

# Metrics reported for both the fitted model and the baseline. The first item is
# the output column name; the baseline column gets a "_statsbomb" suffix.
metric_functions = (
    ("mae", mean_absolute_error),
    ("mse", mean_squared_error),
    ("auc", roc_auc_score),
)

# Plain-Python accumulators; they are turned into DataFrames once, after the
# loop, so `loss_info` / `coefficient_info` are only ever bound to DataFrames.
loss_records = []
coefficient_records = []
model_info = {}
train_data = {}
test_data = {}
for competition_name, matches in matches_by_competition.items():
    if len(matches) == 0:
        # A whitelisted competition whose seasons did not match anything ends up
        # with an empty list; splitting it would raise and abort the whole run.
        print(f"Skipping {competition_name}: no matches loaded")
        continue

    # Only the first part returned by split_by_time is used in this cell; the
    # second part is discarded here.
    # NOTE(review): presumably that second part is a time-based holdout of
    # 20% of the matches -- confirm in soccer_analytics.data_split.
    train_test_matches, _ = split_by_time(matches, test_frac=0.2)
    # Random (seeded) 80/20 split of the remaining matches.
    train_matches, test_matches = train_test_split(
        train_test_matches, test_size=0.2, random_state=235
    )
    train_df = match_list_to_df(train_matches)
    test_df = match_list_to_df(test_matches)
    train_data[competition_name] = train_df
    test_data[competition_name] = test_df

    # `steps` is passed as a list, the documented type for Pipeline.
    data_prep_pipeline = Pipeline([
        ("encode_categories", OneHotEncoder(variables=["technique", "position"], drop_last=True)),
        # Remove the target, the baseline prediction and the raw coordinates so
        # none of them reaches the model as a feature.
        ("drop_unused_columns", DropFeatures(["statsbomb_xg", "coordinates_x", "coordinates_y", "is_goal"])),
        ("drop_constant_columns", DropConstantFeatures()),
        ("create_angle_from_goal", AngleNormalizer(variable="angle_to_goal", new_variable="angle_from_goal")),
        ("passthrough", "passthrough"),
    ])
    # penalty=None: plain (unregularized) logistic regression.
    model = LogisticRegression(penalty=None, max_iter=10000, random_state=315)
    # The preparation steps are fitted on the training split only and then
    # re-applied unchanged to the test split.
    train_df_transformed = data_prep_pipeline.fit_transform(train_df)
    test_df_transformed = data_prep_pipeline.transform(test_df)
    model.fit(train_df_transformed, train_df["is_goal"])
    # Column 1 of predict_proba is the probability of the second entry of
    # model.classes_ (the positive class for a boolean / 0-1 target).
    test_probs = model.predict_proba(test_df_transformed)[:, 1]

    # Insertion order fixes the column order of the final table:
    # competition, mae, mae_statsbomb, mse, mse_statsbomb, auc, auc_statsbomb.
    loss_record = {"competition": competition_name}
    for metric_name, metric_function in metric_functions:
        loss_record[metric_name] = metric_function(test_df["is_goal"], test_probs)
        loss_record[f"{metric_name}_statsbomb"] = metric_function(
            test_df["is_goal"], test_df["statsbomb_xg"]
        )
    loss_records.append(loss_record)

    # One (competition, feature name, coefficient) row per model input column.
    coefficient_records.extend(
        (competition_name, coef_name, coef_value)
        for coef_name, coef_value in zip(train_df_transformed.columns, model.coef_[0])
    )
    model_info[competition_name] = {"data_prep_pipeline": data_prep_pipeline, "model": model}

# Persist every fitted pipeline/model pair in a single file.
with open("simple_model.dill", "wb") as f:
    dill.dump(model_info, f)
loss_info = pd.DataFrame(loss_records)
coefficient_info = pd.DataFrame(
    coefficient_records, columns=["competition", "coef_name", "coef_value"]
)

In [None]:
# Display the per-competition error table (model vs. `statsbomb_xg` columns).
loss_info

In [None]:
# Tag every coefficient with a coarse feature-group code derived from the
# prefix of its name, then show the Premier League coefficients ordered by
# group and, within a group, by coefficient value.
# Codes: 1 = "technique*", 2 = "position*", 3 = "is_body_part*", 0 = anything else.
prefix_to_group = {"technique": 1, "position": 2, "is_body_part": 3}
coef_names = coefficient_info["coef_name"]
# Each boolean prefix mask is scaled by its code and the masks are added up.
coefficient_info["group"] = sum(
    coef_names.str.startswith(prefix) * group_code
    for prefix, group_code in prefix_to_group.items()
)

premier_league_rows = coefficient_info["competition"] == "Premier League"
coefficient_info[premier_league_rows].sort_values(["group", "coef_value"])