Liste an möglichen Features:

- start_x
- start_y
- end_x
- end_y
- time_seconds		
- type_name	
- bodypart_name	
- action_distance
- shot_angle_centered
- distance_to_goal

target:
- result_id

In [1]:
# Import packages
import pandas as pd
import numpy as np
import socceraction.spadl as spadl
import xgboost as xgb
import joblib
import os

from sklearn.metrics import brier_score_loss, log_loss, roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier

from src.data_processing import config_h5_file_paths, split_games, load_match_data
from src.vaep_processing import load_features_labels, train_model, evaluate_model


In [2]:
def evaluate_my_model(model, X, y):
    y_pred = model.predict_proba(X)[:, 1]
    # Brier Score: misst die durchschnittliche quadratische Differenz zwischen 
    # den vorhergesagten Wahrscheinlichkeiten und den tatsächlichen binären Ergebnissen.
    brier = brier_score_loss(y, y_pred)
    print(f"Brier score: {brier:.5f}")

    # Log Loss: misst, wie gut die Wahrscheinlichkeiten die tatsächlichen Ergebnisse repräsentieren.
    ll = log_loss(y, y_pred)
    print(f"log loss score: {ll:.5f}")

    # ROC AUC: misst die Fähigkeit des Modells, zwischen Klassen zu unterscheiden (0 vs. 1).
    roc_auc = roc_auc_score(y, y_pred)
    print(f"ROC AUC: {roc_auc:.5f}")

In [3]:
# Configure format, h5-file, and folder names
datafolder = ".\\xSuccess\\top5_15-16_spadl"
format = "spadl"

match_data_h5, match_data_train_h5, match_data_test_h5, match_data_test_success_h5, match_data_test_fail_h5, features_train_h5, features_test_h5, features_test_success_h5, features_test_fail_h5, labels_train_h5, labels_test_h5, labels_test_success_h5, labels_test_fail_h5, predictions_test_h5, predictions_test_success_h5, predictions_test_fail_h5, vaep_test_h5, vaep_test_success_h5, vaep_test_fail_h5 = config_h5_file_paths(
  datafolder=datafolder,
  format=format
)

The folder '.\xSuccess\top5_15-16_spadl' already exists.


In [4]:
# Lade den Datensatz
spadl_h5 = ".\\xSuccess\\top5_15-16_spadl\\match_data.h5"

with pd.HDFStore(spadl_h5) as spadlstore:
    # Spiele laden
    games = spadlstore["games"]
    
    # teams / players laden, wenn du sie für später brauchst
    teams = spadlstore["teams"]
    players = spadlstore["players"]
    
    all_actions_list = []
    
    # Über alle Spiele iterieren
    for gid in games.game_id:
        df_actions = spadlstore[f"actions/game_{gid}"]
        # Ggf. noch die 'game_id' explizit als Spalte hinzufügen,
        # falls du sie später brauchst.
        df_actions["game_id"] = gid
        
        all_actions_list.append(df_actions)
    
    # Zu einem großen DataFrame zusammenfügen
    all_actions= pd.concat(all_actions_list, ignore_index=True)

all_actions = (
    all_actions
    .merge(spadl.actiontypes_df(), how='left', on='type_id')
    .merge(spadl.results_df(), how='left', on='result_id')
    .merge(spadl.bodyparts_df(), how='left', on='bodypart_id')
)

In [5]:
# Result_id hat mehr als nur 0 und 1, darum werden alle auf 0 gesetzt die nicht 1 sind
all_actions["result_id"] = np.where(all_actions["result_id"] == 1, 1, 0)

In [6]:
# Berechne die Länge der Dribblings (euklidische Distanz zwischen Start- und Endkoordinate)

all_actions["action_distance"] = np.sqrt(
    (all_actions["end_x"] - all_actions["start_x"])**2 +
    (all_actions["end_y"] - all_actions["start_y"])**2
)


Winkel für Schüsse berechnen

In [7]:
# Angenommen: Spielfeld geht von x=0 bis x=105 (Opta Standard)
def detect_goal_side(end_x, field_length=105):
    return 'left' if end_x < field_length / 2 else 'right'

def calculate_centered_shot_angle(row):
    # Ziel-Torposition
    if row['goal_side'] == 'right':
        goal_x, goal_y = 105, 34
        dx = goal_x - row['start_x']
    else:
        goal_x, goal_y = 0, 34
        dx = row['start_x'] - goal_x  # Richtung umdrehen!

    dy = goal_y - row['start_y']

    # Jetzt ist dx immer positiv → Richtung zum Tor ist einheitlich
    angle_rad = np.arctan2(dy, dx)
    angle_deg = np.degrees(angle_rad)

    return angle_deg

In [8]:
# Spalte erstellen
all_actions['shot_angle_centered'] = np.nan

# Torseite erkennen (für alle Zeilen, nicht nur Schüsse)
all_actions['goal_side'] = all_actions['start_x'].apply(detect_goal_side)

# Winkel nur für Schüsse berechnen

shot_mask = all_actions['type_name'] == 'shot'
all_actions.loc[shot_mask, 'shot_angle_centered'] = all_actions[shot_mask].apply(calculate_centered_shot_angle, axis=1)



Distanz zum Tor nach Aktion berechnen

In [9]:
# Berechnung der Distanz zum Tor. Hier wird für jede Zeile abhängig von der goal_side der entsprechende Tor-X-Wert gewählt.

all_actions["distance_to_goal"] = np.sqrt(
    (all_actions["end_x"] - np.where(all_actions["goal_side"] == "right", 105, 0))**2 +
    (all_actions["end_y"] - 34)**2
)


Erstellen der Unique ID

In [10]:
all_actions["unique_id"] = all_actions["game_id"].astype(str) + "_" + all_actions["action_id"].astype(str)
all_actions.fillna(999, inplace=True)
all_actions.sort_values(by="unique_id", ascending=True, inplace=True)

Aufteilen der Daten in Training und Testdaten

In [11]:
# Split games
train_games, test_games, validation_games = split_games(
  games=all_actions,
  train_percentage=50,
  random_state=42,
  shuffle=True,
  stratify='team_id'
)


In [12]:
# Features und die Zielvariable
features = ["start_x", 
            "start_y", 
            "end_x", 
            "end_y",
            "player_id", 
            "bodypart_name", 
            "type_name", 
            "action_distance",
            'shot_angle_centered', 
            "time_seconds",
            'distance_to_goal']
#features = ["start_x", "start_y", "end_x", "end_y", "bodypart_name", "type_name", "action_distance",'shot_angle_centered', "time_seconds",'distance_to_goal']
target = "result_id"

In [13]:
X_train = train_games[features]
Y_train = train_games[target]

X_test = test_games[features]
Y_test = test_games[target]


## Dummy_model

In [14]:
dummy_model = DummyClassifier(strategy='stratified')
dummy_model.fit(X_train, Y_train)

evaluate_my_model(dummy_model, X_test, Y_test)

Brier score: 0.28505
log loss score: 10.27413
ROC AUC: 0.49979


In [15]:
dummy_model = DummyClassifier(strategy='uniform')
dummy_model.fit(X_train, Y_train)
evaluate_my_model(dummy_model, X_test, Y_test)

Brier score: 0.25000
log loss score: 0.69315
ROC AUC: 0.50000


In [16]:
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(X_train, Y_train)
evaluate_my_model(dummy_model, X_test, Y_test)

Brier score: 0.17197
log loss score: 6.19832
ROC AUC: 0.50000


## XGBOOST

### xSuccess

In [17]:
X_train = train_games[features].copy()
X_test  = test_games [features].copy()

# Liste der Objekt-Spalten
cat_cols = ["bodypart_name", "type_name","player_id"]

for col in cat_cols:
    X_train[col] = X_train[col].astype("category")
    X_test [col] = X_test [col].astype("category")

In [18]:
xgb_model = xgb.XGBClassifier(
    n_jobs=-1,
    enable_categorical=True,
    random_state=42
    )

xgb_model.fit(X_train, Y_train)

evaluate_my_model(xgb_model, X_test, Y_test)

Brier score: 0.07202
log loss score: 0.22661
ROC AUC: 0.93423


---------------------

## interpret.ml

In [19]:
X_train_full = X_train.copy()
Y_train_full = Y_train.copy()


# Liste der Objekt-Spalten
cat_cols = ["bodypart_name", "type_name"]

for col in cat_cols:
    X_train_full.loc[:,col] = X_train_full[col].astype("category")
    X_test.loc[:, col] = X_test[col].astype("category")

num_cols = [c for c in X_train.columns if c not in cat_cols]
X_train_full.loc[:, num_cols] = X_train_full[num_cols].astype("float32")
X_test.loc[:,num_cols] = X_test[num_cols].astype("float32")

Subsampling, da das Training zu lange dauert

In [20]:
X_train, _, Y_train, _ = train_test_split(
    X_train_full, Y_train_full,
    train_size=50000,      # exakte Anzahl oder als float für Prozent
    random_state=42,
    shuffle=True
)

In [22]:
# Optional: in NumPy-Array umwandeln
y_arr = Y_train.values

ebm = ExplainableBoostingClassifier(
    n_jobs=-1,          # alle Kerne nutzen
    random_state=42,    # für Reproduzierbarkeit
    max_bins=32,       # weniger Splits = schneller
    interactions=5
)

In [23]:


ebm.fit(X_train, y_arr)

evaluate_my_model(ebm, X_test, Y_test)

Brier score: 0.07665
log loss score: 0.24165
ROC AUC: 0.92733


max_bins=16

train_size=10000  == Brier score: 0.07678 \
train_size=20000  == Brier score: 0.07398 \
train_size=30000  == Brier score: 0.07424 \
train_size=40000  == Brier score: 0.07373 \
train_size=50000  == Brier score: 0.07350 \
train_size=60000  == Brier score: 0.07414 <Time: >


max_bins=32

train_size=10000  == Brier score: 0.07534 \
train_size=20000  == Brier score: 0.07337 \
train_size=30000  == Brier score: 0.07305 \
train_size=40000  == Brier score: 0.07271 \
train_size=50000  == Brier score: 0.07243 \
train_size=60000  == Brier score: 0.07261 <Time: 1m 6s>


max_bins=64

train_size=10000  == Brier score: 0.07513\
train_size=20000  == Brier score: 0.07329\
train_size=30000  == Brier score: 0.07300\
train_size=40000  == Brier score: 0.07267\
train_size=50000  == Brier score: 0.07244\
train_size=60000  == Brier score: 0.07228 <Time: 1m 3s>


max_bins=128

train_size=10000  == Brier score: 0.07435\
train_size=20000  == Brier score: 0.07246\
train_size=30000  == Brier score: 0.07204\
train_size=40000  == Brier score: 0.07173\
train_size=50000  == Brier score: 0.07145   <Time: 1m 7s>\
train_size=60000  == Brier score: 0.07134   <Time: 1m> \
train_size=100000  == Brier score: 0.07086  <Time: 2m 14s>\
train_size=200000  == Brier score: 0.07001  <Time: 6m 2s>\
train_size=300000  ==  <Time: >


Modelle speichern

In [None]:
#Ordner "Models" anlegen (wenn er noch nicht existiert)
os.makedirs("Models", exist_ok=True)

# Modelle in diesem Ordner speichern
joblib.dump(dummy_model, os.path.join("Models", "dummy_model.joblib"))
joblib.dump(xgb_model,   os.path.join("Models", "xgb_model_with_player_id.joblib"))
joblib.dump(ebm,         os.path.join("Models", "ebm_model_with_player_id.joblib"))

['Models\\ebm_model.joblib']

Modelle laden

In [None]:
# Pfad zum Models-Ordner
models_dir = "Models"

# Modelle laden
dummy_model = joblib.load(os.path.join(models_dir, "dummy_model.joblib"))
xgb_model   = joblib.load(os.path.join(models_dir, "xgb_model.joblib"))
ebm         = joblib.load(os.path.join(models_dir, "ebm_model.joblib"))

# prüfen, ob es geklappt hat
print(type(dummy_model), type(xgb_model), type(ebm))

In [25]:
from interpret import show
show(ebm.explain_global())

------------------

Konfussionsmatrix für beide Modelle zum auswerten

# 3 zufällige Indizes aus dem Testset auswählen
np.random.seed(42)  # für Reproduzierbarkeit
sample_indices = np.random.choice(X_test.index, size=3, replace=False)

# Beispiel-Daten auswählen
X_sample = X_test.loc[sample_indices]
Y_true_sample = Y_test.loc[sample_indices]

# Vorhersagen von beiden Modellen
Y_pred_xgb_sample = xgb_model.predict_proba(X_sample)[:-1]
Y_pred_gam_sample = ebm.predict_proba(X_sample.values)[:-1]

# Übersicht als Tabelle
vergleich_df = pd.DataFrame({
    "True Label": Y_true_sample.values,
    "XGBoost Prediction": Y_pred_xgb_sample,
    "GAM Prediction": Y_pred_gam_sample
}, index=sample_indices)

# Input-Features anhängen (optional)
vergleich_df = pd.concat([X_sample.reset_index(drop=True), vergleich_df.reset_index(drop=True)], axis=1)


print(tabulate(vergleich_df, headers='keys', tablefmt='github'))

Verteilung der Werte in result_id anzeigen