In [2]:
# Import packages
import pandas as pd
import warnings
import xgboost as xgb
import socceraction.spadl as spadl
import numpy as np
import os
from datetime import datetime
import matplotlib.pyplot as plt
import joblib


# Import functions
from src.data_processing import config_h5_file_paths, fetch_match_data, generate_match_data, split_games, split_match_data, adjust_results, store_match_data, load_match_data
from src.feature_processing import select_features, generate_features, store_features
from src.label_processing import select_labels, generate_labels, store_labels
from src.vaep_processing import load_features_labels, train_model, evaluate_model, store_predictions, compute_vaep, store_vaep, load_vaep, compare_vaep, compare_vaep_on_actions

In [3]:
# Configure format, h5-file, and folder names
datafolder = ".\\xSuccess\\top5_15-16_spadl"
format = "spadl"

match_data_h5, match_data_train_h5, match_data_test_h5, match_data_test_success_h5, match_data_test_fail_h5, features_train_h5, features_test_h5, features_test_success_h5, features_test_fail_h5, labels_train_h5, labels_test_h5, labels_test_success_h5, labels_test_fail_h5, predictions_test_h5, predictions_test_success_h5, predictions_test_fail_h5, vaep_test_h5, vaep_test_success_h5, vaep_test_fail_h5 = config_h5_file_paths(
  datafolder=datafolder,
  format=format
)

The folder '.\xSuccess\top5_15-16_spadl' already exists.


In [4]:
# Load full match data
games, teams, players, player_games, actions = load_match_data(
    format=format,
    match_data_h5=match_data_h5
)

Loading match data: 100%|██████████| 1823/1823 [00:26<00:00, 69.32it/s]


In [5]:
df_bundesliga = games[games['competition_id'] == 9]
df_bundesliga

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee
0,3890561,27,9,Regular Season,34,2016-05-14 15:30:00,175,181,1,4,PreZero Arena,Felix Brych
1,3890505,27,9,Regular Season,28,2016-04-02 15:30:00,169,184,1,0,Allianz Arena,Florian Meyer
2,3890511,27,9,Regular Season,29,2016-04-08 20:30:00,173,178,2,2,Olympiastadion Berlin,Benjamin Brand
3,3890515,27,9,Regular Season,29,2016-04-09 15:30:00,171,872,1,2,Volksparkstadion,Peter Sippel
4,3890411,27,9,Regular Season,17,2015-12-20 16:30:00,173,177,2,0,Olympiastadion Berlin,Peter Sippel
...,...,...,...,...,...,...,...,...,...,...,...,...
301,3890263,27,9,Regular Season,1,2015-08-15 15:30:00,177,189,0,1,Mewa Arena,Guido Winkmann
302,3890262,27,9,Regular Season,1,2015-08-15 15:30:00,872,178,2,2,Merck-Stadion am Böllenfalltor,Felix Brych
303,3890261,27,9,Regular Season,1,2015-08-15 18:30:00,180,185,4,0,Signal-Iduna-Park,Tobias Stieler
304,3890260,27,9,Regular Season,1,2015-08-15 15:30:00,904,175,2,1,BayArena,Robert Hartmann


In [6]:
df_sorted = df_bundesliga.sort_values('game_day')
df_sorted

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee
305,3890259,27,9,Regular Season,1,2015-08-14 20:30:00,169,171,5,0,Allianz Arena,Bastian Dankert
297,3890267,27,9,Regular Season,1,2015-08-16 15:30:00,179,184,2,1,Volkswagen Arena,Christian Dingert
298,3890266,27,9,Regular Season,1,2015-08-16 17:30:00,174,186,1,3,Mercedes-Benz-Arena,Wolfgang Stark
300,3890264,27,9,Regular Season,1,2015-08-15 15:30:00,176,181,0,3,Wohninvest Weserstadion,Daniel Siebert
299,3890265,27,9,Regular Season,1,2015-08-15 15:30:00,172,173,0,1,WWK Arena,Tobias Welz
...,...,...,...,...,...,...,...,...,...,...,...,...
18,3890560,27,9,Regular Season,34,2016-05-14 15:30:00,872,185,0,2,Merck-Stadion am Böllenfalltor,Peter Sippel
17,3890562,27,9,Regular Season,34,2016-05-14 15:30:00,180,186,2,2,Signal-Iduna-Park,Michael Weiner
16,3890563,27,9,Regular Season,34,2016-05-14 15:30:00,904,189,3,2,BayArena,Guido Winkmann
15,3890564,27,9,Regular Season,34,2016-05-14 15:30:00,172,171,1,3,WWK Arena,Florian Meyer


In [7]:
df_table = (
    df_bundesliga
    .sort_values(["game_day", "game_date"])  # zuerst nach Spieltag, dann Spiel‐Uhrzeit
    .loc[:, [
        "game_day",
        "game_date",
        "home_team_id", "home_score",
        "away_score",   "away_team_id",
        "venue", "referee"
    ]]
    .reset_index(drop=True)
)

display(df_table)


Unnamed: 0,game_day,game_date,home_team_id,home_score,away_score,away_team_id,venue,referee
0,1,2015-08-14 20:30:00,169,5,0,171,Allianz Arena,Bastian Dankert
1,1,2015-08-15 15:30:00,172,0,1,173,WWK Arena,Tobias Welz
2,1,2015-08-15 15:30:00,176,0,3,181,Wohninvest Weserstadion,Daniel Siebert
3,1,2015-08-15 15:30:00,177,0,1,189,Mewa Arena,Guido Winkmann
4,1,2015-08-15 15:30:00,872,2,2,178,Merck-Stadion am Böllenfalltor,Felix Brych
...,...,...,...,...,...,...,...,...
301,34,2016-05-14 15:30:00,872,0,2,185,Merck-Stadion am Böllenfalltor,Peter Sippel
302,34,2016-05-14 15:30:00,169,3,1,178,Allianz Arena,Knut Kircher
303,34,2016-05-14 15:30:00,177,0,0,173,Mewa Arena,Sascha Stegemann
304,34,2016-05-14 15:30:00,176,1,0,184,Wohninvest Weserstadion,Deniz Aytekin


In [8]:
# Spiele in „Heim“ und „Auswärts“ auftrennen
home = df_table[['game_day','home_team_id','home_score','away_score']].copy()
home.columns = ['game_day','team_id','gf','ga']
away = df_table[['game_day','away_team_id','away_score','home_score']].copy()
away.columns = ['game_day','team_id','gf','ga']

df_long = pd.concat([home, away], ignore_index=True)

# Punkte und W/D/L berechnen
df_long['points'] = np.where(df_long.gf > df_long.ga, 3,
                      np.where(df_long.gf == df_long.ga, 1, 0))
df_long['win']  = (df_long.points == 3).astype(int)
df_long['draw'] = (df_long.points == 1).astype(int)
df_long['loss'] = (df_long.points == 0).astype(int)

# Saison-Statistiken pro Team aggregieren
season_table = (
    df_long.groupby('team_id')
    .agg(
      played = ('points','size'),
      points = ('points','sum'),
      wins   = ('win','sum'),
      draws  = ('draw','sum'),
      losses = ('loss','sum'),
      gf     = ('gf','sum'),
      ga     = ('ga','sum')
    )
    .assign(gd = lambda d: d.gf - d.ga)
    .sort_values(['points','gd','gf'], ascending=[False,False,False])
)
print(season_table)


         played  points  wins  draws  losses  gf  ga  gd
team_id                                                 
169          34      88    28      4       2  80  17  63
180          34      78    24      6       4  82  34  48
904          34      60    18      6      10  56  40  16
185          34      55    17      4      13  67  50  17
181          34      52    15      7      12  51  49   2
177          34      50    14      8      12  46  42   4
173          34      50    14      8      12  42  42   0
179          34      45    12      9      13  47  49  -2
186          34      43    10     13      11  38  42  -4
171          34      41    11      8      15  40  46  -6
189          34      40    10     10      14  33  42  -9
172          34      38     9     11      14  42  52 -10
176          34      38    10      8      16  50  65 -15
872          34      38     9     11      14  38  53 -15
175          34      37     9     10      15  39  54 -15
184          34      36     9  