## 0) Import and configure

In [3]:
# Import packages
import pandas as pd
import warnings

# Import functions
from src.data_processing import config_h5_file_paths, fetch_match_data, generate_match_data, split_games, split_match_data, adjust_results, store_match_data, load_match_data
from src.feature_processing import select_features, generate_features, store_features
from src.label_processing import select_labels, generate_labels, store_labels
from src.vaep_processing import load_features_labels, train_model, evaluate_model, store_predictions, compute_vaep, store_vaep, load_vaep, compare_vaep

In [4]:
# Ignore warnings
warnings.filterwarnings(
  action="ignore", 
  message="credentials were not supplied. open data access only"
)
warnings.simplefilter(
  action='ignore', 
  category=pd.errors.PerformanceWarning
)
pd.set_option('future.no_silent_downcasting', True)

warnings.filterwarnings(
    "ignore",
    message="A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method."
)

In [5]:
# Configure format, h5-file, and folder names
datafolder = ".\\xSuccess\\top5_15-16_spadl"
format = "spadl"

match_data_h5, match_data_train_h5, match_data_test_h5, match_data_test_success_h5, match_data_test_fail_h5, features_train_h5, features_test_h5, features_test_success_h5, features_test_fail_h5, labels_train_h5, labels_test_h5, labels_test_success_h5, labels_test_fail_h5, predictions_test_h5, predictions_test_success_h5, predictions_test_fail_h5, vaep_test_h5, vaep_test_success_h5, vaep_test_fail_h5 = config_h5_file_paths(
  datafolder=datafolder,
  format=format
)

The folder '.\xSuccess\top5_15-16_spadl' already exists.


## 1) Match data

### Fetch, generate, and store full match data (Only once!)

In [None]:
# Fetch match data
loader, selected_competitions, games = fetch_match_data(
  data_provider="statsbomb", 
  source="free", 
  seasons=["2015/2016"], 
  competitions=["Premier League", "La Liga", "1. Bundesliga", "Serie A", "Ligue 1"]
)

In [14]:
# Generate match data
teams, players, actions = generate_match_data(
  games=games,
  loader=loader,
  format=format
)

Converting match data: 100%|██████████| 1823/1823 [1:13:26<00:00,  2.42s/it]


In [15]:
# Store match data
store_match_data(
  games=games,
  teams=teams,
  players=players,
  actions=actions,
  format=format,
  match_data_h5=match_data_h5
)

Match data (spadl format) successfully stored.


### Generate and store train & test match data

In [6]:
# Load full match data
games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_h5
)

Loading match data: 100%|██████████| 1823/1823 [00:15<00:00, 115.50it/s]


Inspect h5 file if empty or not

In [None]:
print("Games:")
print(games.head())

In [None]:
# Falls 'actions' ein Dictionary von DataFrames ist:
for game_id, df in actions.items():
    print(f"Actions for game {game_id}: {df.shape[0]} rows")
    break  # Nur das erste Beispiel anzeigen

In [7]:
# Split games
train_games, test_games, validation_games = split_games(
  games=games,
  train_percentage=50,
  random_state=42,
  shuffle=True
)

In [8]:
# Split match data
split_player_games, split_actions = split_match_data(
    split_games=test_games,
    player_games=player_games,
    actions=actions
)

In [9]:
# Store train and test match data
store_match_data(
  games=train_games,
  teams=teams,
  players=players,
  player_games=split_player_games,
  actions=split_actions,
  format=format,
  match_data_h5=match_data_train_h5
)

Match data (spadl format) successfully stored.


In [None]:
print("Games:")
print(games.head())

In [10]:
# Store train and test match data
store_match_data(
  games=test_games,
  teams=teams,
  players=players,
  player_games=split_player_games,
  actions=split_actions,
  format=format,
  match_data_h5=match_data_test_h5
)

Match data (spadl format) successfully stored.


In [None]:
with pd.HDFStore(match_data_test_h5, 'r') as store:
    print("Gespeicherte Keys und deren Dimensionen:")
    for key in store.keys():
        df = store.get(key)
        print(f"Key: {key} -> {df.shape[0]} Zeilen, {df.shape[1]} Spalten")

### Change results for adjusted test match data

Brauche ich diese Zeile Code?

# If test match data is already stored, load it
train_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_train_h5)


In [11]:
# If test match data is already stored, load it
test_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_test_h5
)

Loading match data: 100%|██████████| 912/912 [00:07<00:00, 115.39it/s]


In [12]:
# Change results (outcome) of actions for each game and action of test match data
actions_success, actions_fail = adjust_results(
  test_games=test_games,
  actions=actions
)

In [13]:
# Store adjusted test_success match data
store_match_data(
  games=test_games,
  teams=teams,
  players=players,
  player_games=player_games,
  actions=actions_success,
  format=format,
  match_data_h5=match_data_test_success_h5
)

Match data (spadl format) successfully stored.


with pd.HDFStore(match_data_test_success_h5, 'r') as store:
    print("Gespeicherte Keys und deren Dimensionen:")
    for key in store.keys():
        df = store.get(key)
        print(f"Key: {key} -> {df.shape[0]} Zeilen, {df.shape[1]} Spalten")

In [14]:
# Store adjusted test_fail match data
store_match_data(
  games=test_games,
  teams=teams,
  players=players,
  player_games=player_games,
  actions=actions_fail,
  format=format,
  match_data_h5=match_data_test_fail_h5
)

Match data (spadl format) successfully stored.


In [None]:
with pd.HDFStore(match_data_test_fail_h5, 'r') as store:
    print("Gespeicherte Keys und deren Dimensionen:")
    for key in store.keys():
        df = store.get(key)
        print(f"Key: {key} -> {df.shape[0]} Zeilen, {df.shape[1]} Spalten")

# 2) VAEP values

## 2.1) Generate and store features & labels

In [15]:
# Load match data
games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_test_fail_h5
)

Loading match data: 100%|██████████| 912/912 [00:07<00:00, 114.22it/s]


### Select, generate, and store features

In [16]:
# Select features
xfns = select_features(
  format=format
)

## Feature_Train

In [17]:
# Generate features
X_dict = generate_features(
  games=games,
  xfns=xfns,
  nb_prev_actions=1,
  format=format,
  match_data_h5=match_data_train_h5
)

Generating features: 100%|██████████| 912/912 [00:29<00:00, 30.72it/s]


In [None]:
# Store features
store_features(
  X_dict=X_dict,
  format=format,
  features_h5=features_train_h5
)

## Feature Test Fail

In [None]:
# Generate features
X_dict = generate_features(
  games=games,
  xfns=xfns,
  nb_prev_actions=1,
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Store features
store_features(
  X_dict=X_dict,
  format=format,
  features_h5=features_test_fail_h5
)

In [None]:

with pd.HDFStore(features_test_fail_h5, 'r') as store:
    keys = store.keys()
    print("Anzahl der Keys in der Datei:", len(keys))

    # Beispiel: Laden der Features eines bestimmten Keys (z.B. des ersten)
    first_key = store.keys()[0]
    features_df = store[first_key]
    
# Tabellarische Darstellung der ersten Zeilen
#print(features_df.head())



with pd.HDFStore(features_test_fail_h5, 'r') as store:
    print("Verfügbare Keys in der Datei:")
    for key in store.keys():
        print(key)


### Select, generate, and store labels

In [None]:
# Select labels
yfns = select_labels(
  format=format
)

## Label Train

In [None]:
# Generate labels
Y_dict = generate_labels(
  games=test_games,
  yfns=yfns,
  format=format,
  match_data_h5=match_data_test_h5
)

In [None]:
# Store regular labels
store_labels(
  Y_dict=Y_dict,
  format=format,
  labels_h5=labels_test_h5
)

## Label test fail

In [None]:
# Generate labels
Y_dict = generate_labels(
  games=test_games,
  yfns=yfns,
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Store regular labels
store_labels(
  Y_dict=Y_dict,
  format=format,
  labels_h5=labels_test_fail_h5
)

In [None]:
with pd.HDFStore(labels_test_fail_h5, 'r') as store:
    keys = store.keys()
    print("Anzahl der Keys in der Datei:", len(keys))
    # Beispiel: Laden der Features eines bestimmten Keys (z.B. des ersten)
    first_key = store.keys()[0]
    labels_df = store[first_key]
    
# Tabellarische Darstellung der ersten Zeilen
print(labels_df.head())



## 2.2) Train VAEP model

In [None]:
# Load training data
train_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_train_h5
)

In [None]:
# Load features and labels for training data
X_train, Y_train = load_features_labels(
  split_games=train_games,
  nb_prev_actions=1,
  format=format,
  features_h5=features_train_h5,
  labels_h5=labels_train_h5
)

In [None]:
# Train the model
models = train_model(
  X_train=X_train,
  Y_train=Y_train,
  n_estimators=50,
  max_depth=3,
  n_jobs=-3,
  verbosity=1,
  enable_categorical=True
)

## 2.3) Compute VAEP values

In [None]:
# Load training data
test_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Load features and labels for testing data
X_test, Y_test = load_features_labels(
  split_games=test_games,
  nb_prev_actions=1,
  format=format,
  features_h5=features_test_fail_h5,
  labels_h5=labels_test_fail_h5
)

In [None]:
# Evaluate the model
Y_hat = evaluate_model(
  X_test=X_test,
  Y_test=Y_test,
  models=models
)

In [None]:
# Store predictions
store_predictions(
  test_games=test_games,
  Y_hat=Y_hat,
  format=format,
  match_data_h5=match_data_test_fail_h5,
  predictions_h5=predictions_test_fail_h5
)

In [None]:
# Compute VAEP values
vaep_values = compute_vaep(
  test_games=test_games,
  teams=teams,
  players=players,
  format=format,
  match_data_h5=match_data_test_fail_h5,
  predictions_h5=predictions_test_fail_h5
)

In [None]:
# Store VAEP values
store_vaep(
  vaep_values=vaep_values,
  format=format,
  vaep_h5=vaep_test_fail_h5
)

## 2.4) Compare VAEP values

In [None]:
# Load VAEP values
vaep_values = load_vaep(
  vaep_h5=vaep_test_h5
)

vaep_values_success = load_vaep(
  vaep_h5=vaep_test_success_h5
)

vaep_values_fail = load_vaep(
  vaep_h5=vaep_test_fail_h5
)

In [None]:
# VAEP comparison for each game
vaep_comparison_games = compare_vaep(
    vaep_values=vaep_values,
    vaep_values_success=vaep_values_success,
    vaep_values_fail=vaep_values_fail,
    comparison="games"
)

In [None]:
# VAEP comparison for each action
vaep_comparison_actions = compare_vaep(
    vaep_values=vaep_values,
    vaep_values_success=vaep_values_success,
    vaep_values_fail=vaep_values_fail,
    comparison="actions"
)

In [None]:
# VAEP comparison for each action type
vaep_comparison_action_types = compare_vaep(
    vaep_values=vaep_values,
    vaep_values_success=vaep_values_success,
    vaep_values_fail=vaep_values_fail,
    comparison="action_types"
)

In [None]:
vaep_values.result_name.value_counts() / len(vaep_values)

# 3) xSuccess

## 3.1) Filter match data

In [None]:
# Load match data
games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_train_h5
)

In [None]:
# Filter offensive actions for each game

## 3.2) Generate and store features & labels

In [None]:
# Load match data
games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_test_fail_h5
)

### Select, generate, and store features

In [None]:
# Select features
xfns = select_features(
  format=format
)

In [None]:
# Generate features
X_dict = generate_features(
  games=games,
  xfns=xfns,
  nb_prev_actions=1,
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Store features
store_features(
  X_dict=X_dict,
  format=format,
  features_h5=features_test_fail_h5
)

### Select, generate, and store labels

In [None]:
# Select labels
yfns = select_labels(
  format=format
)

In [None]:
# Generate labels
Y_dict = generate_labels(
  games=test_games,
  yfns=yfns,
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Store regular labels
store_labels(
  Y_dict=Y_dict,
  format=format,
  labels_h5=labels_test_fail_h5
)

## 3.3) Train xSuccess model

In [None]:
# Load training data
train_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_train_h5
)

In [None]:
# Load features and labels for training data
X_train, Y_train = load_features_labels(
  split_games=train_games,
  nb_prev_actions=1,
  format=format,
  features_h5=features_train_h5,
  labels_h5=labels_train_h5
)

In [None]:
# Train the model
models = train_model(
  X_train=X_train,
  Y_train=Y_train,
  n_estimators=50,
  max_depth=3,
  n_jobs=-3,
  verbosity=1,
  enable_categorical=True
)

## 3.4) Compute xSuccess values

In [None]:
# Load training data
test_games, teams, players, player_games, actions = load_match_data(
  format=format,
  match_data_h5=match_data_test_fail_h5
)

In [None]:
# Load features and labels for testing data
X_test, Y_test = load_features_labels(
  split_games=test_games,
  nb_prev_actions=1,
  format=format,
  features_h5=features_test_fail_h5,
  labels_h5=labels_test_fail_h5
)

In [None]:
# Evaluate the model
Y_hat = evaluate_model(
  X_test=X_test,
  Y_test=Y_test,
  models=models
)

In [None]:
# Store predictions
store_predictions(
  test_games=test_games,
  Y_hat=Y_hat,
  format=format,
  match_data_h5=match_data_test_fail_h5,
  predictions_h5=predictions_test_fail_h5
)

In [None]:
# Compute VAEP values
vaep_values = compute_vaep(
  test_games=test_games,
  teams=teams,
  players=players,
  format=format,
  match_data_h5=match_data_test_fail_h5,
  predictions_h5=predictions_test_fail_h5
)

In [None]:
# Store VAEP values
store_vaep(
  vaep_values=vaep_values,
  format=format,
  vaep_h5=vaep_test_fail_h5
)

# Nächste Schritte

Expected Success -> Paper

- erstmal nur offensive Aktionen
- resultname als label, offside = fail

- features: start-position, eventtype, body, direction (Winkel), result von letzten 3 events -> Erfolg jeder Art von Event

- VAEP-Wert ohne result-feature -> Torwahrscheinlichkeit offensiv (immer success in der Anwendung des Modells)

- dann fehlpässe darauf trainieren, ob dananch ein tor (in 10 Aktionen danach) fällt -> Torwahrscheinlichkeit nach Ballverlust (Defensiv-Risiko) (immer fail in der Anwendung des Modells)

- Biermann Counter-Attacks Paper durchschauen

# Tests

In [None]:
import socceraction.vaep.features as fs

nb_prev_actions = 1

xfns = [
  fs.actiontype,
  fs.actiontype_onehot,
  fs.bodypart,
  fs.bodypart_onehot,
  fs.result,
  fs.result_onehot,
  fs.goalscore,
  fs.startlocation,
  fs.endlocation,
  fs.movement,
  fs.space_delta,
  fs.startpolar,
  fs.endpolar,
  fs.team,
  fs.time,
  fs.time_delta
  ]

Xcols = fs.feature_column_names(xfns, nb_prev_actions)

Ycols = ["scores","concedes"]

In [None]:
import tqdm

features_loaded = {}
for game_id in tqdm.tqdm(train_games.game_id, desc="Loading features"):
    Xi = pd.read_hdf(features_train_h5, f"game_{game_id}")
    x = Xi[Xcols]
    features_loaded[game_id] = x


In [None]:
f = features_loaded[3825751]

In [None]:
features_success = {
  key: df.assign(
    result_a0="success", 
    result_fail_a0="false",
    result_success_a0="true",
    result_offside_a0="false",
    result_owngoal_a0="false",
    result_yellow_card_a0="false",
    result_red_card_a0="false"
  ) if key in set(test_games["game_id"]) else df for key, df in features_loaded.items()
}

In [None]:
f_succ = features_success[3825751]

In [None]:
labels_loaded = {}
for game_id in tqdm.tqdm(games.game_id, desc="Loading label"):
  Yi = pd.read_hdf(labels_test_h5, f"game_{game_id}")
  y = Yi[Ycols]
  labels_loaded[game_id] = y

In [None]:
l = labels_loaded[3825751]

In [None]:
a = actions[0]

In [None]:
v = vaep_values[vaep_values["game_id"] == 3754035]

In [None]:
# Load actions 
import socceraction.spadl as spadl
actions = {}

for game in list(games.itertuples()):
  with pd.HDFStore(match_data_h5) as matchdatastore:
   actions[game.game_id] = (
        matchdatastore[f"actions/game_{game.game_id}"]
        .merge(spadl.actiontypes_df(), how="left")
        .merge(spadl.results_df(), how="left")
        .merge(spadl.bodyparts_df(), how="left")
        .merge(matchdatastore["players"], how="left")
        .merge(matchdatastore["teams"], how="left")
   )

In [None]:
action_game = actions[3890562]

In [None]:
action_game[action_game["type_name"] == "interception"]

In [None]:
action_game.type_name.value_counts()

In [None]:
games[games["venue"] == "VELTINS-Arena"]

In [None]:
a = actions[3890293]

In [None]:
a[a["type_id"] == 12]