Hier soll eine geordnete Main entstehen, die alle wichtigen Funktionen importiert
Beispiel:
- xgboost
- gradient_boosting

In [1]:
#import
import pandas as pd
import warnings
import xgboost as xgb
import socceraction.spadl as spadl
import numpy as np

# Import functions
from src.data_processing import config_h5_file_paths, fetch_match_data, generate_match_data, split_games, split_match_data, adjust_results, store_match_data, load_match_data
from src.feature_processing import select_features, generate_features, store_features
from src.label_processing import select_labels, generate_labels, store_labels
from src.vaep_processing import load_features_labels, train_model, evaluate_model, store_predictions, compute_vaep, store_vaep, load_vaep, compare_vaep, compare_vaep_on_actions

In [3]:
def generate_full_match_data(
    datafolder,
    format,
    data_provider="statsbomb",
    source="free",
    seasons=None,
    competitions=None,
):
    """Fetch, convert and store the full (unsplit) match data set.

    Args:
        datafolder: Folder forwarded to ``config_h5_file_paths`` to resolve
            the HDF5 file paths.
        format: Data format identifier forwarded to every pipeline helper
            (``main`` passes ``"spadl"``). The name shadows the builtin
            ``format`` inside this function; it is kept for caller
            compatibility.
        data_provider: Forwarded to ``fetch_match_data``.
        source: Forwarded to ``fetch_match_data``.
        seasons: Seasons forwarded to ``fetch_match_data``. Defaults to
            ``["2015/2016"]``.
        competitions: Competition names forwarded to ``fetch_match_data``.
            Defaults to Premier League, La Liga, 1. Bundesliga, Serie A and
            Ligue 1.

    Returns:
        None. The result is handed to ``store_match_data`` together with the
        first path returned by ``config_h5_file_paths``.
    """
    # None sentinels avoid sharing one mutable default list between calls.
    if seasons is None:
        seasons = ["2015/2016"]
    if competitions is None:
        competitions = [
            "Premier League",
            "La Liga",
            "1. Bundesliga",
            "Serie A",
            "Ligue 1",
        ]

    # Only the first configured path (the full match data file) is needed here.
    match_data_h5, *_other_paths = config_h5_file_paths(
        datafolder=datafolder,
        format=format,
    )

    # Fetch match data
    loader, _selected_competitions, games = fetch_match_data(
        data_provider=data_provider,
        source=source,
        seasons=seasons,
        competitions=competitions,
    )

    # Generate match data
    comps, teams, players, actions = generate_match_data(
        games=games,
        loader=loader,
        format=format,
    )

    # Store match data
    store_match_data(
        competitions=comps,
        games=games,
        teams=teams,
        players=players,
        actions=actions,
        format=format,
        match_data_h5=match_data_h5,
    )

In [4]:
def generate_training_data(datafolder, format):
    """Create and persist the training split: match data, features, labels.

    Loads the full match data, splits the games, keeps the training part and
    writes the training match data, its features and its labels to the
    corresponding HDF5 paths.

    Args:
        datafolder: Folder forwarded to ``config_h5_file_paths``.
        format: Data format identifier forwarded to every pipeline helper.

    Returns:
        None.
    """
    # config_h5_file_paths yields 19 paths; only the full/train ones are used here.
    (
        match_data_h5,
        match_data_train_h5,
        match_data_test_h5,
        match_data_test_success_h5,
        match_data_test_fail_h5,
        features_train_h5,
        features_test_h5,
        features_test_success_h5,
        features_test_fail_h5,
        labels_train_h5,
        labels_test_h5,
        labels_test_success_h5,
        labels_test_fail_h5,
        predictions_test_h5,
        predictions_test_success_h5,
        predictions_test_fail_h5,
        vaep_test_h5,
        vaep_test_success_h5,
        vaep_test_fail_h5,
    ) = config_h5_file_paths(datafolder=datafolder, format=format)

    # Read the complete (unsplit) match data.
    comps, all_games, teams, players, all_player_games, all_actions = load_match_data(
        match_data_h5=match_data_h5,
        format=format,
    )

    # The same split arguments are repeated in generate_normal_test_data and
    # generate_success_fail_test_data; keep them in sync.
    train_games, _test_games, _validation_games = split_games(
        games=all_games,
        train_percentage=50,
        random_state=42,
        shuffle=True,
    )

    # Restrict player/game and action records to the training games and store them.
    train_player_games, train_actions = split_match_data(
        split_games=train_games,
        player_games=all_player_games,
        actions=all_actions,
    )
    store_match_data(
        competitions=comps,
        games=train_games,
        teams=teams,
        players=players,
        player_games=train_player_games,
        actions=train_actions,
        format=format,
        match_data_h5=match_data_train_h5,
    )

    # Features: select the feature functions, compute, persist.
    feature_fns = select_features(format=format)
    train_features = generate_features(
        games=train_games,
        xfns=feature_fns,
        nb_prev_actions=1,
        format=format,
        match_data_h5=match_data_train_h5,
    )
    store_features(
        X_dict=train_features,
        format=format,
        features_h5=features_train_h5,
    )

    # Labels: select the label functions, compute, persist.
    label_fns = select_labels(format=format)
    train_labels = generate_labels(
        games=train_games,
        yfns=label_fns,
        format=format,
        match_data_h5=match_data_train_h5,
    )
    store_labels(
        Y_dict=train_labels,
        format=format,
        labels_h5=labels_train_h5,
    )

In [5]:
def generate_normal_test_data(datafolder, format):
    """Create and persist the unmodified test split: match data, features, labels.

    Loads the full match data, splits the games, keeps the test part and
    writes the test match data, its features and its labels to the
    corresponding HDF5 paths.

    Args:
        datafolder: Folder forwarded to ``config_h5_file_paths``.
        format: Data format identifier forwarded to every pipeline helper.

    Returns:
        None.
    """
    # config_h5_file_paths yields 19 paths; only the full/test ones are used here.
    (
        match_data_h5,
        match_data_train_h5,
        match_data_test_h5,
        match_data_test_success_h5,
        match_data_test_fail_h5,
        features_train_h5,
        features_test_h5,
        features_test_success_h5,
        features_test_fail_h5,
        labels_train_h5,
        labels_test_h5,
        labels_test_success_h5,
        labels_test_fail_h5,
        predictions_test_h5,
        predictions_test_success_h5,
        predictions_test_fail_h5,
        vaep_test_h5,
        vaep_test_success_h5,
        vaep_test_fail_h5,
    ) = config_h5_file_paths(datafolder=datafolder, format=format)

    # Read the complete (unsplit) match data.
    comps, all_games, teams, players, all_player_games, all_actions = load_match_data(
        match_data_h5=match_data_h5,
        format=format,
    )

    # The same split arguments are repeated in generate_training_data and
    # generate_success_fail_test_data; keep them in sync.
    _train_games, test_games, _validation_games = split_games(
        games=all_games,
        train_percentage=50,
        random_state=42,
        shuffle=True,
    )

    # Restrict player/game and action records to the test games and store them.
    test_player_games, test_actions = split_match_data(
        split_games=test_games,
        player_games=all_player_games,
        actions=all_actions,
    )
    store_match_data(
        competitions=comps,
        games=test_games,
        teams=teams,
        players=players,
        player_games=test_player_games,
        actions=test_actions,
        format=format,
        match_data_h5=match_data_test_h5,
    )

    # Features: select the feature functions, compute, persist.
    feature_fns = select_features(format=format)
    test_features = generate_features(
        games=test_games,
        xfns=feature_fns,
        nb_prev_actions=1,
        format=format,
        match_data_h5=match_data_test_h5,
    )
    store_features(
        X_dict=test_features,
        format=format,
        features_h5=features_test_h5,
    )

    # Labels: select the label functions, compute, persist.
    label_fns = select_labels(format=format)
    test_labels = generate_labels(
        games=test_games,
        yfns=label_fns,
        format=format,
        match_data_h5=match_data_test_h5,
    )
    store_labels(
        Y_dict=test_labels,
        format=format,
        labels_h5=labels_test_h5,
    )

In [6]:
def generate_success_fail_test_data(datafolder, format):
    """Create and persist the "success" and "fail" variants of the test split.

    Loads the full match data, splits the games, asks ``adjust_results`` for
    two adjusted action tables (success / fail) and, for each variant, stores
    the match data, its features and its labels to the corresponding HDF5
    paths.

    Args:
        datafolder: Folder forwarded to ``config_h5_file_paths``.
        format: Data format identifier forwarded to every pipeline helper.

    Returns:
        None.
    """
    # config_h5_file_paths yields 19 paths; only the full and success/fail ones are used here.
    (
        match_data_h5,
        match_data_train_h5,
        match_data_test_h5,
        match_data_test_success_h5,
        match_data_test_fail_h5,
        features_train_h5,
        features_test_h5,
        features_test_success_h5,
        features_test_fail_h5,
        labels_train_h5,
        labels_test_h5,
        labels_test_success_h5,
        labels_test_fail_h5,
        predictions_test_h5,
        predictions_test_success_h5,
        predictions_test_fail_h5,
        vaep_test_h5,
        vaep_test_success_h5,
        vaep_test_fail_h5,
    ) = config_h5_file_paths(datafolder=datafolder, format=format)

    # Read the complete (unsplit) match data.
    comps, all_games, teams, players, all_player_games, all_actions = load_match_data(
        match_data_h5=match_data_h5,
        format=format,
    )

    # The same split arguments are repeated in generate_training_data and
    # generate_normal_test_data; keep them in sync.
    _train_games, test_games, _validation_games = split_games(
        games=all_games,
        train_percentage=50,
        random_state=42,
        shuffle=True,
    )

    # Only the player/game part of the split is used below.
    test_player_games, _test_actions = split_match_data(
        split_games=test_games,
        player_games=all_player_games,
        actions=all_actions,
    )

    # Change the results (outcome) of the actions for the test games.
    # NOTE(review): the full action table is passed here, not the test-only
    # split; the original author left this as an open question - confirm
    # which one adjust_results expects.
    actions_success, actions_fail = adjust_results(
        test_games=test_games,
        actions=all_actions,
    )

    # (adjusted actions, match data path, features path, labels path) per variant;
    # success is always processed before fail.
    variants = (
        (
            actions_success,
            match_data_test_success_h5,
            features_test_success_h5,
            labels_test_success_h5,
        ),
        (
            actions_fail,
            match_data_test_fail_h5,
            features_test_fail_h5,
            labels_test_fail_h5,
        ),
    )

    # Store the adjusted match data for both variants.
    for variant_actions, variant_match_h5, _features_h5, _labels_h5 in variants:
        store_match_data(
            competitions=comps,
            games=test_games,
            teams=teams,
            players=players,
            player_games=test_player_games,
            actions=variant_actions,
            format=format,
            match_data_h5=variant_match_h5,
        )

    # Features for both variants.
    feature_fns = select_features(format=format)
    for _actions, variant_match_h5, variant_features_h5, _labels_h5 in variants:
        variant_features = generate_features(
            games=test_games,
            xfns=feature_fns,
            nb_prev_actions=1,
            format=format,
            match_data_h5=variant_match_h5,
        )
        store_features(
            X_dict=variant_features,
            format=format,
            features_h5=variant_features_h5,
        )

    # Labels for both variants.
    label_fns = select_labels(format=format)
    for _actions, variant_match_h5, _features_h5, variant_labels_h5 in variants:
        variant_labels = generate_labels(
            games=test_games,
            yfns=label_fns,
            format=format,
            match_data_h5=variant_match_h5,
        )
        store_labels(
            Y_dict=variant_labels,
            format=format,
            labels_h5=variant_labels_h5,
        )

In [7]:
def _evaluate_and_store_vaep(
    models,
    label,
    format,
    match_data_h5,
    features_h5,
    labels_h5,
    predictions_h5,
    vaep_h5,
):
    """Evaluate trained models on one stored data set and persist the results.

    Loads the match data, features and labels from the given paths, evaluates
    ``models`` on them, stores the predictions, computes and stores the VAEP
    values, and prints the shape of the VAEP result.

    Args:
        models: Object returned by ``train_model``.
        label: Name used as the prefix of the printed shape line.
        format: Data format identifier forwarded to every pipeline helper.
        match_data_h5: Path of the match data to evaluate on.
        features_h5: Path of the matching features.
        labels_h5: Path of the matching labels.
        predictions_h5: Path the predictions are stored to.
        vaep_h5: Path the VAEP values are stored to.

    Returns:
        The object returned by ``compute_vaep``.
    """
    _comps, games, teams, players, _player_games, _actions = load_match_data(
        format=format,
        match_data_h5=match_data_h5,
    )

    X_test, Y_test = load_features_labels(
        split_games=games,
        nb_prev_actions=1,
        format=format,
        features_h5=features_h5,
        labels_h5=labels_h5,
    )

    Y_hat = evaluate_model(
        X_test=X_test,
        Y_test=Y_test,
        models=models,
    )

    store_predictions(
        test_games=games,
        Y_hat=Y_hat,
        format=format,
        match_data_h5=match_data_h5,
        predictions_h5=predictions_h5,
    )

    # compute_vaep is given the prediction path written just above.
    vaep_values = compute_vaep(
        test_games=games,
        teams=teams,
        players=players,
        format=format,
        match_data_h5=match_data_h5,
        predictions_h5=predictions_h5,
    )

    store_vaep(
        vaep_values=vaep_values,
        format=format,
        vaep_h5=vaep_h5,
    )

    print(f"{label} shape: ", vaep_values.shape, "\n")
    return vaep_values


def train_all_vaep_model(datafolder, format="spadl"):  # train_vaep_model
    """Train the model on the training split and evaluate all three test sets.

    The model is trained once on the stored training features and labels and
    then evaluated, in this order, on the normal, the "fail" and the
    "success" test data. Predictions and VAEP values are stored for each.

    Args:
        datafolder: Folder forwarded to ``config_h5_file_paths``.
        format: Data format identifier forwarded to every pipeline helper.
            Defaults to ``"spadl"``, the value ``main`` uses. The previous
            default expression ``format=format`` bound the builtin ``format``
            function, because no module-level ``format`` variable exists.

    Returns:
        None.
    """
    (
        match_data_h5,
        match_data_train_h5,
        match_data_test_h5,
        match_data_test_success_h5,
        match_data_test_fail_h5,
        features_train_h5,
        features_test_h5,
        features_test_success_h5,
        features_test_fail_h5,
        labels_train_h5,
        labels_test_h5,
        labels_test_success_h5,
        labels_test_fail_h5,
        predictions_test_h5,
        predictions_test_success_h5,
        predictions_test_fail_h5,
        vaep_test_h5,
        vaep_test_success_h5,
        vaep_test_fail_h5,
    ) = config_h5_file_paths(
        datafolder=datafolder,
        format=format,
    )

    # Load training data (only the games are needed to load features/labels).
    _comps, train_games, _teams, _players, _player_games, _actions = load_match_data(
        format=format,
        match_data_h5=match_data_train_h5,
    )

    # Load features and labels for training data
    X_train, Y_train = load_features_labels(
        split_games=train_games,
        nb_prev_actions=1,
        format=format,
        features_h5=features_train_h5,
        labels_h5=labels_train_h5,
    )

    # Train the model
    models = train_model(
        X_train=X_train,
        Y_train=Y_train,
        n_estimators=50,
        max_depth=3,
        n_jobs=-1,
        verbosity=1,
        enable_categorical=True,
    )

    # Normal test data
    _evaluate_and_store_vaep(
        models=models,
        label="vaep_values",
        format=format,
        match_data_h5=match_data_test_h5,
        features_h5=features_test_h5,
        labels_h5=labels_test_h5,
        predictions_h5=predictions_test_h5,
        vaep_h5=vaep_test_h5,
    )

    # Test data with "fail" outcomes
    _evaluate_and_store_vaep(
        models=models,
        label="vaep_values_fail",
        format=format,
        match_data_h5=match_data_test_fail_h5,
        features_h5=features_test_fail_h5,
        labels_h5=labels_test_fail_h5,
        predictions_h5=predictions_test_fail_h5,
        vaep_h5=vaep_test_fail_h5,
    )

    # Test data with "success" outcomes
    _evaluate_and_store_vaep(
        models=models,
        label="vaep_values_success",
        format=format,
        match_data_h5=match_data_test_success_h5,
        features_h5=features_test_success_h5,
        labels_h5=labels_test_success_h5,
        predictions_h5=predictions_test_success_h5,
        vaep_h5=vaep_test_success_h5,
    )



In [None]:
def main(regenerate_match_data=False):
    """Run the pipeline end to end for the "spadl" data folder.

    Steps, in order: training data, normal test data, success/fail test
    data, then model training and evaluation. All steps receive the same
    ``datafolder`` and ``format``.

    Args:
        regenerate_match_data: When true, ``generate_full_match_data`` is run
            first. It is off by default because that step was disabled in
            the original notebook; the recorded notebook output shows the
            conversion of 1823 games taking about 1h15m.

    Returns:
        None.
    """
    # Local import: os is not among the notebook's top-level imports.
    import os

    # Configure format and data folder. os.path.join yields the same
    # ".\\xSuccess\\top5_15-16_spadl" string on Windows as the previous
    # hard-coded literal, and a valid path on other platforms.
    datafolder = os.path.join(".", "xSuccess", "top5_15-16_spadl")
    format = "spadl"

    if regenerate_match_data:
        generate_full_match_data(
            datafolder=datafolder,
            format=format,
        )

    pipeline_steps = (
        generate_training_data,
        generate_normal_test_data,
        generate_success_fail_test_data,
        train_all_vaep_model,
    )
    for step in pipeline_steps:
        step(
            datafolder=datafolder,
            format=format,
        )

----------------

In [9]:
# Ignore warnings
warnings.filterwarnings(
  action="ignore", 
  message="credentials were not supplied. open data access only"
)
warnings.simplefilter(
  action='ignore', 
  category=pd.errors.PerformanceWarning
)
pd.set_option('future.no_silent_downcasting', True)

warnings.filterwarnings(
    "ignore",
    message="A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method."
)

In [10]:
# Entry point: run the whole pipeline via main(), then print a completion marker.
if __name__ == "__main__":
    main()
    print("Done")


The folder '.\xSuccess\top5_15-16_spadl' already exists.


Converting match data: 100%|██████████| 1823/1823 [1:15:51<00:00,  2.50s/it]


Match data (spadl) with xG successfully stored to .\xSuccess\top5_15-16_spadl\match_data.h5.
Done
