## Environment/Import/Configuration Setup

In [None]:
# Kernel / display bootstrap for the notebook.
# Statement order is kept as in the original cell so import-time behaviour
# (which warnings are visible while each package loads) does not change.
import sys
# Show which Python interpreter the kernel is running on.
print(sys.executable)
import ipywidgets as widgets
# Instantiates a slider without displaying it.
# NOTE(review): presumably a smoke test that ipywidgets is usable in this
# kernel (tqdm progress widgets rely on it) — confirm or remove.
widgets.IntSlider()
import pandas as pd
# Opt in to the pandas 'future.no_silent_downcasting' behaviour.
pd.set_option('future.no_silent_downcasting', True)
import warnings
# Silence warnings for the rest of the session. The blanket "ignore" already
# covers every category; the two category filters are kept (once each) only to
# make the intent explicit. Repeated identical registrations were removed.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import matplotlib
# Route matplotlib output through the inline backend.
matplotlib.use('module://matplotlib_inline.backend_inline')

In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

# VAEP Model Training (Top-5 Leagues, 2015/16, XGBoost)

This script trains a **VAEP model** using **StatsBomb open data** for the **Top 5 European leagues** in the **2015/16 season**.

---

## Steps

1. **Load data**  
   - Uses `StatsBombLoader` (local open-data).  
   - Selects Premier League, La Liga, Serie A, Bundesliga, Ligue 1 (2015/16).

2. **Convert events → actions**  
   - With SPADL (`convert_to_actions`, `add_names`).  
   - Builds gamestates (previous 3 actions).  
   - Standardizes play direction.  

3. **Feature engineering**  
   - Encodes action type, result, body part, start/end location, movement, time.  

4. **Labels (VAEP)**  
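   - Training target per action: `scores − concedes`, where each term is a 0/1 label for whether a goal is scored / conceded within the next 10 actions (so the target is −1, 0 or +1).
   - The regressor's prediction therefore estimates `VAEP = p(score in next 10 actions) - p(concede in next 10 actions)`.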

5. **Train model**  
   - Fit an `XGBRegressor` that maps the features to the VAEP target.  
   - Learns how context + action features affect value.  

6. **Save output**  
   - Saves model, feature columns, competitions, season info with `joblib`.

---

**Note:** Re-running this notebook retrains the model from scratch, so the resulting model — and the probability scores (VAEP values) it predicts — may differ from one run to the next.

In [None]:
# Import libraries
import os

import joblib
import numpy as np
import pandas as pd
from socceraction import spadl
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.spadl import statsbomb as spadl_sb
from socceraction.vaep import features as feat, labels as lab
from tqdm import tqdm
from xgboost import XGBRegressor

# Location of the local StatsBomb open-data checkout (its "data" directory).
# The path can be overridden with the STATSBOMB_DATA_ROOT environment variable
# so the notebook runs on other machines; the default is the original path.
STATSBOMB_DATA_ROOT = os.environ.get(
    "STATSBOMB_DATA_ROOT",
    "C:/Users/helio/MachineLearning/Bachelor/statsbomb_data/open-data-master/data",
)

# Load StatsBomb data from local files (no network access).
sbl = StatsBombLoader(
    getter="local",  # from local files
    root=STATSBOMB_DATA_ROOT,
)

# (competition_id, season_id) pairs to load. All five leagues share
# season_id 27, which is the 2015/2016 season in StatsBomb open data.
# The order below is the order in which the leagues are processed.
competitions = [
    (league_id, 27)
    for league_id in (
        2,   # Premier League
        9,   # Bundesliga
        7,   # Ligue 1
        11,  # La Liga
        12,  # Serie A
    )
]

# Enable progress bar support in pandas
tqdm.pandas()

# Number of actions in each game state, as passed to feat.gamestates.
NB_PREV_ACTIONS = 3
# Look-ahead window (in actions) used when labelling scores / concedes.
LABEL_HORIZON = 10

# Feature transformers applied to every game state. The list does not depend
# on the match, so it is built once here instead of once per match.
feature_functions = [
    feat.actiontype_onehot, feat.result_onehot, feat.bodypart_onehot,
    feat.startlocation, feat.endlocation, feat.movement, feat.time
]

# One feature+target DataFrame per match, across all competitions
all_actions = []

# Loop through each competition and season pair
for comp_id, season_id in competitions:
    # Load all games for this specific competition and season
    games = sbl.games(comp_id, season_id)
    print(f"Charge games for the competition : {comp_id}, saison {season_id} ({len(games)} matchs)")

    # Iterate over each game in the competition
    for _, match in tqdm(games.iterrows(), total=len(games), desc=f"Comp {comp_id}"):

        # Identifier of the game
        game_id = match["game_id"]
        # Home team id, needed to standardize the direction of play
        home_id = match["home_team_id"]

        # Load raw event data for this game
        events = sbl.events(game_id)
        # Convert raw events to SPADL actions
        actions = spadl_sb.convert_to_actions(events, home_id)
        # Add descriptive action names
        actions = spadl.add_names(actions)

        # Build game states (context window of NB_PREV_ACTIONS actions)
        gamestates = feat.gamestates(actions, nb_prev_actions=NB_PREV_ACTIONS)
        # Standardize direction so play always goes left-to-right
        gamestates = feat.play_left_to_right(gamestates, home_id)

        # Apply each feature function and combine the results column-wise
        X = pd.concat([f(gamestates) for f in feature_functions], axis=1)

        # Labels: was a goal scored / conceded within the next LABEL_HORIZON actions?
        scores_df = lab.scores(actions, nr_actions=LABEL_HORIZON)
        concedes_df = lab.concedes(actions, nr_actions=LABEL_HORIZON)
        # Regression target = scores - concedes after casting both labels to int.
        # These are per-action labels, not probabilities; the probabilities are
        # what the trained regressor is meant to estimate.
        # NOTE(review): presumably both columns are boolean, giving a target in
        # {-1, 0, +1} — confirm against the socceraction label docs.
        vaep_values = scores_df['scores'].astype(int) - concedes_df['concedes'].astype(int)

        # Attach the target (aligned on the action index) next to the features
        X["vaep"] = vaep_values
        # Keep this match's frame for the final concatenation
        all_actions.append(X)


# Stack the per-match frames into one dataset, then discard every row that
# still contains a missing value.
training_data = pd.concat(all_actions, axis=0)
training_data = training_data.dropna()

# Split into target (y_all) and feature matrix (X_all)
y_all = training_data["vaep"]
X_all = training_data.drop(columns=["vaep"])

print(f" Size of data : {len(X_all)} exemples")

# Hyper-parameters of the gradient-boosted regressor
xgb_params = {
    "n_estimators": 200,              # number of boosted trees
    "max_depth": 6,                   # maximum depth of each tree
    "learning_rate": 0.1,             # shrinkage applied to each boosting step
    "subsample": 0.8,                 # fraction of rows sampled per tree
    "colsample_bytree": 0.8,          # fraction of feature columns sampled per tree
    "objective": "reg:squarederror",  # squared-error regression loss
    "n_jobs": -1,                     # use all CPU cores
    "random_state": 23,               # seed for the row / column subsampling above
}

# Build the regressor and fit it on the complete dataset (no hold-out split)
model = XGBRegressor(**xgb_params)
model.fit(X_all, y_all)

# Output file for the trained model bundle. It is defined once so the path
# written to disk and the path reported in the log message cannot drift apart.
MODEL_PATH = "vaep_model_xgboost_5leagues_1516.pkl"

# Bundle the fitted model with the metadata needed to reuse it:
# the exact feature column order plus the competitions/season it was trained on.
model_bundle = {
    "model": model,
    "feature_columns": X_all.columns.tolist(),
    "competitions": competitions,
    "season_name": "2015/2016",
    "description": "VAEP model with Xgboost for 5 competitions 2015/2016",
}

# Save model and metadata
joblib.dump(model_bundle, MODEL_PATH)

print(f"VAEP Xgboost saved in :  '{MODEL_PATH}'")