In [1]:
%load_ext autoreload
%autoreload 2
import os; import sys; sys.path.insert(0,'../')
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
import sys
sys.path.insert(0, '../../')

import atomic.features as fs
import atomic.labels as lab

In [3]:
## Configure file and folder names
# All HDF5 stores used by this notebook sit side by side in one data folder.
datafolder = "../data/socceraction"
spadl_h5, features_h5, labels_h5, predictions_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb.h5",               # games table and per-game atomic actions (read)
        "atomic-features.h5",               # per-game feature tables (read)
        "atomic-labels.h5",                 # per-game label tables (read)
        "atomic-predictions-3-actions.h5",  # per-game predictions (written)
    )
)

In [4]:
# The same set of games is used for fitting and for evaluation/prediction,
# so both names below refer to the full games table.
games = pd.read_hdf(spadl_h5, key="games")
traingames = testgames = games
print("nb of train games and test games:", len(traingames), len(testgames))

nb of train games and test games: 452 452


In [5]:
# 1. Select feature set X
# Size of the action window passed to feature_column_names; the resulting
# column names carry an `_a0`, `_a1`, ... suffix (see the printed X columns).
nb_prev_actions = 3

# Feature generators taken from atomic.features.
xfns = [
    fs.actiontype_onehot,
    fs.bodypart_onehot,
    fs.goalscore,
    fs.location,
    fs.polar,
    fs.direction,
    fs.team,
    fs.time,
    fs.time_delta,
]

Xcols = fs.feature_column_names(xfns, nb_prev_actions)

def getXY(games, Xcols):
    """Assemble the feature matrix and the label frame for a set of games.

    For every id in ``games.game_id`` the table ``game_<id>`` is read from
    ``features_h5`` (keeping only ``Xcols``) and from ``labels_h5`` (keeping
    the ``scores`` and ``concedes`` columns). The per-game frames are stacked
    in iteration order and re-indexed from 0.

    Parameters
    ----------
    games : object exposing a ``game_id`` iterable (here: the games DataFrame)
    Xcols : list of feature column names to keep

    Returns
    -------
    (X, Y) : tuple of DataFrames with the selected features and the labels
    """
    # 1. Features: one frame per game, restricted to the requested columns.
    feature_frames = [
        pd.read_hdf(features_h5, f"game_{gid}")[Xcols]
        for gid in tqdm(games.game_id, desc="selecting features")
    ]
    X = pd.concat(feature_frames).reset_index(drop=True)

    # 2. Select label Y
    label_cols = ["scores", "concedes"]
    label_frames = [
        pd.read_hdf(labels_h5, f"game_{gid}")[label_cols]
        for gid in tqdm(games.game_id, desc="selecting label")
    ]
    Y = pd.concat(label_frames).reset_index(drop=True)

    return X, Y

# Build the training matrices; missing feature values are replaced by 0.
X, Y = getXY(traingames, Xcols)
X = X.fillna(0)
print("X:", X.columns.tolist())
print("Y:", Y.columns.tolist())

HBox(children=(FloatProgress(value=0.0, description='selecting features', max=452.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='selecting label', max=452.0, style=ProgressStyle(descript…


X: ['type_pass_a0', 'type_cross_a0', 'type_throw_in_a0', 'type_freekick_crossed_a0', 'type_freekick_short_a0', 'type_corner_crossed_a0', 'type_corner_short_a0', 'type_take_on_a0', 'type_foul_a0', 'type_tackle_a0', 'type_interception_a0', 'type_shot_a0', 'type_shot_penalty_a0', 'type_shot_freekick_a0', 'type_keeper_save_a0', 'type_keeper_claim_a0', 'type_keeper_punch_a0', 'type_keeper_pick_up_a0', 'type_clearance_a0', 'type_bad_touch_a0', 'type_non_action_a0', 'type_dribble_a0', 'type_goalkick_a0', 'type_receival_a0', 'type_out_a0', 'type_offside_a0', 'type_goal_a0', 'type_owngoal_a0', 'type_yellow_card_a0', 'type_red_card_a0', 'type_corner_a0', 'type_freekick_a0', 'type_pass_a1', 'type_cross_a1', 'type_throw_in_a1', 'type_freekick_crossed_a1', 'type_freekick_short_a1', 'type_corner_crossed_a1', 'type_corner_short_a1', 'type_take_on_a1', 'type_foul_a1', 'type_tackle_a1', 'type_interception_a1', 'type_shot_a1', 'type_shot_penalty_a1', 'type_shot_freekick_a1', 'type_keeper_save_a1', 'typ

In [6]:
%%time
# train classifiers F(X) = Y
import xgboost

# One classifier per label column, all sharing the same small configuration.
# NOTE(review): n_jobs=-3 looks like the joblib "all cores but two" convention;
# confirm that XGBoost interprets negative values the same way.
xgb_params = {"n_estimators": 10, "max_depth": 4, "n_jobs": -3}

Y_hat = pd.DataFrame()  # filled with predicted probabilities by a later cell
models = {}
for col in Y.columns:
    print(col)
    model = xgboost.XGBClassifier(**xgb_params)
    model.fit(X, Y[col])
    models[col] = model

scores
concedes
CPU times: user 3min 18s, sys: 6.37 s, total: 3min 25s
Wall time: 3min 14s


In [7]:
# Evaluation reuses the training matrices (testgames is the same set as
# traingames), so the scores reported below are in-sample.
testX = X
testY = Y

In [8]:
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

def evaluate(y,y_hat):
    """Print Brier score, log loss and ROC AUC of predicted probabilities.

    Brier score and log loss are each printed as two numbers: the raw value,
    followed by its ratio to the value obtained by a constant baseline that
    predicts the mean of ``y`` for every row. A ratio below 1 means the
    predictions beat that baseline; a ratio above 1 means they are worse.

    Parameters
    ----------
    y : array-like of binary labels
    y_hat : array-like of predicted probabilities, same length as ``y``

    Returns
    -------
    None; the metrics are written to stdout.
    """
    # Vectorised mean instead of Python-level sum() over every element.
    labels = pd.Series(y)
    p = labels.mean()
    base = [p] * len(labels)

    brier = brier_score_loss(y, y_hat)
    brier_ratio = brier / brier_score_loss(y, base)
    print(f"  Brier score: {brier:.5f} {brier_ratio:.5f}")

    ll = log_loss(y, y_hat)
    ll_ratio = ll / log_loss(y, base)
    print(f"  log loss score: {ll:.5f} {ll_ratio:.5f}")

    print(f"  ROC AUC: {roc_auc_score(y, y_hat):.5f}")

# Score the evaluation rows with each fitted classifier and report its metrics.
for col in testY.columns:
    # predict_proba yields one column per class; column 1 is used as the
    # probability of the positive label. It is sliced in a single vectorised
    # step rather than looping over every row in Python. The float64 cast
    # keeps the stored predictions independent of the model's output dtype
    # (presumably float32 for XGBoost - confirm).
    Y_hat[col] = models[col].predict_proba(testX)[:, 1].astype("float64")
    print(f"Y: {col}")
    evaluate(testY[col], Y_hat[col])

Y: scores
  Brier score: 0.00836 0.87953
  log loss score: 0.05358 0.98944
  ROC AUC: 0.88104
Y: concedes
  Brier score: 0.00141 1.31650
  log loss score: 0.02653 3.15931
  ROC AUC: 0.85989


### Save predictions

In [10]:
# get rows with game id per action
A = []
for game_id in tqdm(testgames.game_id, desc="loading game ids"):
    Ai = pd.read_hdf(spadl_h5, f"atomic_actions/game_{game_id}")
    A.append(Ai[["game_id"]])
A = pd.concat(A)
A = A.reset_index(drop=True)

# Predictions are attached to actions purely by row position, so both frames
# must have the same number of rows; otherwise the column-wise concat below
# would silently pad the shorter one with NaN.
assert len(A) == len(Y_hat), (
    f"{len(A)} actions but {len(Y_hat)} predictions: cannot align them by position"
)

# concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
for k, df in tqdm(grouped_predictions, desc="saving predictions per game"):
    df = df.reset_index(drop=True)
    # `key` is passed by keyword: positional use is deprecated since pandas 2.1
    # and no longer accepted once the argument becomes keyword-only.
    df[Y_hat.columns].to_hdf(predictions_h5, key=f"game_{int(k)}")

HBox(children=(FloatProgress(value=0.0, description='loading game ids', max=452.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='saving predictions per game', max=452.0, style=ProgressSt…


