In [None]:
import os
import json
from collections import Counter

# Root of the training split; every directory beneath it is scanned
data_dir = "/content/SoccerNet/SN-BAS-2025/train"

# Running tally of ball-action labels over all annotation files found
all_ball_actions = Counter()

# Walk the tree and pick up each directory's Labels-ball.json, when present
for root, dirs, files in os.walk(data_dir):
    if "Labels-ball.json" not in files:
        continue

    json_file_path = os.path.join(root, "Labels-ball.json")
    if not os.path.isfile(json_file_path):
        continue

    with open(json_file_path, "r") as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            # A malformed file is reported and skipped; the scan carries on
            print(f"Error decoding JSON in file: {json_file_path}")
            continue

        # Expected layout: {"annotations": [{"label": "pass"}, ...]}
        # (adapt the two lookups below if the real files are shaped differently)
        labels = (entry.get("label") for entry in data.get("annotations", []))
        # Entries with a missing or empty label are not counted
        all_ball_actions.update(label for label in labels if label)

# Summary: number of distinct labels, then one "label: count" line each
print("Unique Ball Actions Found:", len(all_ball_actions))
print("Ball Action Distribution:")
for action in all_ball_actions:
    print(f"{action}: {all_ball_actions[action]}")

Unique Ball Actions Found: 12
Ball Action Distribution:
PASS: 4400
DRIVE: 3746
HIGH PASS: 646
SHOT: 144
OUT: 476
HEADER: 586
BALL PLAYER BLOCK: 195
PLAYER SUCCESSFUL TACKLE: 62
THROW IN: 308
CROSS: 237
GOAL: 12
FREE KICK: 17


In [None]:
import os
import json
from collections import Counter
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

data_dir = "/content/SoccerNet/SN-BAS-2025/"


def load_ball_actions(root_dir, filename="Labels-ball.json"):
    """Collect every ball-action label stored below ``root_dir`` as one flat list.

    Directories and files are visited in sorted order, so the returned sequence
    is the same on every run and every filesystem (plain ``os.walk`` order is
    whatever the OS happens to return). Within a file, labels keep the order of
    the ``"annotations"`` list.

    Args:
        root_dir: Directory to scan recursively.
        filename: Suffix an annotation file name must end with.

    Returns:
        list: Non-empty ``"label"`` values from all matching files. Files that
        are not valid JSON are reported on stdout and skipped.
    """
    labels = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend into sub-folders in a stable order
        dirs.sort()
        for f in sorted(files):
            if not f.endswith(filename):
                continue
            json_file_path = os.path.join(root, f)
            try:
                with open(json_file_path, "r", encoding="utf-8") as file:
                    data = json.load(file)
            except json.JSONDecodeError:
                # One corrupt file should not abort the whole load
                print(f"Error decoding JSON in file: {json_file_path}")
                continue
            for action in data.get("annotations", []):
                label = action.get("label")
                if label:
                    labels.append(label)
    return labels


ball_actions = load_ball_actions(data_dir)

print("Total actions loaded:", len(ball_actions))
print("First 10 actions:", ball_actions[:10])

Total actions loaded: 12433
First 10 actions: ['PASS', 'DRIVE', 'PASS', 'DRIVE', 'HIGH PASS', 'DRIVE', 'PASS', 'DRIVE', 'PASS', 'DRIVE']


In [None]:
# Turn the action strings into integer ids
le = LabelEncoder()
encoded_actions = le.fit_transform(ball_actions)
num_classes = len(le.classes_)

# Sliding windows: seq_length consecutive actions are the input,
# the action immediately after them is the prediction target.
# NOTE(review): ball_actions is a single flat list, so a window can straddle
# two annotation files — confirm that this is intended.
seq_length = 5
window_starts = range(len(encoded_actions) - seq_length)
X = [encoded_actions[start:start + seq_length] for start in window_starts]
y = [encoded_actions[start + seq_length] for start in window_starts]

# Inputs stay as integer ids; targets become one-hot rows
X = np.array(X, dtype='int32')
y = to_categorical(y, num_classes=num_classes)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Num classes:", num_classes)

X shape: (12428, 5)
y shape: (12428, 12)
Num classes: 12


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

import os

# On-disk cache of the (X, y) arrays
dataset_path = "actions_dataset.npz"

if os.path.exists(dataset_path):
    # The context manager closes the archive once both arrays have been read
    with np.load(dataset_path) as data:
        X, y = data["X"], data["y"]
else:
    # Clean run: nothing cached yet, so keep the X / y already in memory and
    # write the cache. Previously this cell failed with FileNotFoundError here
    # because the file is only written by a later cell. If X / y have not been
    # built either, this raises NameError.
    np.savez(dataset_path, X=X, y=y)

In [None]:
## Transformer: data split and size constants
# Hold out 20 % of the samples, then cut that hold-out in half,
# giving an 80 / 10 / 10 train / validation / test partition.
# Both splits are stratified on the target rows and use the same seed.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, stratify=y_tmp, test_size=0.5, random_state=42
)

# Sizes the model needs: window length, number of target columns,
# and one more than the largest token id that occurs in X
seq_len, num_classes = X.shape[1], y.shape[1]
vocab_size = 1 + int(np.max(X))

print(f"seq_len={seq_len}, vocab_size={vocab_size}, num_classes={num_classes}")

# Input layer of the Transformer: token embedding + learned position embedding
class PositionalEmbedding(layers.Layer):
    """Embed integer tokens and add a learned embedding of their position.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of both embeddings (and of the layer output).
        max_len: Number of rows in the position table. Positions are
            ``0 .. T-1``, so inputs longer than ``max_len`` have no row to
            look up.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable``), forwarded to the base class. Accepting them also
            lets ``from_config`` rebuild the layer when a saved model is loaded.
    """

    def __init__(self, vocab_size, d_model, max_len=512, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_len = max_len
        self.token_emb = layers.Embedding(vocab_size, d_model)
        self.pos_emb = layers.Embedding(max_len, d_model)

    def call(self, x):
        """Return token + position embeddings for ``x`` of shape (B, T), int32."""
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        pos_embeddings = self.pos_emb(positions)[None, ...]          # (1,T,D)
        tok_embeddings = self.token_emb(x)                           # (B,T,D)
        # The leading axis of size 1 broadcasts the position rows over the batch
        return tok_embeddings + pos_embeddings

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "vocab_size": self.vocab_size,
                "d_model": self.d_model,
                "max_len": self.max_len,
            }
        )
        return config

seq_len=5, vocab_size=12, num_classes=12


In [None]:
import numpy as np

# Write the current X and y arrays into one .npz archive under the keys "X" and "y"
dataset_arrays = {"X": X, "y": y}
np.savez("actions_dataset.npz", **dataset_arrays)

print("✅ Dataset saved to actions_dataset.npz")

✅ Dataset saved to actions_dataset.npz


In [None]:
def transformer_block(x, n_heads=4, d_model=128, d_ff=256, dropout=0.15, name=None):
    """Apply one pre-norm Transformer block (causal self-attention + feed-forward).

    Args:
        x: Input Keras tensor. Its last dimension is expected to equal
            ``d_model``, because the feed-forward output (width ``d_model``)
            is added back onto it.
        n_heads: Number of attention heads.
        d_model: Model width; each head uses ``d_model // n_heads`` key dims.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate used inside attention, inside the feed-forward
            sub-layer and on the block output.
        name: Optional prefix for the layer names (``f"{name}_ln1"`` etc.).
            With ``None`` the layers get Keras' auto-generated unique names,
            so the block can be stacked repeatedly without a prefix.

    Returns:
        The transformed tensor.
    """

    def layer_name(suffix):
        # The original f-string produced the literal "None_<suffix>" when no
        # prefix was given, which collides as soon as two unnamed blocks exist.
        return None if name is None else f"{name}_{suffix}"

    # Pre-norm self-attention with a causal mask, then residual add
    attn_input = layers.LayerNormalization(epsilon=1e-6, name=layer_name("ln1"))(x)
    attn_out = layers.MultiHeadAttention(
        num_heads=n_heads,
        key_dim=d_model // n_heads,
        dropout=dropout,
        name=layer_name("mha"),
    )(attn_input, attn_input, use_causal_mask=True)
    x = layers.Add(name=layer_name("add1"))([x, attn_out])

    # Pre-norm position-wise feed-forward network, then residual add
    ffn_input = layers.LayerNormalization(epsilon=1e-6, name=layer_name("ln2"))(x)
    ffn = layers.Dense(d_ff, activation="gelu", name=layer_name("ff1"))(ffn_input)
    ffn = layers.Dropout(dropout, name=layer_name("drop1"))(ffn)
    ffn = layers.Dense(d_model, name=layer_name("ff2"))(ffn)
    x = layers.Add(name=layer_name("add2"))([x, ffn])

    # NOTE(review): this dropout acts on the residual stream itself (after the
    # add), not only on the sub-layer output — kept as-is; confirm it is intended.
    x = layers.Dropout(dropout, name=layer_name("drop2"))(x)
    return x


# Hyper-parameters of the next-action Transformer
d_model, n_heads, n_layers = 128, 8, 4
d_ff = 4 * d_model
dropout = 0.15

# Integer token ids in, embedded (token + position) vectors out
inputs = layers.Input(shape=(seq_len,), dtype="int32")
x = PositionalEmbedding(
    vocab_size=vocab_size,
    d_model=d_model,
    max_len=max(512, seq_len),
)(inputs)

# Stack of identical blocks, named blk1 .. blk<n_layers>
block_kwargs = dict(n_heads=n_heads, d_model=d_model, d_ff=d_ff, dropout=dropout)
for block_no in range(1, n_layers + 1):
    x = transformer_block(x, name=f"blk{block_no}", **block_kwargs)

# Classification head: normalise, keep only the final time step,
# apply dropout, then a softmax over the action classes
x = layers.LayerNormalization(epsilon=1e-6, name="final_ln")(x)
x_last = layers.Lambda(lambda t: t[:, -1, :], name="take_last")(x)
x_last = layers.Dropout(dropout, name="head_drop")(x_last)
outputs = layers.Dense(num_classes, activation="softmax", name="classifier")(x_last)

model = models.Model(inputs, outputs, name="NextActionTransformer")
model.summary()

In [None]:
# Training setup: Adam, label-smoothed cross-entropy on one-hot targets,
# and top-1 / top-3 accuracy as the reported metrics
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05)

metrics = ["accuracy"]
# Logged as "top3_acc"; the validation value appears as "val_top3_acc"
metrics.append(TopKCategoricalAccuracy(k=3, name="top3_acc"))

model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# All three callbacks watch validation top-3 accuracy, where higher is better,
# so each one states mode="max" explicitly instead of relying on name inference.
callbacks = [
    # Halve the learning rate after 2 epochs without improvement (floor 1e-5)
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_top3_acc", mode="max", factor=0.5, patience=2, min_lr=1e-5, verbose=1
    ),
    # Stop after 6 stagnant epochs and roll back to the best weights seen
    tf.keras.callbacks.EarlyStopping(
        monitor="val_top3_acc", patience=6, mode="max", restore_best_weights=True, verbose=1
    ),
    # Keep only the best model on disk
    tf.keras.callbacks.ModelCheckpoint(
        "transformer_best.keras", monitor="val_top3_acc", mode="max",
        save_best_only=True, verbose=1
    ),
]

# Handle class imbalance: "balanced" weights computed from the training
# targets (one-hot rows reduced to class ids), keyed by class id.
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_classes),
    y=np.argmax(y_train, axis=1)
)
class_weights = {class_id: float(weight) for class_id, weight in enumerate(balanced_weights)}

# The weights were previously computed but never handed to fit(), so they had
# no effect on training; they are now applied to the training loss.
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=40,
    batch_size=128,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/40
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step - accuracy: 0.5105 - loss: 1.7900 - top3_acc: 0.7683
Epoch 1: val_top3_acc improved from -inf to 0.89622, saving model to transformer_best.keras
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 273ms/step - accuracy: 0.5112 - loss: 1.7873 - top3_acc: 0.7690 - val_accuracy: 0.6420 - val_loss: 1.3220 - val_top3_acc: 0.8962 - learning_rate: 3.0000e-04
Epoch 2/40
[1m77/78[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.6233 - loss: 1.3690 - top3_acc: 0.8699
Epoch 2: val_top3_acc improved from 0.89622 to 0.90587, saving model to transformer_best.keras
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.6234 - loss: 1.3689 - top3_acc: 0.8701 - val_accuracy: 0.6597 - val_loss: 1.2555 - val_top3_acc: 0.9059 - learning_rate: 3.0000e-04
Epoch 3/40
[1m76/78[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accurac

In [None]:
from sklearn.metrics import f1_score, classification_report
import numpy as np

# True class ids (from the one-hot test targets) and predicted probabilities
y_true = np.argmax(y_test, axis=1)
probs = model.predict(X_test)
y_pred_top1 = np.argmax(probs, axis=1)

# Top-1: fraction of samples whose most probable class is the true one
top1_acc = (y_pred_top1 == y_true).mean()
print("Top-1 Accuracy:", round(top1_acc, 4))

# Top-3: fraction of samples whose true class is among the three most
# probable ones (the last three columns of the ascending argsort)
top3_pred = np.argsort(probs, axis=1)[:, -3:]
top3_hits = np.any(top3_pred == y_true[:, None], axis=1)
top3_acc = np.mean(top3_hits)
print("Top-3 Accuracy:", round(top3_acc, 4))

# F1 averaged over classes, weighted by each class's support
f1 = f1_score(y_true, y_pred_top1, average="weighted")
print("Weighted F1-score:", round(f1, 4))



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step
Top-1 Accuracy: 0.6613
Top-3 Accuracy: 0.9067
Weighted F1-score: 0.6254

Classification Report (Top-1):


NameError: name 'le' is not defined