In [10]:
import os
import json
from collections import Counter

# Root folder that is searched recursively for annotation files
data_dir = "/content/SoccerNet/SN-BAS-2025/train"

# Running tally: label -> number of occurrences across every file found
all_ball_actions = Counter()

# Visit every directory below data_dir and tally the labels of each
# Labels-ball.json that can be parsed
for root, dirs, files in os.walk(data_dir):
    # Directories without an annotation file contribute nothing
    if "Labels-ball.json" not in files:
        continue

    json_file_path = os.path.join(root, "Labels-ball.json")
    if not os.path.isfile(json_file_path):
        continue

    with open(json_file_path, "r") as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            # A malformed file is reported and skipped; the scan keeps going
            print(f"Error decoding JSON in file: {json_file_path}")
            continue

    # Expected layout: {"annotations": [{"label": "pass"}, ...]}
    # Entries with a missing or empty label are ignored
    found_labels = (entry.get("label") for entry in data.get("annotations", []))
    all_ball_actions.update(name for name in found_labels if name)

# Summary: number of distinct labels, then one "label: count" line per label
print("Unique Ball Actions Found:", len(all_ball_actions))
print("Ball Action Distribution:")
for action_name, occurrences in all_ball_actions.items():
    print(f"{action_name}: {occurrences}")

Unique Ball Actions Found: 12
Ball Action Distribution:
PASS: 4400
DRIVE: 3746
HIGH PASS: 646
OUT: 476
THROW IN: 308
HEADER: 586
CROSS: 237
BALL PLAYER BLOCK: 195
SHOT: 144
PLAYER SUCCESSFUL TACKLE: 62
FREE KICK: 17
GOAL: 12


In [11]:
import os
import json
from collections import Counter
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Root folder that is searched recursively for annotation files.
# NOTE(review): this is one level above the ".../train" folder scanned by the
# label-count cell, so files from every sub-folder found here are pooled into
# one list -- confirm that is intended before using the result for training.
data_dir = "/content/SoccerNet/SN-BAS-2025/"


def load_ball_actions(root_dir):
    """Collect ball-action labels from every ``*Labels-ball.json`` under ``root_dir``.

    Directories and files are visited in sorted order, so the returned list is
    the same on every run and every filesystem. That matters because the order
    of this list defines the sequences built from it later.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A flat list of non-empty label strings, in file order and, within each
        file, in the order of its "annotations" array. A missing ``root_dir``
        yields an empty list.

    Files that are not valid JSON are reported on stdout and skipped, matching
    the error handling of the label-count cell.
    """
    labels = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place steers os.walk, which otherwise follows the
        # filesystem's arbitrary listing order.
        dirs.sort()
        for f in sorted(files):
            if not f.endswith("Labels-ball.json"):
                continue
            json_file_path = os.path.join(root, f)
            try:
                with open(json_file_path, "r", encoding="utf-8") as file:
                    data = json.load(file)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file: {json_file_path}")
                continue
            for action in data.get("annotations", []):
                label = action.get("label")
                if label:
                    labels.append(label)
    return labels


ball_actions = load_ball_actions(data_dir)

print("Total actions loaded:", len(ball_actions))
print("First 10 actions:", ball_actions[:10])

Total actions loaded: 12433
First 10 actions: ['PASS', 'DRIVE', 'HIGH PASS', 'PASS', 'DRIVE', 'PASS', 'DRIVE', 'OUT', 'THROW IN', 'HEADER']


In [12]:
# Turn the action strings into integer class ids
le = LabelEncoder()
encoded_actions = le.fit_transform(ball_actions)
num_classes = len(le.classes_)

# Slide a fixed-size window over the id stream: every run of `seq_length`
# consecutive ids is one input, and the id right after it is the target
seq_length = 5
num_windows = len(encoded_actions) - seq_length
X = [encoded_actions[start:start + seq_length] for start in range(num_windows)]
y = [encoded_actions[start + seq_length] for start in range(num_windows)]

# Inputs stay integer ids; targets become one-hot rows
X = np.array(X, dtype='int32')
y = to_categorical(y, num_classes=num_classes)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Num classes:", num_classes)

X shape: (12428, 5)
y shape: (12428, 12)
Num classes: 12


In [13]:
# Define LSTM model for action recognition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.utils import to_categorical
import numpy as np

# Local imports: names this cell needs that the import lines above do not provide
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Fix the Python, NumPy and backend seeds so weight initialisation, dropout
# and batch shuffling are repeatable between runs
set_random_seed(42)

# Reshape X to be 3D for LSTM (batch_size, timesteps, feature_dim).
# Since each encoded action is a single feature, feature_dim is 1.
# The ndim guard makes the cell safe to re-run: without it a second run
# would add yet another axis and break the model input shape.
if X.ndim == 2:
    X = np.expand_dims(X, axis=-1)

timesteps = X.shape[1]
feature_dim = X.shape[2]

# NOTE(review): the single feature is the integer class id, so the network
# sees the label-encoder ordering as a numeric scale. Embedding is imported
# earlier in the notebook but unused -- consider an Embedding or one-hot input.
model = Sequential([
    # An explicit Input layer replaces input_shape= on the LSTM, which is
    # what makes Keras emit a warning when the model is built
    Input(shape=(timesteps, feature_dim)),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(num_classes, activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# validation_split=0.1 holds out a fraction of the samples for the val_* metrics
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.1, verbose=1)

model.summary()

Epoch 1/5


  super().__init__(**kwargs)


[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.5156 - loss: 1.5686 - val_accuracy: 0.5583 - val_loss: 1.4866
Epoch 2/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5952 - loss: 1.3624 - val_accuracy: 0.5599 - val_loss: 1.4800
Epoch 3/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6024 - loss: 1.3474 - val_accuracy: 0.5623 - val_loss: 1.4483
Epoch 4/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.6015 - loss: 1.3324 - val_accuracy: 0.5607 - val_loss: 1.4462
Epoch 5/5
[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.6075 - loss: 1.3103 - val_accuracy: 0.5632 - val_loss: 1.4296


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Get predictions for all sequences
y_true = np.argmax(y, axis=1)  # ground truth labels (from one-hot to index)

# One batched call: model.predict already iterates over the samples, so
# calling it once per sequence only adds per-call overhead
probs = model.predict(X, verbose=0)
y_pred = np.argmax(probs, axis=1)

# NOTE(review): X and y are the same arrays that were passed to model.fit, so
# every figure below includes samples the model was trained on.

# 2. Compute overall accuracy
acc = accuracy_score(y_true, y_pred)
print(f"LSTM accuracy: {acc*100:.2f}%")

# Passing the full id range keeps the report rows and matrix axes aligned
# with le.classes_ even if some class never appears in y_true or y_pred
class_ids = np.arange(num_classes)

# 3. Classification report (precision, recall, f1)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids,
                            target_names=le.classes_, zero_division=0))

# 4. Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

plt.figure(figsize=(12, 8))
sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Ball Action Prediction")
plt.tight_layout()
plt.show()

In [18]:
# Pick one test sequence (last one in X for example) and add a batch axis
test_seq = np.expand_dims(X[-1], axis=0)

# Predict probabilities: one row, one column per class
pred_probs = model.predict(test_seq, verbose=0)

# Get predicted class index for the single row in the batch
pred_index = int(np.argmax(pred_probs, axis=-1)[0])

# Decode to action label
pred_label = le.inverse_transform([pred_index])[0]

# Flatten the window before decoding: after the model cell adds a trailing
# feature axis X[-1] is a column vector, and inverse_transform warns when it
# has to flatten a column vector itself
input_actions = le.inverse_transform(np.ravel(X[-1]))

print("\n--- Next Ball Action Prediction ---")
print("Input sequence (last 5 actions):", input_actions)
print("Predicted next action:", pred_label)



--- Next Ball Action Prediction ---
Input sequence (last 5 actions): ['HEADER' 'DRIVE' 'PASS' 'DRIVE' 'SHOT']
Predicted next action: DRIVE


  y = column_or_1d(y, warn=True)


In [19]:
# Random prediction
def random_baseline(num_samples, num_classes, rng=None):
    """Draw class ids uniformly at random as a no-skill baseline.

    Args:
        num_samples: Number of predictions to generate.
        num_classes: Number of classes; ids are drawn from 0..num_classes-1.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            draws. When omitted, NumPy's global random state is used, which
            is the original behaviour.

    Returns:
        1-D integer array of length ``num_samples``.
    """
    if rng is None:
        return np.random.choice(num_classes, size=num_samples)
    # default_rng accepts a seed or an existing Generator
    return np.random.default_rng(rng).integers(0, num_classes, size=num_samples)

In [20]:
# Empirical class frequencies, used as sampling weights for the weighted baseline
action_counts = Counter(encoded_actions)
total_actions = sum(action_counts.values())

# One entry per class id in 0..num_classes-1; an id that never occurs gets 0
class_frequencies = []
for class_id in range(num_classes):
    class_frequencies.append(action_counts[class_id] / total_actions)
weights = np.array(class_frequencies)

def weighted_random_baseline(num_samples, weights, rng=None):
    """Draw class ids at random with the given per-class probabilities.

    Args:
        num_samples: Number of predictions to generate.
        weights: 1-D sequence of probabilities, one per class id; NumPy
            requires them to sum to 1.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            draws. When omitted, NumPy's global random state is used, which
            is the original behaviour.

    Returns:
        1-D integer array of length ``num_samples`` with values in
        0..len(weights)-1.
    """
    if rng is None:
        return np.random.choice(len(weights), size=num_samples, p=weights)
    # default_rng accepts a seed or an existing Generator
    return np.random.default_rng(rng).choice(len(weights), size=num_samples, p=weights)

In [26]:
# Evaluate models: two random baselines against the trained LSTM
from sklearn.metrics import f1_score, top_k_accuracy_score

# Ground-truth class ids (one-hot rows back to indices)
y_true = np.argmax(y, axis=1)

# Seed NumPy's global state so the two random baselines print the same
# numbers on every run
np.random.seed(42)

# Random baseline
y_pred_random = random_baseline(len(y_true), num_classes)
print(f"Random baseline accuracy: {accuracy_score(y_true, y_pred_random)*100:.2f}%")

# Weighted random baseline
y_pred_weighted = weighted_random_baseline(len(y_true), weights)
print(f"Weighted random baseline accuracy: {accuracy_score(y_true, y_pred_weighted)*100:.2f}%")

# LSTM model predictions
# NOTE(review): X and y are the arrays the model was fitted on, so these
# scores include training samples.
y_pred_lstm = model.predict(X, verbose=0)  # class probabilities, needed for top-k
y_pred_top1 = np.argmax(y_pred_lstm, axis=1)

acc_top1 = accuracy_score(y_true, y_pred_top1)
print(f"LSTM accuracy (Top-1): {acc_top1*100:.2f}%")

acc_top3 = top_k_accuracy_score(y_true, y_pred_lstm, k=3, labels=np.arange(num_classes))
print(f"LSTM accuracy (Top-3): {acc_top3*100:.2f}%")

# Macro F1 from this cell's own predictions. Reading `y_pred` from another
# cell would score whatever that cell last produced, which may belong to a
# different model or not exist at all on a fresh kernel.
f1 = f1_score(y_true, y_pred_top1, average='macro', zero_division=0)
print(f"F1 Score : {f1 * 100:.2f}%")

Random baseline accuracy: 8.17%
Weighted random baseline accuracy: 29.35%
LSTM accuracy (Top-1): 59.98%
LSTM accuracy (Top-3): 83.28%
F1 Score : 11.45%
