In [43]:
import requests
import pandas as pd
import json
import numpy as np
import os
from datetime import timedelta


# Setup pitch and plot
from mplsoccer.pitch import Pitch ,VerticalPitch

# username = "XXX"
# password = "XXX"


# from skillcorner.client import SkillcornerClient
# client=SkillcornerClient(username=username,password=password)

def time_to_seconds(time_str):
    if time_str is None:
        return 90 * 60  # 120 minutes = 7200 seconds
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s


In [2]:
####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------

matches_json_path = os.path.join(os.getcwd(), "data/matches.json")

with open(matches_json_path, "r") as f:
    matches_json = json.load(f)

match_id = matches_json[0]["id"]

# # Construct the raw GitHub URL
tracking_data_github_url=f'https://media.githubusercontent.com/media/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_tracking_extrapolated.jsonl' # Data is stored using GitLFS
raw_data=pd.read_json(tracking_data_github_url,lines=True)


raw_df = pd.json_normalize(
    raw_data.to_dict("records"),
    "player_data",
    ["frame", "timestamp", "period", "possession", "ball_data"],
)

# Extract 'player_id' and 'group from the 'possession' dictionary
raw_df["possession_player_id"] = raw_df["possession"].apply(
    lambda x: x.get("player_id")
)
raw_df["possession_group"] = raw_df["possession"].apply(lambda x: x.get("group"))


# (Optional) Expand the ball_data with json_normalize
raw_df[["ball_x", "ball_y", "ball_z", "is_detected_ball"]] = pd.json_normalize(
    raw_df.ball_data
)


# (Optional) Drop the original 'possession' column if you no longer need it
raw_df = raw_df.drop(columns=["possession", "ball_data"])

# Add the match_id identifier to your dataframe
raw_df["match_id"] = match_id
tracking_df = raw_df.copy()
tracking_df.head()

Unnamed: 0,x,y,player_id,is_detected,frame,timestamp,period,possession_player_id,possession_group,ball_x,ball_y,ball_z,is_detected_ball,match_id
0,-38.16,1.51,51678,False,2510,2025-10-27,1.0,,,-0.46,-0.12,0.14,True,2017461
1,-20.78,3.31,51013,True,2510,2025-10-27,1.0,,,-0.46,-0.12,0.14,True,2017461
2,-20.93,14.81,51685,True,2510,2025-10-27,1.0,,,-0.46,-0.12,0.14,True,2017461
3,-21.03,-8.49,800322,True,2510,2025-10-27,1.0,,,-0.46,-0.12,0.14,True,2017461
4,-8.6,23.72,811820,True,2510,2025-10-27,1.0,,,-0.46,-0.12,0.14,True,2017461


In [28]:

####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------
# match_id=1886347
meta_data_github_url=f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json'
# # Read the JSON data as a JSON object
response = requests.get(meta_data_github_url)
raw_match_data = response.json()


# The output has nested json elements. We process them
raw_match_df = pd.json_normalize(raw_match_data, max_level=2)
raw_match_df["home_team_side"] = raw_match_df["home_team_side"].astype(str)

players_df = pd.json_normalize(
    raw_match_df.to_dict("records"),
    record_path="players",
    meta=[
        "home_team_score",
        "away_team_score",
        "date_time",
        "home_team_side",
        "home_team.name",
        "home_team.id",
        "away_team.name",
        "away_team.id",
    ],  # data we keep
)


# Take only players who played and create their total time
players_df = players_df[
    ~((players_df.start_time.isna()) & (players_df.end_time.isna()))
]
players_df["total_time"] = players_df["end_time"].apply(time_to_seconds) - players_df[
    "start_time"
].apply(time_to_seconds)

# Create a flag for GK
players_df["is_gk"] = players_df["player_role.acronym"] == "GK"

# Add a flag if the given player is home or away
players_df["match_name"] = (
    players_df["home_team.name"] + " vs " + players_df["away_team.name"]
)


# Add a flag if the given player is home or away
players_df["home_away_player"] = np.where(
    players_df.team_id == players_df["home_team.id"], "Home", "Away"
)

# Create flag from player
players_df["team_name"] = np.where(
    players_df.team_id == players_df["home_team.id"],
    players_df["home_team.name"],
    players_df["away_team.name"],
)

# Figure out sides
players_df[["home_team_side_1st_half", "home_team_side_2nd_half"]] = (
    players_df["home_team_side"]
    .astype(str)
    .str.strip("[]")
    .str.replace("'", "")
    .str.split(", ", expand=True)
)
# Clean up sides
players_df["direction_player_1st_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_1st_half,
    players_df.home_team_side_2nd_half,
)
players_df["direction_player_2nd_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_2nd_half,
    players_df.home_team_side_1st_half,
)

print("Columns in players data:")
print(players_df.columns.tolist())
# Clean up and keep the columns that we want to keep about

columns_to_keep = [
    "start_time",
    "end_time",
    "match_name",
    "date_time",
    "home_team.name",
    "away_team.name",
    "id",
    "short_name",
    "number",
    "team_id",
    "team_name",
    "player_role.position_group",
    "total_time",
    "player_role.name",
    "player_role.acronym",
    "is_gk",
    "direction_player_1st_half",
    "direction_player_2nd_half",
    "playing_time.total.minutes_played",
]
players_df = players_df[columns_to_keep]
players_df.head()
players_df.shape

Columns in players data:
['start_time', 'end_time', 'number', 'yellow_card', 'red_card', 'injured', 'goal', 'own_goal', 'team_player_id', 'team_id', 'id', 'first_name', 'last_name', 'short_name', 'birthday', 'trackable_object', 'gender', 'player_role.id', 'player_role.position_group', 'player_role.name', 'player_role.acronym', 'playing_time.total.minutes_tip', 'playing_time.total.minutes_otip', 'playing_time.total.start_frame', 'playing_time.total.end_frame', 'playing_time.total.minutes_played', 'playing_time.total.minutes_played_regular_time', 'playing_time.by_period', 'playing_time.total', 'home_team_score', 'away_team_score', 'date_time', 'home_team_side', 'home_team.name', 'home_team.id', 'away_team.name', 'away_team.id', 'total_time', 'is_gk', 'match_name', 'home_away_player', 'team_name', 'home_team_side_1st_half', 'home_team_side_2nd_half', 'direction_player_1st_half', 'direction_player_2nd_half']


(32, 19)

In [33]:
# Merging datasets
enriched_tracking_data = tracking_df.merge(
    players_df, left_on=["player_id"], right_on=["id"]
)
enriched_tracking_data.head()

Unnamed: 0,x,y,player_id,is_detected,frame,timestamp,period,possession_player_id,possession_group,ball_x,...,team_id,team_name,player_role.position_group,total_time,player_role.name,player_role.acronym,is_gk,direction_player_1st_half,direction_player_2nd_half,playing_time.total.minutes_played
0,-38.16,1.51,51678,False,2510,2025-10-27,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Other,5400,Goalkeeper,GK,True,left_to_right,right_to_left,97.42
1,-20.78,3.31,51013,True,2510,2025-10-27,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Central Defender,3469,Right Center Back,RCB,False,left_to_right,right_to_left,58.98
2,-20.93,14.81,51685,True,2510,2025-10-27,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Central Defender,5400,Left Center Back,LCB,False,left_to_right,right_to_left,97.42
3,-21.03,-8.49,800322,True,2510,2025-10-27,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Full Back,5400,Right Back,RB,False,left_to_right,right_to_left,97.42
4,-8.6,23.72,811820,True,2510,2025-10-27,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Full Back,5400,Left Back,LB,False,left_to_right,right_to_left,97.42


In [5]:
# match_id = 1886347
url = f"https://raw.githubusercontent.com/SkillCorner/opendata/master/data/matches/{match_id}/{match_id}_dynamic_events.csv"
de_match = pd.read_csv(url)

# Print column names, head, and shape
print(f"Columns in dynamic events data: {de_match.columns.tolist()}")
print(f"Shape of dynamic events data: {de_match.shape}")

Columns in dynamic events data: ['event_id', 'index', 'match_id', 'frame_start', 'frame_end', 'frame_physical_start', 'time_start', 'time_end', 'minute_start', 'second_start', 'duration', 'period', 'attacking_side_id', 'attacking_side', 'event_type_id', 'event_type', 'event_subtype_id', 'event_subtype', 'player_id', 'player_name', 'player_position_id', 'player_position', 'player_in_possession_id', 'player_in_possession_name', 'player_in_possession_position_id', 'player_in_possession_position', 'team_id', 'team_shortname', 'x_start', 'y_start', 'channel_id_start', 'channel_start', 'third_id_start', 'third_start', 'penalty_area_start', 'x_end', 'y_end', 'channel_id_end', 'channel_end', 'third_id_end', 'third_end', 'penalty_area_end', 'associated_player_possession_event_id', 'associated_player_possession_frame_start', 'associated_player_possession_frame_end', 'associated_player_possession_end_type_id', 'associated_player_possession_end_type', 'associated_off_ball_run_event_id', 'associate

  de_match = pd.read_csv(url)


In [36]:
# ------------- OFF-BALL EVENT ANALYSIS ------------------
# How many were targeted and how many were received?

# Get the total minutes played in the match
total_minutes_played = enriched_tracking_data["playing_time.total.minutes_played"].max()
print(f"Total minutes played in the match: {total_minutes_played}")

# Safeguard and compute scaling factor to per 90 minutes
if pd.isna(total_minutes_played) or total_minutes_played <= 0:
    scale_per90 = 1.0  # avoid division by zero; fallback to raw counts
    print("Warning: total_minutes_played is not positive; per90 scaling disabled.")
else:
    scale_per90 = 90.0 / total_minutes_played

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]
print(f"Number of off-ball events: {off_ball_events.shape[0]}")

# Group by team, player_position and event_subtype
off_ball_event_groups = off_ball_events.groupby(["team_shortname","player_position","event_subtype"])

# For each team and subtype
off_ball_event_stats = []
for (team_shortname, player_position, subtype), group in off_ball_event_groups:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    targeted_proportion = targeted_count / total_count if total_count > 0 else 0
    received_proportion = received_count / total_count if total_count > 0 else 0
    # per90 scaled values
    targeted_per90 = targeted_count * scale_per90
    received_per90 = received_count * scale_per90
    total_per90 = total_count * scale_per90
    print(f"Team {team_shortname} - Position: {player_position} - Subtype: {subtype}, Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion:.2%})")
    # Save these values for later use
    off_ball_event_stats.append({
        "team_shortname": team_shortname,
        "player_position": player_position,
        "subtype": subtype,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion,
        "received_proportion": received_proportion
    })

# Convert to DataFrame for easier handling
off_ball_event_stats_df = pd.DataFrame(off_ball_event_stats)

# Create a pivoted view (team x subtype) for per90 metrics
off_ball_pivot = off_ball_event_stats_df.pivot_table(index=["team_shortname","player_position","subtype"], values=["total_per90", "targeted_per90", "received_per90", "total", "targeted", "received"], aggfunc="sum").reset_index()
print("\nSample of per-team, per-playerposition, per-subtype off-ball metrics (per90 + counts):")
print(off_ball_pivot.head())

# Also calculate overall proportions (match-level) and per90 totals
total_off_ball_events = off_ball_events.shape[0]
total_targeted = off_ball_events["targeted"].sum()
total_received = off_ball_events["received"].sum()
overall_targeted_proportion = total_targeted / total_off_ball_events if total_off_ball_events > 0 else 0
overall_received_proportion = total_received / total_off_ball_events if total_off_ball_events > 0 else 0
total_off_ball_events_per90 = total_off_ball_events * scale_per90
total_targeted_per90 = total_targeted * scale_per90
total_received_per90 = total_received * scale_per90
print(f"\nOverall Off-Ball Events: Total: {total_off_ball_events} ({total_off_ball_events_per90:.2f}/90min), Targeted: {total_targeted} ({total_targeted_per90:.2f}/90min, {overall_targeted_proportion:.2%}), Received: {total_received} ({total_received_per90:.2f}/90min, {overall_received_proportion:.2%})")

Total minutes played in the match: 97.42
Number of off-ball events: 460
Team Auckland FC - Position: CF - Subtype: behind, Total: 10 (9.24/90min), Targeted: 7 (6.47/90min, 70.00%), Received: 4 (3.70/90min, 40.00%)
Team Auckland FC - Position: CF - Subtype: coming_short, Total: 6 (5.54/90min), Targeted: 2 (1.85/90min, 33.33%), Received: 2 (1.85/90min, 33.33%)
Team Auckland FC - Position: CF - Subtype: cross_receiver, Total: 16 (14.78/90min), Targeted: 5 (4.62/90min, 31.25%), Received: 2 (1.85/90min, 12.50%)
Team Auckland FC - Position: CF - Subtype: overlap, Total: 1 (0.92/90min), Targeted: True (0.92/90min, 100.00%), Received: True (0.92/90min, 100.00%)
Team Auckland FC - Position: CF - Subtype: pulling_half_space, Total: 2 (1.85/90min), Targeted: 1 (0.92/90min, 50.00%), Received: 0 (0.00/90min, 0.00%)
Team Auckland FC - Position: CF - Subtype: run_ahead_of_the_ball, Total: 22 (20.32/90min), Targeted: 9 (8.31/90min, 40.91%), Received: 7 (6.47/90min, 31.82%)
Team Auckland FC - Position:

In [38]:
# Now analyze by phases of play

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]

# Group by team_shortname, team_in_possession_phase_type and event_subtype
off_ball_event_groups = off_ball_events.groupby(["team_shortname", "team_in_possession_phase_type", "event_subtype"])

# For each group, calculate relevant statistics using also per90 scaling
off_ball_phase_stats = []
for (team_shortname, phase_type, subtype), group in off_ball_event_groups:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    targeted_proportion = targeted_count / total_count if total_count > 0 else 0
    received_proportion = received_count / total_count if total_count > 0 else 0
    # per90 scaled values
    targeted_per90 = targeted_count * scale_per90
    received_per90 = received_count * scale_per90
    total_per90 = total_count * scale_per90
    print(f"Team {team_shortname} - Phase: {phase_type} - Subtype: {subtype}, Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion:.2%})")
    # Save these values for later use
    off_ball_phase_stats.append({
        "team_shortname": team_shortname,
        "phase_type": phase_type,
        "subtype": subtype,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion,
        "received_proportion": received_proportion
    })

# Convert to DataFrame for easier handling
off_ball_phase_stats_df = pd.DataFrame(off_ball_phase_stats)

Team Auckland FC - Phase: build_up - Subtype: coming_short, Total: 1 (0.92/90min), Targeted: False (0.00/90min, 0.00%), Received: False (0.00/90min, 0.00%)
Team Auckland FC - Phase: build_up - Subtype: dropping_off, Total: 3 (2.77/90min), Targeted: 0 (0.00/90min, 0.00%), Received: 0 (0.00/90min, 0.00%)
Team Auckland FC - Phase: build_up - Subtype: pulling_wide, Total: 2 (1.85/90min), Targeted: 1 (0.92/90min, 50.00%), Received: 1 (0.92/90min, 50.00%)
Team Auckland FC - Phase: build_up - Subtype: run_ahead_of_the_ball, Total: 2 (1.85/90min), Targeted: 2 (1.85/90min, 100.00%), Received: 2 (1.85/90min, 100.00%)
Team Auckland FC - Phase: build_up - Subtype: support, Total: 1 (0.92/90min), Targeted: True (0.92/90min, 100.00%), Received: True (0.92/90min, 100.00%)
Team Auckland FC - Phase: chaotic - Subtype: behind, Total: 1 (0.92/90min), Targeted: True (0.92/90min, 100.00%), Received: False (0.00/90min, 0.00%)
Team Auckland FC - Phase: chaotic - Subtype: coming_short, Total: 7 (6.47/90min), 

In [61]:
# Identify number of untargeted off-ball runs where the phase of play for the team in possession changed within 5 seconds after the off-ball run event.

from anyio import current_time


def safe_to_timedelta(x):
    """Convert time strings like '00:01.9' or '12:34.567' into Timedelta safely."""
    if isinstance(x, pd.Series):
        # Apply recursively to each element
        return x.apply(safe_to_timedelta)
    if isinstance(x, str) and x.count(":") == 1:
        x = "00:" + x  # prepend hours if missing
    return pd.to_timedelta(x, errors="coerce")

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]

# Get untargeted off-ball events
untargeted_off_ball_events = off_ball_events[off_ball_events["targeted"] == 0].copy()

# Identify number of untargeted off-ball runs where the phase of play for the team in possession changed within 5 seconds after the off-ball run event.
def identify_phase_changes(untargeted_events):
    # Create columns to store previous and new phase and a flag that indicates a phase change
    untargeted_events["previous_phase"] = None
    untargeted_events["new_phase"] = None
    untargeted_events["phase_change"] = False
    phase_changes = 0
    for i, event in untargeted_events.iterrows():
        # Get the time the event starts, recorded to the millisecond
        start_time = safe_to_timedelta(event["time_start"]).total_seconds()
        end_time = safe_to_timedelta(event["time_end"]).total_seconds()
        # Get the team in possession
        team_in_possession = event["team_shortname"]
        # Check for subsequent events within 5 seconds
        subsequent_events = de_match[
            (safe_to_timedelta(de_match["time_start"]).dt.total_seconds() > start_time)
            & ((safe_to_timedelta(de_match["time_start"]).dt.total_seconds() <= end_time + 2) |
               (safe_to_timedelta(de_match["time_start"]).dt.total_seconds() <= start_time + 5))
            & (de_match["team_shortname"] == team_in_possession)]
        # If there are subsequent events, check if the phase has changed. There might be multiple events, consider any of them.
        for j, sub_event in subsequent_events.iterrows():
            if sub_event["team_in_possession_phase_type"] != event["team_in_possession_phase_type"]:
                phase_changes += 1
                # Save the previous phase and the new phase in new columns
                untargeted_events.loc[i, "phase_change"] = True
                untargeted_events.loc[i, "previous_phase"] = event["team_in_possession_phase_type"]
                untargeted_events.loc[i, "new_phase"] = sub_event["team_in_possession_phase_type"]
                print(f"Phase change detected for team {team_in_possession} from {event['team_in_possession_phase_type']} to {sub_event['team_in_possession_phase_type']} within 2 seconds after the end of the off-ball event.")
                break  # No need to check further subsequent events for this off-ball event
    return phase_changes

# Apply the function to the untargeted off-ball events
phase_changes_count = identify_phase_changes(untargeted_off_ball_events)

# Print the total number of phase changes detected and the proportion of untargeted off-ball runs leading to phase changes
print(f"Total number of untargeted off-ball runs leading to phase changes within 2 seconds after the end of the event: {phase_changes_count}")
print(f"Proportion of untargeted off-ball runs leading to phase changes: {phase_changes_count / len(untargeted_off_ball_events) if len(untargeted_off_ball_events) > 0 else 0:.2%}")

Phase change detected for team Melbourne V FC from direct to finish within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from build_up to direct within 2 seconds after the end of the off-ball