In [2]:
import requests
import pandas as pd
import json
import numpy as np
import os
from datetime import timedelta


# Setup pitch and plot
from mplsoccer import Pitch
#from mplsoccer.pitch import Pitch ,VerticalPitch

# username = "XXX"
# password = "XXX"


# from skillcorner.client import SkillcornerClient
# client=SkillcornerClient(username=username,password=password)

def time_to_seconds(time_str):
    if time_str is None:
        return 90 * 60  # 120 minutes = 7200 seconds
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s


In [2]:
# CELL TO LOAD ALL MATCHES TRACKING DATA, PLAYERS DATA AND EVENTS DATA

matches_json_path = os.path.join(os.getcwd(), "data/matches.json")

with open(matches_json_path, "r") as f:
    matches_json = json.load(f)

match_ids = [match["id"] for match in matches_json]

all_tracking_dfs = []

for match_id in match_ids:
    tracking_data_github_url = f'https://media.githubusercontent.com/media/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_tracking_extrapolated.jsonl'
    
    try:
        raw_data = pd.read_json(tracking_data_github_url, lines=True)
        
        raw_df = pd.json_normalize(
            raw_data.to_dict("records"),
            "player_data",
            ["frame", "timestamp", "period", "possession", "ball_data"],
        )
        
        # Extract 'player_id' and 'group' from possession
        raw_df["possession_player_id"] = raw_df["possession"].apply(lambda x: x.get("player_id"))
        raw_df["possession_group"] = raw_df["possession"].apply(lambda x: x.get("group"))
        
        # Expand ball_data
        raw_df[["ball_x", "ball_y", "ball_z", "is_detected_ball"]] = pd.json_normalize(raw_df.ball_data)
        
        # Drop unnecessary columns
        raw_df = raw_df.drop(columns=["possession", "ball_data"])
        
        # Add match_id
        raw_df["match_id"] = match_id
        
        all_tracking_dfs.append(raw_df)
        
    except Exception as e:
        print(f"Failed to load match {match_id}: {e}")

tracking_df_all = pd.concat(all_tracking_dfs, ignore_index=True)
tracking_df_all.head()

# -------------------------------------------------------------------------------------

all_players_dfs = []

for match_id in match_ids:
    meta_data_github_url = f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json'
    
    try:
        response = requests.get(meta_data_github_url)
        raw_match_data = response.json()
        
        raw_match_df = pd.json_normalize(raw_match_data, max_level=2)
        raw_match_df["home_team_side"] = raw_match_df["home_team_side"].astype(str)
        
        players_df = pd.json_normalize(
            raw_match_df.to_dict("records"),
            record_path="players",
            meta=[
                "home_team_score",
                "away_team_score",
                "date_time",
                "home_team_side",
                "home_team.name",
                "home_team.id",
                "away_team.name",
                "away_team.id",
            ],
        )
        
        # Keep only players who played
        players_df = players_df[~((players_df.start_time.isna()) & (players_df.end_time.isna()))]
        players_df["total_time"] = players_df["end_time"].apply(time_to_seconds) - players_df["start_time"].apply(time_to_seconds)
        players_df["is_gk"] = players_df["player_role.acronym"] == "GK"
        players_df["match_name"] = players_df["home_team.name"] + " vs " + players_df["away_team.name"]
        players_df["home_away_player"] = np.where(players_df.team_id == players_df["home_team.id"], "Home", "Away")
        players_df["team_name"] = np.where(players_df.team_id == players_df["home_team.id"], players_df["home_team.name"], players_df["away_team.name"])
        
        # Figure out sides
        players_df[["home_team_side_1st_half", "home_team_side_2nd_half"]] = (
            players_df["home_team_side"].astype(str).str.strip("[]").str.replace("'", "").str.split(", ", expand=True)
        )
        players_df["direction_player_1st_half"] = np.where(players_df.home_away_player == "Home",
                                                           players_df.home_team_side_1st_half,
                                                           players_df.home_team_side_2nd_half)
        players_df["direction_player_2nd_half"] = np.where(players_df.home_away_player == "Home",
                                                           players_df.home_team_side_2nd_half,
                                                           players_df.home_team_side_1st_half)
                                                           
        # Keep only relevant columns
        columns_to_keep = [
            "start_time",
            "end_time",
            "match_name",
            "date_time",
            "home_team.name",
            "away_team.name",
            "id",
            "short_name",
            "number",
            "team_id",
            "team_name",
            "player_role.position_group",
            "total_time",
            "player_role.name",
            "player_role.acronym",
            "is_gk",
            "direction_player_1st_half",
            "direction_player_2nd_half",
            "playing_time.total.minutes_played",
        ]
        players_df = players_df[columns_to_keep]
        
        all_players_dfs.append(players_df)
        
    except Exception as e:
        print(f"Failed to process match {match_id}: {e}")

all_players_df = pd.concat(all_players_dfs, ignore_index=True)
all_players_df.head()
all_players_df.shape

# -------------------------------------------------------------------------------------

# Merging datasets
enriched_all_tracking_data = tracking_df_all.merge(
    all_players_df, left_on=["player_id"], right_on=["id"]
)
enriched_all_tracking_data.head()

# -------------------------------------------------------------------------------------

all_de_dfs = []

for match_id in match_ids:
    url = f"https://raw.githubusercontent.com/SkillCorner/opendata/master/data/matches/{match_id}/{match_id}_dynamic_events.csv"
    try:
        de_match = pd.read_csv(url)
        all_de_dfs.append(de_match)
    except Exception as e:
        print(f"Failed to load dynamic events for match {match_id}: {e}")

de_all_matches = pd.concat(all_de_dfs, ignore_index=True)
print(de_all_matches.shape)



  de_match = pd.read_csv(url)
  de_match = pd.read_csv(url)


(47853, 294)


In [3]:
####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------

matches_json_path = os.path.join(os.getcwd(), "data/matches.json")

with open(matches_json_path, "r") as f:
    matches_json = json.load(f)

match_id = matches_json[0]["id"]

# # Construct the raw GitHub URL
tracking_data_github_url=f'https://media.githubusercontent.com/media/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_tracking_extrapolated.jsonl' # Data is stored using GitLFS
raw_data=pd.read_json(tracking_data_github_url,lines=True)


raw_df = pd.json_normalize(
    raw_data.to_dict("records"),
    "player_data",
    ["frame", "timestamp", "period", "possession", "ball_data"],
)

# Extract 'player_id' and 'group from the 'possession' dictionary
raw_df["possession_player_id"] = raw_df["possession"].apply(
    lambda x: x.get("player_id")
)
raw_df["possession_group"] = raw_df["possession"].apply(lambda x: x.get("group"))


# (Optional) Expand the ball_data with json_normalize
raw_df[["ball_x", "ball_y", "ball_z", "is_detected_ball"]] = pd.json_normalize(
    raw_df.ball_data
)


# (Optional) Drop the original 'possession' column if you no longer need it
raw_df = raw_df.drop(columns=["possession", "ball_data"])

# Add the match_id identifier to your dataframe
raw_df["match_id"] = match_id
tracking_df = raw_df.copy()
tracking_df.head()
print(tracking_df.shape)

(888888, 14)


In [4]:

####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------
# match_id=1886347
meta_data_github_url=f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json'
# # Read the JSON data as a JSON object
response = requests.get(meta_data_github_url)
raw_match_data = response.json()


# The output has nested json elements. We process them
raw_match_df = pd.json_normalize(raw_match_data, max_level=2)
print(raw_match_df.columns.tolist())
print(raw_match_df.shape)
print(raw_match_df.metadata)
raw_match_df["home_team_side"] = raw_match_df["home_team_side"].astype(str)

players_df = pd.json_normalize(
    raw_match_df.to_dict("records"),
    record_path="players",
    meta=[
        "home_team_score",
        "away_team_score",
        "date_time",
        "home_team_side",
        "home_team.name",
        "home_team.id",
        "away_team.name",
        "away_team.id",
    ],  # data we keep
)


# Take only players who played and create their total time
players_df = players_df[
    ~((players_df.start_time.isna()) & (players_df.end_time.isna()))
]
players_df["total_time"] = players_df["end_time"].apply(time_to_seconds) - players_df[
    "start_time"
].apply(time_to_seconds)

# Create a flag for GK
players_df["is_gk"] = players_df["player_role.acronym"] == "GK"

# Add a flag if the given player is home or away
players_df["match_name"] = (
    players_df["home_team.name"] + " vs " + players_df["away_team.name"]
)


# Add a flag if the given player is home or away
players_df["home_away_player"] = np.where(
    players_df.team_id == players_df["home_team.id"], "Home", "Away"
)

# Create flag from player
players_df["team_name"] = np.where(
    players_df.team_id == players_df["home_team.id"],
    players_df["home_team.name"],
    players_df["away_team.name"],
)

# Figure out sides
players_df[["home_team_side_1st_half", "home_team_side_2nd_half"]] = (
    players_df["home_team_side"]
    .astype(str)
    .str.strip("[]")
    .str.replace("'", "")
    .str.split(", ", expand=True)
)
# Clean up sides
players_df["direction_player_1st_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_1st_half,
    players_df.home_team_side_2nd_half,
)
players_df["direction_player_2nd_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_2nd_half,
    players_df.home_team_side_1st_half,
)

print("Columns in players data:")
print(players_df.columns.tolist())
# Clean up and keep the columns that we want to keep about

columns_to_keep = [
    "start_time",
    "end_time",
    "match_name",
    "date_time",
    "home_team.name",
    "away_team.name",
    "id",
    "short_name",
    "number",
    "team_id",
    "team_name",
    "player_role.position_group",
    "total_time",
    "player_role.name",
    "player_role.acronym",
    "is_gk",
    "direction_player_1st_half",
    "direction_player_2nd_half",
    "playing_time.total.minutes_played",
]
players_df = players_df[columns_to_keep]
players_df.head()
players_df.shape

['id', 'home_team_score', 'away_team_score', 'date_time', 'home_team_coach', 'away_team_coach', 'match_periods', 'referees', 'players', 'status', 'home_team_side', 'pitch_length', 'pitch_width', 'stadium.id', 'stadium.name', 'stadium.city', 'stadium.capacity', 'home_team.id', 'home_team.name', 'home_team.short_name', 'home_team.acronym', 'home_team_kit.id', 'home_team_kit.team_id', 'home_team_kit.season.id', 'home_team_kit.season.start_year', 'home_team_kit.season.end_year', 'home_team_kit.season.name', 'home_team_kit.name', 'home_team_kit.jersey_color', 'home_team_kit.number_color', 'away_team.id', 'away_team.name', 'away_team.short_name', 'away_team.acronym', 'away_team_kit.id', 'away_team_kit.team_id', 'away_team_kit.season.id', 'away_team_kit.season.start_year', 'away_team_kit.season.end_year', 'away_team_kit.season.name', 'away_team_kit.name', 'away_team_kit.jersey_color', 'away_team_kit.number_color', 'home_team_playing_time.minutes_tip', 'home_team_playing_time.minutes_otip', 'a

AttributeError: 'DataFrame' object has no attribute 'metadata'

In [47]:
# Merging datasets
enriched_tracking_data = tracking_df.merge(
    players_df, left_on=["player_id"], right_on=["id"]
)
enriched_tracking_data.head()

Unnamed: 0,x,y,player_id,is_detected,frame,timestamp,period,possession_player_id,possession_group,ball_x,...,team_id,team_name,player_role.position_group,total_time,player_role.name,player_role.acronym,is_gk,direction_player_1st_half,direction_player_2nd_half,playing_time.total.minutes_played
0,-38.16,1.51,51678,False,2510,2025-10-28,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Other,5400,Goalkeeper,GK,True,left_to_right,right_to_left,97.42
1,-20.78,3.31,51013,True,2510,2025-10-28,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Central Defender,3469,Right Center Back,RCB,False,left_to_right,right_to_left,58.98
2,-20.93,14.81,51685,True,2510,2025-10-28,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Central Defender,5400,Left Center Back,LCB,False,left_to_right,right_to_left,97.42
3,-21.03,-8.49,800322,True,2510,2025-10-28,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Full Back,5400,Right Back,RB,False,left_to_right,right_to_left,97.42
4,-8.6,23.72,811820,True,2510,2025-10-28,1.0,,,-0.46,...,868,Melbourne Victory Football Club,Full Back,5400,Left Back,LB,False,left_to_right,right_to_left,97.42


In [48]:
# match_id = 1886347
url = f"https://raw.githubusercontent.com/SkillCorner/opendata/master/data/matches/{match_id}/{match_id}_dynamic_events.csv"
de_match = pd.read_csv(url)

# Print column names, head, and shape
print(f"Columns in dynamic events data: {de_match.columns.tolist()}")
print(f"Shape of dynamic events data: {de_match.shape}")

Columns in dynamic events data: ['event_id', 'index', 'match_id', 'frame_start', 'frame_end', 'frame_physical_start', 'time_start', 'time_end', 'minute_start', 'second_start', 'duration', 'period', 'attacking_side_id', 'attacking_side', 'event_type_id', 'event_type', 'event_subtype_id', 'event_subtype', 'player_id', 'player_name', 'player_position_id', 'player_position', 'player_in_possession_id', 'player_in_possession_name', 'player_in_possession_position_id', 'player_in_possession_position', 'team_id', 'team_shortname', 'x_start', 'y_start', 'channel_id_start', 'channel_start', 'third_id_start', 'third_start', 'penalty_area_start', 'x_end', 'y_end', 'channel_id_end', 'channel_end', 'third_id_end', 'third_end', 'penalty_area_end', 'associated_player_possession_event_id', 'associated_player_possession_frame_start', 'associated_player_possession_frame_end', 'associated_player_possession_end_type_id', 'associated_player_possession_end_type', 'associated_off_ball_run_event_id', 'associate

  de_match = pd.read_csv(url)


In [4]:
# ------------- OFF-BALL RUN EVENTS ANALYSIS ---------------------

# Get off-ball events
off_ball_events = de_all_matches[de_all_matches["event_type_id"] == 1]
print(f"Number of off-ball events: {off_ball_events.shape[0]}")

# Get the total minutes played per match and added in the de_all_matches dataframe
total_minutes_per_match = enriched_all_tracking_data.groupby('match_id')['playing_time.total.minutes_played'].max().reset_index().rename(columns={'playing_time.total.minutes_played': 'total_minutes_played'})
print(total_minutes_per_match.head(20))

total_minutes = total_minutes_per_match['total_minutes_played'].sum()
print(f"Total minutes played across all matches: {total_minutes}")

# merge total minutes played into off_ball_events
off_ball_events = off_ball_events.merge(total_minutes_per_match, on='match_id', how='left')

# Group by event_subtype
off_ball_events_grouped = off_ball_events.groupby("event_subtype")

# For each subtype, get count and scaled count and proportion of targeted and received
off_ball_event_summary = []
for subtype, group in off_ball_events_grouped:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    # per90 scaled values
    total_per90 = (total_count * (90 / total_minutes))
    targeted_per90 = (targeted_count * (90 / total_minutes))
    received_per90 = (received_count * (90 / total_minutes))
    targeted_proportion90 = targeted_per90 / total_per90 if total_per90 > 0 else 0
    received_proportion90 = received_per90 / total_per90 if total_per90 > 0 else 0
    print(f"Subtype: {subtype} Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion90:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion90:.2%})")
    # Save these values for later use
    off_ball_event_summary.append({
        "subtype": subtype,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion90,
        "received_proportion": received_proportion90
    })

# Convert to DataFrame for easier handling
off_ball_event_stats_df = pd.DataFrame(off_ball_event_summary)
off_ball_event_stats_df.head(20)

# Get the overall stats too
overall_total_count = off_ball_events.shape[0]
overall_targeted_count = off_ball_events["targeted"].sum()
overall_received_count = off_ball_events["received"].sum()
totalper90 = (overall_total_count * (90 / total_minutes))
targetedper90 = (overall_targeted_count * (90 / total_minutes))
receivedper90 = (overall_received_count * (90 / total_minutes))
targetedproportion = overall_targeted_count / overall_total_count if overall_total_count > 0 else 0
receivedproportion = overall_received_count / overall_total_count if overall_total_count > 0 else 0
print(f"Overall Off-Ball Events - Total: {overall_total_count} ({totalper90:.2f}/90min), Targeted: {overall_targeted_count} ({targetedper90:.2f}/90min, {targetedproportion:.2%}), Received: {overall_received_count} ({receivedper90:.2f}/90min, {receivedproportion:.2%})")


Number of off-ball events: 5002
   match_id  total_minutes_played
0   1886347                100.85
1   1899585                100.85
2   1925299                102.13
3   1953632                 98.75
4   1996435                 96.00
5   2006229                 98.75
6   2011166                100.85
7   2013725                 99.83
8   2015213                100.85
9   2017461                100.85
Total minutes played across all matches: 999.71
Subtype: behind Total: 363 (32.68/90min), Targeted: 155 (13.95/90min, 42.70%), Received: 65 (5.85/90min, 17.91%)
Subtype: coming_short Total: 701 (63.11/90min), Targeted: 244 (21.97/90min, 34.81%), Received: 223 (20.08/90min, 31.81%)
Subtype: cross_receiver Total: 423 (38.08/90min), Targeted: 130 (11.70/90min, 30.73%), Received: 46 (4.14/90min, 10.87%)
Subtype: dropping_off Total: 631 (56.81/90min), Targeted: 245 (22.06/90min, 38.83%), Received: 237 (21.34/90min, 37.56%)
Subtype: overlap Total: 153 (13.77/90min), Targeted: 60 (5.40/90min, 3

In [49]:
# ------------- OFF-BALL EVENT ANALYSIS ------------------
# How many were targeted and how many were received?

# Get the total minutes played in the match
total_minutes_played = enriched_tracking_data["playing_time.total.minutes_played"].max()
print(f"Total minutes played in the match: {total_minutes_played}")


# Safeguard and compute scaling factor to per 90 minutes
if pd.isna(total_minutes_played) or total_minutes_played <= 0:
    scale_per90 = 1.0  # avoid division by zero; fallback to raw counts
    print("Warning: total_minutes_played is not positive; per90 scaling disabled.")
else:
    scale_per90 = 90.0 / total_minutes_played

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]
print(f"Number of off-ball events: {off_ball_events.shape[0]}")

# Group by team, player_position and event_subtype
off_ball_event_groups = off_ball_events.groupby(["event_subtype"])

# For each team and subtype
off_ball_event_stats = []
for subtype, group in off_ball_event_groups:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    targeted_proportion = targeted_count / total_count if total_count > 0 else 0
    received_proportion = received_count / total_count if total_count > 0 else 0
    # per90 scaled values
    targeted_per90 = targeted_count * scale_per90
    received_per90 = received_count * scale_per90
    total_per90 = total_count * scale_per90
    print(f"Subtype: {subtype}, Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion:.2%})")
    # Save these values for later use
    off_ball_event_stats.append({
        "team_shortname": team_shortname,
        "player_position": player_position,
        "subtype": subtype,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion,
        "received_proportion": received_proportion
    })

# Convert to DataFrame for easier handling
off_ball_event_stats_df = pd.DataFrame(off_ball_event_stats)

# Also calculate overall proportions (match-level) and per90 totals
total_off_ball_events = off_ball_events.shape[0]
total_targeted = off_ball_events["targeted"].sum()
total_received = off_ball_events["received"].sum()
overall_targeted_proportion = total_targeted / total_off_ball_events if total_off_ball_events > 0 else 0
overall_received_proportion = total_received / total_off_ball_events if total_off_ball_events > 0 else 0
total_off_ball_events_per90 = total_off_ball_events * scale_per90
total_targeted_per90 = total_targeted * scale_per90
total_received_per90 = total_received * scale_per90
print(f"\nOverall Off-Ball Events: Total: {total_off_ball_events} ({total_off_ball_events_per90:.2f}/90min), Targeted: {total_targeted} ({total_targeted_per90:.2f}/90min, {overall_targeted_proportion:.2%}), Received: {total_received} ({total_received_per90:.2f}/90min, {overall_received_proportion:.2%})")

Total minutes played in the match: 97.42
Number of off-ball events: 460
Subtype: ('behind',), Total: 28 (25.87/90min), Targeted: 13 (12.01/90min, 46.43%), Received: 5 (4.62/90min, 17.86%)
Subtype: ('coming_short',), Total: 52 (48.04/90min), Targeted: 23 (21.25/90min, 44.23%), Received: 19 (17.55/90min, 36.54%)
Subtype: ('cross_receiver',), Total: 46 (42.50/90min), Targeted: 12 (11.09/90min, 26.09%), Received: 5 (4.62/90min, 10.87%)
Subtype: ('dropping_off',), Total: 45 (41.57/90min), Targeted: 22 (20.32/90min, 48.89%), Received: 21 (19.40/90min, 46.67%)
Subtype: ('overlap',), Total: 21 (19.40/90min), Targeted: 10 (9.24/90min, 47.62%), Received: 8 (7.39/90min, 38.10%)
Subtype: ('pulling_half_space',), Total: 13 (12.01/90min), Targeted: 4 (3.70/90min, 30.77%), Received: 2 (1.85/90min, 15.38%)
Subtype: ('pulling_wide',), Total: 27 (24.94/90min), Targeted: 13 (12.01/90min, 48.15%), Received: 10 (9.24/90min, 37.04%)
Subtype: ('run_ahead_of_the_ball',), Total: 134 (123.79/90min), Targeted: 5

In [10]:
# Now analyze by phases of play

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]

# Group by team_shortname, team_in_possession_phase_type and event_subtype
off_ball_event_groups = off_ball_events.groupby(["team_shortname", "team_in_possession_phase_type", "event_subtype"])

# For each group, calculate relevant statistics using also per90 scaling
off_ball_phase_stats = []
for (team_shortname, phase_type, subtype), group in off_ball_event_groups:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    targeted_proportion = targeted_count / total_count if total_count > 0 else 0
    received_proportion = received_count / total_count if total_count > 0 else 0
    # per90 scaled values
    targeted_per90 = targeted_count * scale_per90
    received_per90 = received_count * scale_per90
    total_per90 = total_count * scale_per90
    print(f"Team {team_shortname} - Phase: {phase_type} - Subtype: {subtype}, Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion:.2%})")
    # Save these values for later use
    off_ball_phase_stats.append({
        "team_shortname": team_shortname,
        "phase_type": phase_type,
        "subtype": subtype,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion,
        "received_proportion": received_proportion
    })

# Convert to DataFrame for easier handling
off_ball_phase_stats_df = pd.DataFrame(off_ball_phase_stats)

Team Auckland FC - Phase: build_up - Subtype: coming_short, Total: 1 (0.92/90min), Targeted: False (0.00/90min, 0.00%), Received: False (0.00/90min, 0.00%)
Team Auckland FC - Phase: build_up - Subtype: dropping_off, Total: 3 (2.77/90min), Targeted: 0 (0.00/90min, 0.00%), Received: 0 (0.00/90min, 0.00%)
Team Auckland FC - Phase: build_up - Subtype: pulling_wide, Total: 2 (1.85/90min), Targeted: 1 (0.92/90min, 50.00%), Received: 1 (0.92/90min, 50.00%)
Team Auckland FC - Phase: build_up - Subtype: run_ahead_of_the_ball, Total: 2 (1.85/90min), Targeted: 2 (1.85/90min, 100.00%), Received: 2 (1.85/90min, 100.00%)
Team Auckland FC - Phase: build_up - Subtype: support, Total: 1 (0.92/90min), Targeted: True (0.92/90min, 100.00%), Received: True (0.92/90min, 100.00%)
Team Auckland FC - Phase: chaotic - Subtype: behind, Total: 1 (0.92/90min), Targeted: True (0.92/90min, 100.00%), Received: False (0.00/90min, 0.00%)
Team Auckland FC - Phase: chaotic - Subtype: coming_short, Total: 7 (6.47/90min), 

In [13]:
# Identify number of untargeted off-ball runs where the phase of play for the team in possession changed within 5 seconds after the off-ball run event.

from anyio import current_time


def safe_to_timedelta(x):
    """Convert time strings like '00:01.9' or '12:34.567' into Timedelta safely."""
    if isinstance(x, pd.Series):
        # Apply recursively to each element
        return x.apply(safe_to_timedelta)
    if isinstance(x, str) and x.count(":") == 1:
        x = "00:" + x  # prepend hours if missing
    return pd.to_timedelta(x, errors="coerce")

# Get off ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]

# Get untargeted off-ball events
untargeted_off_ball_events = off_ball_events[off_ball_events["targeted"] == False].copy()

# Identify number of untargeted off-ball runs where the phase of play for the team in possession changed within 5 seconds after the off-ball run event.
def identify_phase_changes(untargeted_events):
    # Create columns to store previous and new phase and a flag that indicates a phase change
    untargeted_events["previous_phase"] = None
    untargeted_events["new_phase"] = None
    untargeted_events["phase_change"] = False
    phase_changes = 0
    for i, event in untargeted_events.iterrows():
        # Get the time the event starts, recorded to the millisecond
        start_time = safe_to_timedelta(event["time_start"]).total_seconds()
        end_time = safe_to_timedelta(event["time_end"]).total_seconds()
        # Get the team in possession
        team_in_possession = event["team_shortname"]
        # Check for subsequent events within 5 seconds
        subsequent_events = de_match[
            (safe_to_timedelta(de_match["time_start"]).dt.total_seconds() > start_time)
            & ((safe_to_timedelta(de_match["time_start"]).dt.total_seconds() <= end_time + 2) |
               (safe_to_timedelta(de_match["time_start"]).dt.total_seconds() <= start_time + 5))
            & (de_match["team_shortname"] == team_in_possession)]
        # If there are subsequent events, check if the phase has changed. There might be multiple events, consider any of them.
        for j, sub_event in subsequent_events.iterrows():
            if sub_event["team_in_possession_phase_type"] != event["team_in_possession_phase_type"]:
                phase_changes += 1
                # Save the previous phase and the new phase in new columns
                untargeted_events.loc[i, "phase_change"] = True
                untargeted_events.loc[i, "previous_phase"] = event["team_in_possession_phase_type"]
                untargeted_events.loc[i, "new_phase"] = sub_event["team_in_possession_phase_type"]
                print(f"Phase change detected for team {team_in_possession} from {event['team_in_possession_phase_type']} to {sub_event['team_in_possession_phase_type']} within 2 seconds after the end of the off-ball event.")
                break  # No need to check further subsequent events for this off-ball event
    return phase_changes

# Apply the function to the untargeted off-ball events
phase_changes_count = identify_phase_changes(untargeted_off_ball_events)

# Print the total number of phase changes detected and the proportion of untargeted off-ball runs leading to phase changes
print(f"Total number of untargeted off-ball runs leading to phase changes within 2 seconds after the end of the event: {phase_changes_count}")
print(f"Proportion of untargeted off-ball runs leading to phase changes: {phase_changes_count / len(untargeted_off_ball_events) if len(untargeted_off_ball_events) > 0 else 0:.2%}")

Phase change detected for team Melbourne V FC from direct to finish within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from chaotic to transition within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from build_up to direct within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from create to chaotic within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from create to finish within 2 seconds after the end of the off-ball event.
Phase change detected for team Melbourne V FC from create to finish within 2 seconds after the end of the off-ball event.
Phase 

In [51]:
# Get the average xpass_completion per subtype for untargeted off-ball runs for all the matches
off_ball_events = de_all_matches[de_all_matches["event_type_id"] == 1]
untargeted_off_ball_events = off_ball_events[off_ball_events["targeted"] == False]

untargeted_xpass_by_subtype = untargeted_off_ball_events.groupby("event_subtype")["xpass_completion"].agg(['mean', 'count']).reset_index()
print("Average xpass_completion for untargeted off-ball runs by subtype:")
print(untargeted_xpass_by_subtype)

# Overall average xpass_completion for untargeted off-ball runs
overall_untargeted_xpass = untargeted_off_ball_events["xpass_completion"].mean()
overall_count = untargeted_off_ball_events.shape[0]
print(f"Overall average xpass_completion for untargeted off-ball runs: {overall_untargeted_xpass:.4f} over {overall_count} events")

# Same for passing_option_score
untargeted_posscore_by_subtype = untargeted_off_ball_events.groupby("event_subtype")["passing_option_score"].agg(['mean', 'count']).reset_index()
print("Average passing_option_score for untargeted off-ball runs by subtype:")
print(untargeted_posscore_by_subtype)
overall_untargeted_posscore = untargeted_off_ball_events["passing_option_score"].mean()
overall_count = untargeted_off_ball_events.shape[0]
print(f"Overall average passing_option_score for untargeted off-ball runs: {overall_untargeted_posscore:.4f} over {overall_count} events")

# Compare it to targeted off-ball runs
targeted_off_ball_events = off_ball_events[off_ball_events["targeted"] == True]
targeted_xpass_by_subtype = targeted_off_ball_events.groupby("event_subtype")["xpass_completion"].agg(['mean', 'count']).reset_index()
print("Average xpass_completion for targeted off-ball runs by subtype:")
print(targeted_xpass_by_subtype)
overall_targeted_xpass = targeted_off_ball_events["xpass_completion"].mean()
overall_count = targeted_off_ball_events.shape[0]
print(f"Overall average xpass_completion for targeted off-ball runs: {overall_targeted_xpass:.4f} over {overall_count} events")
targeted_posscore_by_subtype = targeted_off_ball_events.groupby("event_subtype")["passing_option_score"].agg(['mean', 'count']).reset_index()
print("Average passing_option_score for targeted off-ball runs by subtype:")
print(targeted_posscore_by_subtype)
overall_targeted_posscore = targeted_off_ball_events["passing_option_score"].mean()
overall_count = targeted_off_ball_events.shape[0]
print(f"Overall average passing_option_score for targeted off-ball runs: {overall_targeted_posscore:.4f} over {overall_count} events")

Average xpass_completion for untargeted off-ball runs by subtype:
           event_subtype      mean  count
0                 behind  0.522310    208
1           coming_short  0.882890    457
2         cross_receiver  0.407946    293
3           dropping_off  0.958893    386
4                overlap  0.809235     93
5     pulling_half_space  0.768247    113
6           pulling_wide  0.836040    190
7  run_ahead_of_the_ball  0.691333    903
8                support  0.765235    525
9               underlap  0.709186     78
Overall average xpass_completion for untargeted off-ball runs: 0.7406 over 3246 events
Average passing_option_score for untargeted off-ball runs by subtype:
           event_subtype      mean  count
0                 behind  0.851152    208
1           coming_short  0.809396    457
2         cross_receiver  0.781848    293
3           dropping_off  0.829698    386
4                overlap  0.845774     93
5     pulling_half_space  0.810879    113
6           pulling_w

In [None]:
# Now analyze by phases of play

# Get off ball events
off_ball_events = de_all_matches[de_all_matches["event_type_id"] == 1]

total_minutes_per_match = enriched_all_tracking_data.groupby('match_id')['playing_time.total.minutes_played'].max().reset_index().rename(columns={'playing_time.total.minutes_played': 'total_minutes_played'})
print(total_minutes_per_match.head(20))

total_minutes = total_minutes_per_match['total_minutes_played'].sum()
print(f"Total minutes played across all matches: {total_minutes}")

# merge total minutes played into off_ball_events
off_ball_events = off_ball_events.merge(total_minutes_per_match, on='match_id', how='left')

# Group by team_shortname, team_in_possession_phase_type and event_subtype
off_ball_events_grouped = off_ball_events.groupby(["team_in_possession_phase_type", "event_subtype"])


off_ball_event_summary = []
for (phase, subtype), group in off_ball_events_grouped:
    targeted_count = group["targeted"].sum()
    received_count = group["received"].sum()
    total_count = group.shape[0]
    # per90 scaled values
    total_per90 = (total_count * (90 / total_minutes))
    targeted_per90 = (targeted_count * (90 / total_minutes))
    received_per90 = (received_count * (90 / total_minutes))
    targeted_proportion90 = targeted_per90 / total_per90 if total_per90 > 0 else 0
    received_proportion90 = received_per90 / total_per90 if total_per90 > 0 else 0
    print(f"Subtype: {subtype} and phase: {phase} Total: {total_count} ({total_per90:.2f}/90min), Targeted: {targeted_count} ({targeted_per90:.2f}/90min, {targeted_proportion90:.2%}), Received: {received_count} ({received_per90:.2f}/90min, {received_proportion90:.2%})")
    # Save these values for later use
    off_ball_event_summary.append({
        "subtype": subtype,
        "phase": phase,
        "total": total_count,
        "total_per90": total_per90,
        "targeted": targeted_count,
        "targeted_per90": targeted_per90,
        "received": received_count,
        "received_per90": received_per90,
        "targeted_proportion": targeted_proportion90,
        "received_proportion": received_proportion90
    })

# Convert to DataFrame for easier handling
off_ball_event_stats_df = pd.DataFrame(off_ball_event_summary)
off_ball_event_stats_df.head(20)

   match_id  total_minutes_played
0   1886347                100.85
1   1899585                100.85
2   1925299                102.13
3   1953632                 98.75
4   1996435                 96.00
5   2006229                 98.75
6   2011166                100.85
7   2013725                 99.83
8   2015213                100.85
9   2017461                100.85
Total minutes played across all matches: 999.71
Subtype: behind and phase: build_up Total: 2 (0.18/90min), Targeted: 0 (0.00/90min, 0.00%), Received: 0 (0.00/90min, 0.00%)
Subtype: coming_short and phase: build_up Total: 213 (19.18/90min), Targeted: 84 (7.56/90min, 39.44%), Received: 71 (6.39/90min, 33.33%)
Subtype: cross_receiver and phase: build_up Total: 1 (0.09/90min), Targeted: True (0.09/90min, 100.00%), Received: True (0.09/90min, 100.00%)
Subtype: dropping_off and phase: build_up Total: 229 (20.62/90min), Targeted: 88 (7.92/90min, 38.43%), Received: 86 (7.74/90min, 37.55%)
Subtype: overlap and phase: build_up T

Unnamed: 0,subtype,phase,total,total_per90,targeted,targeted_per90,received,received_per90,targeted_proportion,received_proportion
0,behind,build_up,2,0.180052,0,0.0,0,0.0,0.0,0.0
1,coming_short,build_up,213,19.175561,84,7.562193,71,6.391854,0.394366,0.333333
2,cross_receiver,build_up,1,0.090026,True,0.090026,True,0.090026,1.0,1.0
3,dropping_off,build_up,229,20.615979,88,7.922297,86,7.742245,0.384279,0.375546
4,overlap,build_up,1,0.090026,False,0.0,False,0.0,0.0,0.0
5,pulling_half_space,build_up,10,0.900261,3,0.270078,2,0.180052,0.3,0.2
6,pulling_wide,build_up,67,6.031749,30,2.700783,28,2.520731,0.447761,0.41791
7,run_ahead_of_the_ball,build_up,91,8.192376,36,3.24094,28,2.520731,0.395604,0.307692
8,support,build_up,27,2.430705,8,0.720209,7,0.630183,0.296296,0.259259
9,behind,chaotic,32,2.880835,19,1.710496,6,0.540157,0.59375,0.1875


In [16]:
# Get the average xpass_completion per subtype for untargeted off-ball events 

off_ball_events = de_match[de_match["event_type_id"] == 1]
off_ball_events = off_ball_events[off_ball_events["targeted"] == False]
xpass_completion_stats = off_ball_events.groupby("event_subtype")["xpass_completion"].agg(['mean', 'count']).reset_index()
print("\nAverage xpass_completion per subtype for untargeted off-ball events:")
print(xpass_completion_stats)

# Overall average xpass_completion for untargeted off-ball events
overall_mean_xpass_completion = off_ball_events["xpass_completion"].mean()
overall_count = off_ball_events.shape[0]
print(f"\nOverall average xpass_completion for untargeted off-ball events: {overall_mean_xpass_completion:.4f} over {overall_count} events")

# Do the same for passing_option_score
off_ball_events = de_match[de_match["event_type_id"] == 1]
off_ball_events = off_ball_events[off_ball_events["targeted"] == False]
passing_option_score_stats = off_ball_events.groupby("event_subtype")["passing_option_score"].agg(['mean', 'count']).reset_index()
print("\nAverage passing_option_score per subtype for untargeted off-ball events:")
print(passing_option_score_stats)

# Overall average passing_option_score for untargeted off-ball events
overall_mean_passing_option_score = off_ball_events["passing_option_score"].mean()
overall_count = off_ball_events.shape[0]
print(f"\nOverall average passing_option_score for untargeted off-ball events: {overall_mean_passing_option_score:.4f} over {overall_count} events")


# Now compare it to targeted off-ball events
off_ball_events = de_match[de_match["event_type_id"] == 1]
off_ball_events = off_ball_events[off_ball_events["targeted"] == True]
xpass_completion_stats_targeted = off_ball_events.groupby("event_subtype")["xpass_completion"].agg(['mean', 'count']).reset_index()
print("\nAverage xpass_completion per subtype for targeted off-ball events:")
print(xpass_completion_stats_targeted)
# Overall average xpass_completion for targeted off-ball events
overall_mean_xpass_completion_targeted = off_ball_events["xpass_completion"].mean()
overall_count_targeted = off_ball_events.shape[0]
print(f"\nOverall average xpass_completion for targeted off-ball events: {overall_mean_xpass_completion_targeted:.4f} over {overall_count_targeted} events")
# Do the same for passing_option_score
off_ball_events = de_match[de_match["event_type_id"] == 1]
off_ball_events = off_ball_events[off_ball_events["targeted"] == True]
passing_option_score_stats_targeted = off_ball_events.groupby("event_subtype")["passing_option_score"].agg(['mean', 'count']).reset_index()
print("\nAverage passing_option_score per subtype for targeted off-ball events:")
print(passing_option_score_stats_targeted)
# Overall average passing_option_score for targeted off-ball events
overall_mean_passing_option_score_targeted = off_ball_events["passing_option_score"].mean()
overall_count_targeted = off_ball_events.shape[0]
print(f"\nOverall average passing_option_score for targeted off-ball events: {overall_mean_passing_option_score_targeted:.4f} over {overall_count_targeted} events")


Average xpass_completion per subtype for untargeted off-ball events:
           event_subtype      mean  count
0                 behind  0.548020     15
1           coming_short  0.825431     29
2         cross_receiver  0.390050     34
3           dropping_off  0.948983     23
4                overlap  0.905727     11
5     pulling_half_space  0.879122      9
6           pulling_wide  0.826436     14
7  run_ahead_of_the_ball  0.737742     84
8                support  0.732614     59
9               underlap  0.664650     10

Overall average xpass_completion for untargeted off-ball events: 0.7241 over 288 events

Average passing_option_score per subtype for untargeted off-ball events:
           event_subtype      mean  count
0                 behind  0.861807     15
1           coming_short  0.792403     29
2         cross_receiver  0.773615     34
3           dropping_off  0.851526     23
4                overlap  0.860255     11
5     pulling_half_space  0.829456      9
6          