In [1]:
import requests
import pandas as pd
import json
import numpy as np
import os


# Setup pitch and plot
from mplsoccer.pitch import Pitch ,VerticalPitch

# username = "XXX"
# password = "XXX"


# from skillcorner.client import SkillcornerClient
# client=SkillcornerClient(username=username,password=password)

def time_to_seconds(time_str):
    if time_str is None:
        return 90 * 60  # 120 minutes = 7200 seconds
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s


In [2]:
####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------

matches_json_path = os.path.join(os.getcwd(), "data/matches.json")

with open(matches_json_path, "r") as f:
    matches_json = json.load(f)

match_id = matches_json[0]["id"]

# # Construct the raw GitHub URL
tracking_data_github_url=f'https://media.githubusercontent.com/media/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_tracking_extrapolated.jsonl' # Data is stored using GitLFS
raw_data=pd.read_json(tracking_data_github_url,lines=True)


raw_df = pd.json_normalize(
    raw_data.to_dict("records"),
    "player_data",
    ["frame", "timestamp", "period", "possession", "ball_data"],
)

# Extract 'player_id' and 'group from the 'possession' dictionary
raw_df["possession_player_id"] = raw_df["possession"].apply(
    lambda x: x.get("player_id")
)
raw_df["possession_group"] = raw_df["possession"].apply(lambda x: x.get("group"))


# (Optional) Expand the ball_data with json_normalize
raw_df[["ball_x", "ball_y", "ball_z", "is_detected_ball"]] = pd.json_normalize(
    raw_df.ball_data
)


# (Optional) Drop the original 'possession' column if you no longer need it
raw_df = raw_df.drop(columns=["possession", "ball_data"])

# Add the match_id identifier to your dataframe
raw_df["match_id"] = match_id
tracking_df = raw_df.copy()
tracking_df.head()

Unnamed: 0,x,y,player_id,is_detected,frame,timestamp,period,possession_player_id,possession_group,ball_x,ball_y,ball_z,is_detected_ball,match_id
0,-38.16,1.51,51678,False,2510,2025-10-19,1.0,,,-0.46,-0.12,0.14,True,2017461
1,-20.78,3.31,51013,True,2510,2025-10-19,1.0,,,-0.46,-0.12,0.14,True,2017461
2,-20.93,14.81,51685,True,2510,2025-10-19,1.0,,,-0.46,-0.12,0.14,True,2017461
3,-21.03,-8.49,800322,True,2510,2025-10-19,1.0,,,-0.46,-0.12,0.14,True,2017461
4,-8.6,23.72,811820,True,2510,2025-10-19,1.0,,,-0.46,-0.12,0.14,True,2017461


In [3]:

####-----------------------------------------------------------------------------------
# If you're on a separate project/environemnt
###------------------------------------------------------------------------------------
# match_id=1886347
meta_data_github_url=f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json'
# # Read the JSON data as a JSON object
response = requests.get(meta_data_github_url)
raw_match_data = response.json()


# The output has nested json elements. We process them
raw_match_df = pd.json_normalize(raw_match_data, max_level=2)
raw_match_df["home_team_side"] = raw_match_df["home_team_side"].astype(str)

players_df = pd.json_normalize(
    raw_match_df.to_dict("records"),
    record_path="players",
    meta=[
        "home_team_score",
        "away_team_score",
        "date_time",
        "home_team_side",
        "home_team.name",
        "home_team.id",
        "away_team.name",
        "away_team.id",
    ],  # data we keep
)


# Take only players who played and create their total time
players_df = players_df[
    ~((players_df.start_time.isna()) & (players_df.end_time.isna()))
]
players_df["total_time"] = players_df["end_time"].apply(time_to_seconds) - players_df[
    "start_time"
].apply(time_to_seconds)

# Create a flag for GK
players_df["is_gk"] = players_df["player_role.acronym"] == "GK"

# Add a flag if the given player is home or away
players_df["match_name"] = (
    players_df["home_team.name"] + " vs " + players_df["away_team.name"]
)


# Add a flag if the given player is home or away
players_df["home_away_player"] = np.where(
    players_df.team_id == players_df["home_team.id"], "Home", "Away"
)

# Create flag from player
players_df["team_name"] = np.where(
    players_df.team_id == players_df["home_team.id"],
    players_df["home_team.name"],
    players_df["away_team.name"],
)

# Figure out sides
players_df[["home_team_side_1st_half", "home_team_side_2nd_half"]] = (
    players_df["home_team_side"]
    .astype(str)
    .str.strip("[]")
    .str.replace("'", "")
    .str.split(", ", expand=True)
)
# Clean up sides
players_df["direction_player_1st_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_1st_half,
    players_df.home_team_side_2nd_half,
)
players_df["direction_player_2nd_half"] = np.where(
    players_df.home_away_player == "Home",
    players_df.home_team_side_2nd_half,
    players_df.home_team_side_1st_half,
)


# Clean up and keep the columns that we want to keep about

columns_to_keep = [
    "start_time",
    "end_time",
    "match_name",
    "date_time",
    "home_team.name",
    "away_team.name",
    "id",
    "short_name",
    "number",
    "team_id",
    "team_name",
    "player_role.position_group",
    "total_time",
    "player_role.name",
    "player_role.acronym",
    "is_gk",
    "direction_player_1st_half",
    "direction_player_2nd_half",
]
players_df = players_df[columns_to_keep]
players_df.head()

Unnamed: 0,start_time,end_time,match_name,date_time,home_team.name,away_team.name,id,short_name,number,team_id,team_name,player_role.position_group,total_time,player_role.name,player_role.acronym,is_gk,direction_player_1st_half,direction_player_2nd_half
0,01:23:12,,Melbourne Victory Football Club vs Auckland FC,2025-05-17T09:35:00Z,Melbourne Victory Football Club,Auckland FC,795521,A. Badolato,23,868,Melbourne Victory Football Club,Wide Attacker,408,Right Winger,RW,False,left_to_right,right_to_left
1,01:02:21,,Melbourne Victory Football Club vs Auckland FC,2025-05-17T09:35:00Z,Melbourne Victory Football Club,Auckland FC,965685,L. Gillion,14,4177,Auckland FC,Wide Attacker,1659,Left Winger,LW,False,right_to_left,left_to_right
2,01:14:54,,Melbourne Victory Football Club vs Auckland FC,2025-05-17T09:35:00Z,Melbourne Victory Football Club,Auckland FC,31147,T. Smith,5,4177,Auckland FC,Central Defender,906,Center Back,CB,False,right_to_left,left_to_right
3,00:00:00,01:23:12,Melbourne Victory Football Club vs Auckland FC,2025-05-17T09:35:00Z,Melbourne Victory Football Club,Auckland FC,50955,N. Velupillay,17,868,Melbourne Victory Football Club,Wide Attacker,4992,Left Winger,LW,False,left_to_right,right_to_left
4,00:00:00,01:19:39,Melbourne Victory Football Club vs Auckland FC,2025-05-17T09:35:00Z,Melbourne Victory Football Club,Auckland FC,23418,F. Gallegos,28,4177,Auckland FC,Midfield,4779,Left Midfield,LM,False,right_to_left,left_to_right


In [4]:
# Merging datasets
enriched_tracking_data = tracking_df.merge(
    players_df, left_on=["player_id"], right_on=["id"]
)
enriched_tracking_data.head()

Unnamed: 0,x,y,player_id,is_detected,frame,timestamp,period,possession_player_id,possession_group,ball_x,...,number,team_id,team_name,player_role.position_group,total_time,player_role.name,player_role.acronym,is_gk,direction_player_1st_half,direction_player_2nd_half
0,-38.16,1.51,51678,False,2510,2025-10-19,1.0,,,-0.46,...,25,868,Melbourne Victory Football Club,Other,5400,Goalkeeper,GK,True,left_to_right,right_to_left
1,-20.78,3.31,51013,True,2510,2025-10-19,1.0,,,-0.46,...,5,868,Melbourne Victory Football Club,Central Defender,3469,Right Center Back,RCB,False,left_to_right,right_to_left
2,-20.93,14.81,51685,True,2510,2025-10-19,1.0,,,-0.46,...,4,868,Melbourne Victory Football Club,Central Defender,5400,Left Center Back,LCB,False,left_to_right,right_to_left
3,-21.03,-8.49,800322,True,2510,2025-10-19,1.0,,,-0.46,...,16,868,Melbourne Victory Football Club,Full Back,5400,Right Back,RB,False,left_to_right,right_to_left
4,-8.6,23.72,811820,True,2510,2025-10-19,1.0,,,-0.46,...,28,868,Melbourne Victory Football Club,Full Back,5400,Left Back,LB,False,left_to_right,right_to_left


In [6]:
# match_id = 1886347
url = f"https://raw.githubusercontent.com/SkillCorner/opendata/master/data/matches/{match_id}/{match_id}_dynamic_events.csv"
de_match = pd.read_csv(url)

# Print column names, head, and shape
print(f"Columns in dynamic events data: {de_match.columns.tolist()}")
print(f"Shape of dynamic events data: {de_match.shape}")

Columns in dynamic events data: ['event_id', 'index', 'match_id', 'frame_start', 'frame_end', 'frame_physical_start', 'time_start', 'time_end', 'minute_start', 'second_start', 'duration', 'period', 'attacking_side_id', 'attacking_side', 'event_type_id', 'event_type', 'event_subtype_id', 'event_subtype', 'player_id', 'player_name', 'player_position_id', 'player_position', 'player_in_possession_id', 'player_in_possession_name', 'player_in_possession_position_id', 'player_in_possession_position', 'team_id', 'team_shortname', 'x_start', 'y_start', 'channel_id_start', 'channel_start', 'third_id_start', 'third_start', 'penalty_area_start', 'x_end', 'y_end', 'channel_id_end', 'channel_end', 'third_id_end', 'third_end', 'penalty_area_end', 'associated_player_possession_event_id', 'associated_player_possession_frame_start', 'associated_player_possession_frame_end', 'associated_player_possession_end_type_id', 'associated_player_possession_end_type', 'associated_off_ball_run_event_id', 'associate

  de_match = pd.read_csv(url)


In [8]:
offball_df = de_match[de_match["event_type"] == "off_ball_run"].copy()
print(f"Total off-ball runs in this match: {len(offball_df)}")

key_cols = [
    "associated_player_possession_event_id",
    "event_id",
    "player_id",
    "team_id",
    "x_start", "y_start", "x_end", "y_end",
    "xthreat",
    "passing_option_score",
    "xpass_completion",
    "team_in_possession_phase_type",
    "event_type",
    "event_subtype",
    "associated_player_possession_end_type",
    "associated_off_ball_run_event_id",
    "lead_to_different_phase",
    "phase_index",
    "lead_to_shot",
    "lead_to_goal",
    "targeted",
    "received",
    "received_in_space",
    "trajectory_direction",
    "speed_avg",
    "location_to_player_in_possession_start",
    "location_to_player_in_possession_end",
    "xloss_player_possession_start",
    "xloss_player_possession_end",
]

print(offball_df[key_cols].isna().mean().sort_values(ascending=False))

Total off-ball runs in this match: 460
xloss_player_possession_end               1.000000
xloss_player_possession_start             1.000000
lead_to_different_phase                   1.000000
associated_off_ball_run_event_id          1.000000
associated_player_possession_end_type     1.000000
received_in_space                         0.723913
speed_avg                                 0.017391
y_start                                   0.000000
associated_player_possession_event_id     0.000000
event_id                                  0.000000
player_id                                 0.000000
team_id                                   0.000000
x_start                                   0.000000
event_type                                0.000000
team_in_possession_phase_type             0.000000
xpass_completion                          0.000000
passing_option_score                      0.000000
xthreat                                   0.000000
y_end                                     0