In [5]:
import requests
import pandas as pd
import json
import numpy as np
from scipy.spatial.distance import cdist
import os
from datetime import timedelta
from scipy.spatial import Voronoi
from shapely.geometry import Polygon, LineString,Point, box
from kloppy import skillcorner
from kloppy.domain import Team
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from urllib.request import urlopen
from highlight_text import fig_text
from matplotlib.animation import FuncAnimation
import re

from PIL import Image

from mplsoccer import PyPizza, add_image, FontManager


# Setup pitch and plot
from mplsoccer import Pitch
#from mplsoccer.pitch import Pitch ,VerticalPitch

# username = "XXX"
# password = "XXX"


# from skillcorner.client import SkillcornerClient
# client=SkillcornerClient(username=username,password=password)

def time_to_seconds(time_str):
    if time_str is None:
        return 90 * 60  # 120 minutes = 7200 seconds
    h, m, s = map(int, time_str.split(':'))
    return h * 3600 + m * 60 + s

font_normal = FontManager('https://raw.githubusercontent.com/googlefonts/roboto/main/'
                          'src/hinted/Roboto-Regular.ttf')
font_italic = FontManager('https://raw.githubusercontent.com/googlefonts/roboto/main/'
                          'src/hinted/Roboto-Italic.ttf')
font_bold = FontManager('https://raw.githubusercontent.com/google/fonts/main/apache/robotoslab/'
                        'RobotoSlab[wght].ttf')

In [6]:
def load_matches(matches_json_path):

    with open(matches_json_path, "r") as f:
        matches_json = json.load(f)

    match_ids = [match["id"] for match in matches_json]

    all_tracking = []

    for match_id in match_ids:
        tracking_data_github_url = f'https://media.githubusercontent.com/media/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_tracking_extrapolated.jsonl'
        
        dataset = skillcorner.load(
            meta_data=f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json',
            raw_data=tracking_data_github_url,
            # Optional arguments,
            coordinates="skillcorner",
            include_empty_frames=False)

        all_tracking.append(dataset)

    all_de_dfs = []

    for match_id in match_ids:
        url = f"https://raw.githubusercontent.com/SkillCorner/opendata/master/data/matches/{match_id}/{match_id}_dynamic_events.csv"
        try:
            de_match = pd.read_csv(url)
            all_de_dfs.append(de_match)
        except Exception as e:
            print(f"Failed to load dynamic events for match {match_id}: {e}")

    de_all_matches = pd.concat(all_de_dfs, ignore_index=True)

    all_metadata = []

    for match_id in match_ids:
        metadata_match = f'https://raw.githubusercontent.com/SkillCorner/opendata/741bdb798b0c1835057e3fa77244c1571a00e4aa/data/matches/{match_id}/{match_id}_match.json'
        response = requests.get(metadata_match)
        raw_match_data = response.json()
        all_metadata.append(raw_match_data)

    
    return all_tracking, de_all_matches, all_metadata
        

In [7]:
# Load matches

matches_json_path = os.path.join(os.path.dirname(os.getcwd()), "data/matches.json")
all_tracking, de_all_matches, all_metadata = load_matches(matches_json_path)

  de_match = pd.read_csv(url)
  de_match = pd.read_csv(url)


In [8]:
def midfielders_obe(de_all_matches):
    # Get off-ball events
    off_ball_events = de_all_matches[de_all_matches["event_type_id"] == 1]

    # Get only off ball events from midfielders
    positions_mid = [9,10,11,12,13,14,15]
    mid_obe = off_ball_events[off_ball_events["player_position_id"].isin(positions_mid)].copy()

    # For every obe, column id equals event_id_match_id
    mid_obe["id"] = mid_obe["event_id"].astype(str) + "_" + mid_obe["match_id"].astype(str)
    mid_obe = mid_obe.reset_index(drop=True)

    # Data matching
    mid_obe = mid_obe[
            (mid_obe["is_player_possession_start_matched"] == True) &
            (mid_obe["is_player_possession_end_matched"] == True)
        ]
    return mid_obe

In [9]:
def get_time_from_frame(all_tracking, match_id, frame_id):
    for tracking in all_tracking:
        if int(tracking.metadata.game_id) == int(match_id):
            for frame in tracking:
                if int(frame.frame_id) == int(frame_id):
                    time_str = str(frame.time)  # e.g. "P1T00:09" or "P2T03:21"

                    # Extract period, minutes and seconds
                    match = re.match(r"P(\d+)T(\d+):(\d+)", time_str)
                    period = int(match.group(1))
                    minutes = int(match.group(2))
                    seconds = int(match.group(3))

                    # Base time from mm:ss
                    td = pd.Timedelta(minutes=minutes, seconds=seconds)

                    # If second period, add 45 minutes
                    if period == 2:
                        td += pd.Timedelta(minutes=45)

                    return td

In [14]:
mid_obe = midfielders_obe(de_all_matches)
    
# get the 10th row
example = mid_obe.iloc[9]
print(example["associated_player_possession_event_id"])
print(example["match_id"])
print(example["associated_player_possession_frame_start"])
frame_start = example["frame_start"]

pp = de_all_matches[
    (de_all_matches["event_id"] == example["associated_player_possession_event_id"]) &
    (de_all_matches["match_id"] == example["match_id"])
]

p_options = de_all_matches[
    (de_all_matches["event_type_id"] == 7) &
    (de_all_matches["match_id"] == example["match_id"])]

# get the passing options that occurs during -0.5seconds to +5 seconds of the possession start
p_options = p_options[(
    (pd.to_datetime(pp["time_start"])) >= pd.to_datetime(p_options["time_start"]) & pd.to_datetime(pp["time_start"]) < pd.to_datetime(p_options["time_end"]))   
]
print(p_options["second_start"].values)
print(p_options.shape[0])


print(pp["frame_start"].values[0])
print(pp["n_passing_options_at_start"].values[0])
print(pp["second_start"].values[0])

8_65
2017461
5887.0


DateParseError: hour must be in 0..23: 24:07.4, at position 399

In [44]:

columns = ["event_id", "player_id","match_id","frame_start","frame_end","simultaneous_runs","po_created",
           "n_dangerous","n_first_line", "n_second_last_line", "n_last_line"]
po_df = pd.DataFrame(columns=columns)

for idx, row in mid_obe.iterrows():
    frame_start = row["frame_start"]
    frame_end = row["frame_end"]
    match_id = row["match_id"]

    if row['n_simultaneous_runs'] > 0:
        simultaneous_runs = True
    else:
        simultaneous_runs = False

    # get passion options events that the start frame is between frame_start and frame_end
    p_options = de_all_matches[
        (de_all_matches["event_type_id"] == 7) &
        (de_all_matches["match_id"] == match_id) &
        (de_all_matches["frame_start"] >= frame_start) &
        (de_all_matches["frame_start"] <= frame_end)
    ]

    true_p_o_count = (p_options['predicted_passing_option'] == True).sum()

    dangerous_count = (p_options['dangerous'] == True).sum()

    # for the column furthest_line_break, there are 4 options: first_line, second_last_line, last_line, none. get the counts for each option
    grouped = p_options.groupby('furthest_line_break').size()
    first_line_count = grouped.get('first_line', 0)
    second_last_line_count = grouped.get('second_last_line', 0) 
    last_line_count = grouped.get('last_line', 0)
    none_count = grouped.get('none', 0)

    po_df = pd.concat([po_df, pd.DataFrame([{
        "event_id": row["event_id"],
        "player_id": row["player_id"],
        "match_id": match_id,
        "frame_start": frame_start,
        "frame_end": frame_end,
        "simultaneous_runs": simultaneous_runs,
        "po_created": true_p_o_count,
        "n_dangerous": dangerous_count,
        "n_first_line": first_line_count,
        "n_second_last_line": second_last_line_count,
        "n_last_line": last_line_count
    }])], ignore_index=True)

    
print(po_df.shape)

print(po_df[(po_df['po_created'] > 0) & (po_df["simultaneous_runs"] == False)].shape)

(1271, 11)
(505, 11)


In [None]:
## ------ NUMBER OF (DANGEROUS AND NOT DIFFICULT) PASSING OPTIONS CREATED ------ ##

def po_created(mid_obe, de_all_matches):

    for idx, row in mid_obe.iterrows():
        frame_start = row["frame_start"]
        frame_end = row["frame_end"]
        match_id = row["match_id"]
        player_id = row["player_id"]
        event_id = row["event_id"]
        time_start = row["time_start"]
        time_start_td = pd.to_timedelta("00:" + time_start)
        prev_time_start = time_start_td - pd.Timedelta(seconds=5)

        """
        # get previous frame of frame_start
        previous_frame = frame_start - 1
        while True:
            if previous_frame in de_all_matches[
                (de_all_matches["event_type_id"] == 7) & 
                (de_all_matches["match_id"] == match_id)]["frame_start"].values:
                break
            previous_frame -= 1

        de_all_matches["peak_frame_td"] = de_all_matches["peak_passing_option_frame"].apply(
        lambda f: get_time_from_frame(all_tracking, match_id, f))

        # get passing options before the run 
        p_options_prev = de_all_matches[
            (de_all_matches["event_type_id"] == 7) &
            (de_all_matches["match_id"] == match_id) &
            (de_all_matches["time_start"] <= time_start) &
            (de_all_matches["time_start"] >= prev_time_start)] 
        n_po = p_options_prev.shape[0]  
 
        # and that the peak passing option frame is around 5 seconds before the run start        
        
        p_options_prev = p_options_prev[
            (prev_time_start <= de_all_matches["peak_frame_td"]) &
            (de_all_matches["peak_frame_td"]  <= time_start_td)]
        n_dang_not_diff = p_options_prev[(p_options_prev['dangerous'] == True) & (p_options_prev['difficult_pass_target'] == False)].shape[0]
        """

        # During the run
        p_options_run = de_all_matches[
            (de_all_matches["event_type_id"] == 7) &
            (de_all_matches["match_id"] == match_id) &
            (de_all_matches["frame_start"] >= frame_start) &
            (de_all_matches["frame_start"] <= frame_end) &
            (de_all_matches["player_id"] != player_id)]
        
        n_po_run = p_options_run.shape[0]

        p_options_run = p_options_run[
            (p_options_run["peak_passing_option_frame"] >= frame_start) &
            (p_options_run["peak_passing_option_frame"] <= frame_end)]
       
        n_dang_not_diff_run = p_options_run[(p_options_run['dangerous'] == True) & (p_options_run['difficult_pass_target'] == False)].shape[0]

        #mid_obe.at[idx, 'n_passing_options_before'] = n_po
        #mid_obe.at[idx, 'n_dangerous_not_difficult_before'] = n_dang_not_diff
        mid_obe.at[idx, 'n_passing_options_during'] = n_po_run
        mid_obe.at[idx, 'n_dangerous_not_difficult_during'] = n_dang_not_diff_run
        
    return mid_obe

In [30]:
mid_obe = po_created(mid_obe, de_all_matches)

# Count n_passing_options_during and n_dangerous_not_difficult_during
for idx, row in mid_obe.head(20).iterrows():
    print(row['n_passing_options_during'], row['n_dangerous_not_difficult_during'])

11117
159688
11117
11117
23418
23418
14736
11117
157294
11117
23418
11117
23418
159688
159688
11117
157294
11117
11117
11117
11117
11117
23418
157294
159688
11117
11117
157294
14736
159688
159688
159688
11117
14736
23418
23418
11117
159688
157294
11117
23418
23418
14736
14736
23418
14736
23418
23418
23418
23418
23418
14736
14736
11117
157294
11117
14736
11117
159688
157294
23418
11117
11117
23418
11117
23418
23418
23418
23418
23418
23418
11117
14736
23418
11117
14736
157294
50951
159688
51714
157294
159688
159688
14736
50951
14736
50951
50951
14736
14736
14736
159688
157294
157294
560917
14736
23418
51675
51675
560917
14736
14736
560917
51675
14736
14736
14736
51675
23418
14736
23418
14736
51675
23418
14736
560917
560917
23418
14736
23418
51675
51675
560917
14736
14736
14736
14736
14736
23418
23418
23418
23418
51675
51675
560917
560917
14736
51675
50951
50951
14736
50951
809670
14736
14736
50951
14736
14736
50951
50951
809670
51675
14736
809670
51675
809670
50951
51675
14736
14736
8096

KeyboardInterrupt: 

I get the coordinates of the runner in the beginning of the run and the coordinates in the end of the run. Then, I get the coordinates of the player in possesion. Then I draw a line between the coordinates of the runner at the start and the coordinates of the player in possession. I also draw a line from the player in possesion to the coordinates in the end of the run. This two lines form an angle with the coordinates of the player in possession. Then I count all the passing options that are created inside this angle. If the run is horizontal is perfect. If the run is vertical (I can define a threshold), then I get the whole third or a predefined radius. Also add a distance threshold because the runner can move orizontally close to the player in possesion and suddenly there is a run inside the angle but its waaay far from the runner and that would mean that he didnt influence much.

Get the coordinates of the runner at start time. The pp at start time too. Then coordinates of the runner at end time. If end time the pp doesnt have the ball, get the last frame where the pp have the ball. Then get the coordinates of the runner at that frame. Like this we will have the accurate angle of the runner with the pp. 

If the pp lost the ball, then thats it. If the ball goes to another player (the pp ends with a pass or check who has the ball in the next frame) then we get the coordinates of the runner at that frame and in the end of the run and we get the coordinates of the pp and we do the same proces.

This might be good and also can be visualized well!

In [107]:
# Now I should take into account location of the runner (maybe the channel, the third or if its ahead of the player in possesion)