In this notebook I perform exploratory data analysis (EDA) on the raw event data, with the goal of grouping the raw event types into the following categories: {Pass, Drive, Header, High Pass, Out, Cross, Throw In, Shot, Ball Player Block, Player Successful Tackle, Free Kick, Goal, Goal Kick, Corner Kick}.

First, let's build a table of all the event types that occur in the data, sorted by descending frequency.

In [1]:
import pandas as pd
import os

match_ids = [117092, 117093, 118575, 118576, 118577, 118578, 128057, 128058, 132831, 132877]
raw_data_path = '/home/share/data/SoccerTrack-v2/data/raw'

# Read the per-match table stored at
# <raw_data_path>/<match_id>/<match_id>_player_nodes.csv for every match.
all_events = []
for match_id in match_ids:
    event_path = os.path.join(raw_data_path, str(match_id), f'{match_id}_player_nodes.csv')
    event_df = pd.read_csv(event_path)
    all_events.append(event_df)

# Stack all matches into one frame with a fresh 0..n-1 index.
combined_events = pd.concat(all_events, ignore_index=True)

# Each 'filtered_event_types' cell is split on whitespace into individual
# event type names; explode() then puts one name per row. Rows without any
# event type come out as NaN and are dropped before counting.
event_types = combined_events['filtered_event_types'].str.split().explode().dropna()
event_counts = event_types.value_counts().reset_index()
event_counts.columns = ['Event Type', 'Frequency']

# Print the complete table. The default DataFrame repr only shows the first
# and last few rows, which hides most event types from the reader.
print(event_counts.to_string())

                   Event Type  Frequency
0               passSucceeded       9799
1                passReceived       9798
2          shortPassSucceeded       5376
3         mediumPassSucceeded       3701
4       sidewaysPassSucceeded       3662
..                        ...        ...
74                penaltyKick          4
75      aerialClearanceFailed          3
76                 yellowCard          2
77  shotSaveByForwardMovement          2
78       setPieceGoalConceded          1

[79 rows x 2 columns]


In [2]:
import numpy as np
# Define event categories.
#
# Each list below groups raw event type names (the whitespace-separated tokens
# found in the 'filtered_event_types' column) under one category.
# assign_category() looks tokens up in every list except other_category.

# Aerial duels and aerial clearances -> 'Header'
header_category = [
    'aerialDuelSucceeded',
    'aerialDuelFailed',
    'aerialClearanceSucceeded',
    'aerialClearanceFailed'
]

# Crosses, including received and blocked crosses -> 'Cross'
cross_category = [
    'crossSucceeded',
    'crossFailed',
    'crossReceived',
    'blockCross'
]

# -> 'Throw In'
throw_in_category = [
    'throwIn'
]

# Shots, shot outcomes and shot blocks/saves -> 'Shot'
shot_category = [
    'shot',
    'shotOnTarget',
    'shotMissed',
    'shotBlocked',
    'shotSaveByForwardMovement',
    'setPieceShotConceded',
    'blockShot'
]

# -> 'Free Kick'
free_kick_category = [
    'freeKick',
]

# -> 'Penalty'
# 'penaltyKick' is the name that appears in the event frequency table;
# 'penalty' is kept so any data that uses the shorter name still matches.
penalty_category = [
    'penalty',
    'penaltyKick'
]

# -> 'Goal'
goal_category = [
    'goal',
    'goalAgainst',
    'setPieceGoalConceded'
]

# -> 'Corner Kick'
corner_kick_category = [
    'cornerKick'
]

# -> 'Goal Kick'
goal_kick_category = [
    'goalKickSucceeded',
    'goalKickFailed'
]

# Passes of every length/direction, plus receptions and assists -> 'Pass'
pass_category = [
    'passSucceeded',
    'passReceived',
    'passFailed',
    'shortPassSucceeded',
    'shortPassFailed',
    'mediumPassSucceeded',
    'mediumPassFailed',
    'longPassSucceeded',
    'longPassFailed',
    'sidewaysPassSucceeded',
    'sidewaysPassFailed',
    'forwardPassSucceeded',
    'forwardPassFailed',
    'backwardPassSucceeded',
    'backwardPassFailed',
    'keyPass',
    'assist',
    'ballReceived'
]

# Event types that are deliberately left uncategorised.
# NOTE: assign_category() never reads this list; any token that is not in one
# of the lists above falls through to 'other'. It is kept as documentation of
# the event types that were reviewed.
other_category = [
    'possession',
    'intercept',
    'clearance',
    'duelSucceeded',
    'duelFailed',
    'cutoff',
    'tackleSucceeded',
    'tackleFailed',
    'turnoverWon',
    'turnoverLost',
    'block',
    'dribbleToSpace',
    'groundDuelSucceeded',
    'groundDuelFailed',
    'ballMissed',
    'changeIn',
    'changeOut',
    'foulCommitted',
    'foulConceded',
    'looseBallDuelSucceeded',
    'looseBallDuelFailed',
    'dribbleSucceeded',
    'dribbleFailed',
    'saveByCatching',
    'saveByPunching',
    'controlUnderPressure',
    'hit',
    'offside',
    'defensiveLineSupportSucceeded',
    'defensiveLineSupportFailed',
    'deflection',
    'handballFoul',
    'pause',
    'possessionWon',
    'possessionLost',
    'dribbleConceded',
    'yellowCard'
]

def assign_category(event_types):
    """Map one 'filtered_event_types' cell to a single event category label.

    Parameters
    ----------
    event_types : str or missing value
        Whitespace-separated raw event type names for one row. Any non-string
        value (float NaN, None, pd.NA, ...) and the literal strings 'nan' /
        'NaN' are treated as missing.

    Returns
    -------
    str or float
        np.nan when the input is missing. Otherwise the label of the first
        token that belongs to one of the category lists ('Header', 'Cross',
        'Throw In', 'Shot', 'Free Kick', 'Penalty', 'Goal', 'Corner Kick',
        'Goal Kick', 'Pass'), or 'other' when no token is recognised.
    """
    # Anything that is not a string cannot be tokenised; treat it as missing
    # instead of failing later on the string operations.
    if not isinstance(event_types, str) or event_types in ('nan', 'NaN'):
        return np.nan

    # The category lists are looked up at call time.
    category_rules = (
        (header_category, 'Header'),
        (cross_category, 'Cross'),
        (throw_in_category, 'Throw In'),
        (shot_category, 'Shot'),
        (free_kick_category, 'Free Kick'),
        (penalty_category, 'Penalty'),
        (goal_category, 'Goal'),
        (corner_kick_category, 'Corner Kick'),
        (goal_kick_category, 'Goal Kick'),
        (pass_category, 'Pass'),
    )

    # split() with no argument splits on any run of whitespace, so tabs,
    # newlines and repeated spaces do not produce unmatched tokens.
    # NOTE(review): the first recognised token in the string decides the
    # label, so 'passSucceeded goal' is labelled 'Pass'. Confirm that token
    # order, rather than a fixed category priority, is the intended rule.
    for event_type in event_types.split():
        for members, label in category_rules:
            if event_type in members:
                return label
    return 'other'


In [3]:
# Categorise a single match: load only the first match listed in match_ids.
# Indexing explicitly (instead of looping and breaking) fails loudly if
# match_ids is empty rather than silently reusing an earlier event_df.
match_id = match_ids[0]
event_path = os.path.join(raw_data_path, str(match_id), f'{match_id}_player_nodes.csv')
event_df = pd.read_csv(event_path)

# Label every row with the category derived from its event type string;
# rows without event types get NaN.
event_df['event_category'] = event_df['filtered_event_types'].apply(assign_category)



In [4]:
# Keep only the rows that were given a category (missing categories are NaN).
has_category = event_df['event_category'].notna()
new_event_df = event_df[has_category]

# Frequency table of the event categories, most frequent first.
category_frequency = new_event_df['event_category'].value_counts()
event_counts = pd.DataFrame({
    'Event Type': category_frequency.index,
    'Frequency': category_frequency.values,
})
print(event_counts)

    Event Type  Frequency
0         Pass       2446
1        other        719
2       Header         73
3     Throw In         57
4        Cross         53
5         Shot         37
6    Goal Kick         32
7  Corner Kick         20
8    Free Kick         19
9         Goal          8
