### Metrics

In [1]:
import pandas as pd
import json
import os
import numpy as np

pd.options.display.max_columns = 999

Read datasets

In [7]:
# load tracking data
current_directory = os.getcwd()
path_tracking = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"tracking_set_0")
print(path_tracking)
game_id = 1

df_tracking = pd.read_parquet(f'{path_tracking}/{game_id}_tracking.parquet')

#           ------------------------------------------------------------        

# load events names
path_event_csv = os.path.join(os.path.dirname(current_directory),'data')
df_event_names = pd.read_csv(os.path.join(path_event_csv,'event_names.csv'))
dict_event_names = df_event_names.set_index('event_type_id').to_dict()['event_description']

# load event data
def load_event_data(file_name, base_path):
    # read in event file
    with open(f'{base_path}/{file_name}') as f:
        data=json.loads(f.read())

    f.close()
    
    # transform data into pandas dataframe
    df_events = pd.json_normalize(data['liveData']['event'])
    
    # preprocess event data and keep relevant information only

    # add timeelapsed to each event
    df_events['timestamp'] = pd.to_datetime(df_events.timeStamp).apply(lambda x: x.timestamp())

    df_events = df_events.query('periodId in [1,2]')

    def add_timeelapsed_to_events(df):
        start_time = df.query('typeId==32')['timestamp'].iloc[0]
        df['timestamp_new'] = np.int64((df['timestamp'] - start_time)*1000)

        df['timeelapsed'] = df['timestamp_new'].apply(lambda x: (40 * round(x/40))/1000)

        return df

    df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)

    df_events = df_events.drop(columns=['timeStamp','timestamp','timestamp_new'])
    
    # rename some columns
    df_events = df_events.rename(columns=
        {
            'periodId':'current_phase',
            'typeId':'event_type_id',
            'timeMin':'period_minute',
            'timeSec':'period_second'
        }
    )
    
    return df_events

path_events = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"first_10_events")
print(path_events)

event_file = f'{game_id}.json'

df_events = load_event_data(
    base_path=path_events,
    file_name=event_file
)

# add event descriptions
df_events['event_description'] = df_events['event_type_id'].map(dict_event_names)

# make a copy of it for later usage
events_all = df_events.copy()

display(df_events.head())
print(df_events['event_description'].unique())


#           ------------------------------------------------------------        


# read in qualifier list
path_data = os.path.join(os.path.dirname(current_directory),'data')
qualifier_names = pd.read_csv(os.path.join(path_data,"qualifier_names.csv"))

# explode coverts each element in each list to a separate row
cols = ['id', 'qualifier']
qualifiers = events_all[cols].explode('qualifier')
display(qualifiers.head())

print("------------")

qualifiers = qualifiers[qualifiers.qualifier.notna()].reset_index(drop=True)
print(qualifiers.shape)
print("------------")
display(qualifiers.head())
print("------------")

# save corresponding event ids for each qualifier
event_ids = qualifiers.id.tolist()

qualifiers = pd.json_normalize(qualifiers[qualifiers.qualifier.notna()]['qualifier'])
print(qualifiers.shape)
print("------------")
display(qualifiers.head())
print("------------")

qualifiers['event_id'] = event_ids
display(qualifiers.head())
print("------------")
qualifiers = qualifiers.merge(qualifier_names, how='left', on='qualifierId')
display(qualifiers.head())

c:\Users\Gabriel\OneDrive\Escritorio\SportsAnalyticsCourse\OptaForum\OptaChallenge_Clustering_Player_Styles\data\tracking_set_0
c:\Users\Gabriel\OneDrive\Escritorio\SportsAnalyticsCourse\OptaForum\OptaChallenge_Clustering_Player_Styles\data\first_10_events


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


Unnamed: 0,id,eventId,event_type_id,current_phase,period_minute,period_second,contestantId,outcome,x,y,lastModified,qualifier,playerId,lineBreakingPass.linesBroken.value,passOption.player,passTarget.player,xThreat.applied,lineBreakingPass.lastLineBroken.value,pressure.pressureReceived.value,pressure.player,xThreat.removed,keyPass,assist,timeelapsed,event_description
2,2423549045,2,32,1,0,0,3c3jcs7vc1t6vz5lev162jyv7,1,0.0,0.0,2022-05-22T03:17:52Z,"[{'id': 3586084711, 'qualifierId': 127, 'value...",,,,,,,,,,,,0.0,Period start
3,2423549041,2,32,1,0,0,bx0cdmzr2gwr70ez72dorx82p,1,0.0,0.0,2022-05-21T18:59:34Z,"[{'id': 3586084701, 'qualifierId': 127, 'value...",,,,,,,,,,,,0.0,Period start
4,2423549063,3,1,1,0,0,bx0cdmzr2gwr70ez72dorx82p,1,49.9,50.0,2022-05-22T03:34:41Z,"[{'id': 3586084825, 'qualifierId': 56, 'value'...",6u2ob6fv950r1qve8uejkq2uh,,,,,,,,,,,0.04,Pass
5,2423549097,4,1,1,0,2,bx0cdmzr2gwr70ez72dorx82p,1,31.5,57.2,2022-05-22T06:37:07Z,"[{'id': 3586085043, 'qualifierId': 213, 'value...",azuc3tma44xyrbgf5y279o1xx,0.0,"[{'playerId': 'e3kdoxu1kwn2w3wwi1rqhvr9x', 'sh...","[{'playerId': '7sep6mx2s67mh5fr3raxu7aei', 'sh...",0.0029771626,,,,,,,2.84,Pass
6,2423549113,5,1,1,0,7,bx0cdmzr2gwr70ez72dorx82p,1,49.2,95.4,2022-05-22T06:37:06Z,"[{'id': 3586085129, 'qualifierId': 212, 'value...",7sep6mx2s67mh5fr3raxu7aei,1.0,"[{'playerId': '5qgc6zjc38a5xjl35gs7h3vu1', 'sh...","[{'playerId': 'e3kdoxu1kwn2w3wwi1rqhvr9x', 'sh...",0.0309752524,secondToLast,high,"[{'playerId': 'e6ok0deqkoe80184iu509gzu2', 'sh...",,,,7.88,Pass


['Period start' 'Pass' 'Take On' 'Challenge' 'Blocked Pass'
 'Ball recovery' 'Attempted Tackle' 'Out' 'Ball touch' '50/50'
 'Dispossessed' 'Tackle' 'Corner Awarded' 'Clearance' 'Offside Pass'
 'Offside provoked' 'Foul' 'Aerial' 'Keeper pick-up' 'Deleted event'
 'Interception' 'Error' 'Goal' 'Attempt Saved' 'Save' 'Miss' 'Claim'
 'Card' 'Start delay' 'End delay' 'Referee Drop Ball' nan 'End'
 'Player Off' 'Player on' 'Formation change' 'Keeper Sweeper'
 'Shield ball opp']


Unnamed: 0,id,qualifier
2,2423549045,"{'id': 3586084711, 'qualifierId': 127, 'value'..."
3,2423549041,"{'id': 3586084701, 'qualifierId': 127, 'value'..."
4,2423549063,"{'id': 3586084825, 'qualifierId': 56, 'value':..."
4,2423549063,"{'id': 3586084833, 'qualifierId': 213, 'value'..."
4,2423549063,"{'id': 3586084827, 'qualifierId': 140, 'value'..."


------------
(9430, 2)
------------


Unnamed: 0,id,qualifier
0,2423549045,"{'id': 3586084711, 'qualifierId': 127, 'value'..."
1,2423549041,"{'id': 3586084701, 'qualifierId': 127, 'value'..."
2,2423549063,"{'id': 3586084825, 'qualifierId': 56, 'value':..."
3,2423549063,"{'id': 3586084833, 'qualifierId': 213, 'value'..."
4,2423549063,"{'id': 3586084827, 'qualifierId': 140, 'value'..."


------------
(9430, 3)
------------


Unnamed: 0,id,qualifierId,value
0,3586084711,127,Right to Left
1,3586084701,127,Left to Right
2,3586084825,56,Back
3,3586084833,213,2.7
4,3586084827,140,28.5


------------


Unnamed: 0,id,qualifierId,value,event_id
0,3586084711,127,Right to Left,2423549045
1,3586084701,127,Left to Right,2423549041
2,3586084825,56,Back,2423549063
3,3586084833,213,2.7,2423549063
4,3586084827,140,28.5,2423549063


------------


Unnamed: 0,id,qualifierId,value,event_id,qualifier
0,3586084711,127,Right to Left,2423549045,Direction of Play
1,3586084701,127,Left to Right,2423549041,Direction of Play
2,3586084825,56,Back,2423549063,Zone
3,3586084833,213,2.7,2423549063,Angle
4,3586084827,140,28.5,2423549063,Pass End X


Number of passes attempted per player

In [8]:
def event_per_player(df,event):
    df = df[df['playerId'].notna()]
    try:
        df = df[df['event_description']==event]
        return df.groupby('playerId').size().reset_index(name=event)
    except:
        return False

df = event_per_player(df_events,'Pass')
print(df)

                     playerId  Pass
0   2lvit204llltk13iglsa2tjah     2
1   3sc349yey596xp2j6xlyt0frp    54
2   3vx94h32ahujciraspdayj9t6    19
3   4u281v53ges3kimtgac0tidm2    52
4   5ak9fwtqlr2pll0nsv5br7p7u    20
5   5qgc6zjc38a5xjl35gs7h3vu1    36
6   6ekdnbnk56xlxforb5owt3dn9    47
7   6j0ogojh2b7poyceg7i3k09yi    64
8   6u2ob6fv950r1qve8uejkq2uh    59
9   72d5uxwcmvhd6mzthxuvev1sl    38
10  7cp51c8zn7y08iyk0hc9ix5nt    66
11  7k0r5crdh9blj3edt31zwy0dm     1
12  7sep6mx2s67mh5fr3raxu7aei    42
13  8f3bhiy6r5eei1n25exhbwr8p    22
14  8gkexxgf3pypshhqwg6ibp7o4    35
15  8qmm84tue6kuz8e5nhhdhmz8p    41
16  96wcx761pzv5ub4sfwsynp51x    54
17  976riwm0dz0e74d4l28y3ttcl    55
18  a56woizbe4g6jpl3fg4tlgno5    24
19   afymbx9eo87zau8mo99pakbu    41
20  agwvouyocx93y39g7tmwaojx1     5
21  azuc3tma44xyrbgf5y279o1xx    41
22  bvbebtykj45j3luvemk8yc4ph    41
23  ccu7hw3wrcspl1a18g2ldnsh5     6
24  dxb1r4gqgxkngb0pzvfby9iol    13
25  e3kdoxu1kwn2w3wwi1rqhvr9x    14
26  e6ok0deqkoe80184iu509gzu

Xthreat 

- 764/1792 values
- 763 pass, 1 offside pass
- Mean values

In [21]:
xthreat = df_events[df_events['xThreat.applied'].notna()]
print(xthreat['event_description'].value_counts())

xthreat = xthreat[xthreat['playerId'].notna()]
xthreat['xThreat.applied'] = pd.to_numeric(xthreat['xThreat.applied'])
xthreat.groupby('playerId')['xThreat.applied'].mean().reset_index(name='xthreat')

Pass            763
Offside Pass      1
Name: event_description, dtype: int64


Unnamed: 0,playerId,xthreat
0,3sc349yey596xp2j6xlyt0frp,0.017285
1,3vx94h32ahujciraspdayj9t6,0.027412
2,4u281v53ges3kimtgac0tidm2,0.014667
3,5ak9fwtqlr2pll0nsv5br7p7u,0.099773
4,5qgc6zjc38a5xjl35gs7h3vu1,0.079592
5,6ekdnbnk56xlxforb5owt3dn9,0.000711
6,6j0ogojh2b7poyceg7i3k09yi,0.046613
7,6u2ob6fv950r1qve8uejkq2uh,0.095785
8,72d5uxwcmvhd6mzthxuvev1sl,0.006289
9,7cp51c8zn7y08iyk0hc9ix5nt,0.008418


Total short and long passes and mean distance of pass by player
- Esta comprovat que tot els pases tenen un qualifier que es length
- Tots els pases tenen un playerId

In [52]:
df_passes = df_events[df_events['event_description']=='Pass']
print(df_passes.shape)
print(len(df_passes['id'].unique()))

# ---------- MEAN DISTANCE PER PLAYER --------------------
df_completo = pd.merge(df_passes, qualifiers[qualifiers['qualifier'] == 'Length'], left_on = 'id',right_on='event_id')
df_completo['value'] = pd.to_numeric(df_completo['value'])
display(df_completo.groupby('playerId')['value'].mean().reset_index(name='mean_distance'))

# ---------- TOTAL SHORT AND LONG PASSES PER PLAYER --------------------

df_completo['long'] = df_completo['value']>25

count = df_completo.groupby('playerId')['long'].value_counts().unstack(fill_value=0)

count.columns = ['Short passes', 'Long passes']
display(count)


(1024, 25)
1024


Unnamed: 0,playerId,mean_distance
0,2lvit204llltk13iglsa2tjah,24.8
1,3sc349yey596xp2j6xlyt0frp,16.642593
2,3vx94h32ahujciraspdayj9t6,12.226316
3,4u281v53ges3kimtgac0tidm2,17.028846
4,5ak9fwtqlr2pll0nsv5br7p7u,18.615
5,5qgc6zjc38a5xjl35gs7h3vu1,18.286111
6,6ekdnbnk56xlxforb5owt3dn9,29.331915
7,6j0ogojh2b7poyceg7i3k09yi,15.4125
8,6u2ob6fv950r1qve8uejkq2uh,16.323729
9,72d5uxwcmvhd6mzthxuvev1sl,20.926316


Unnamed: 0_level_0,Short passes,Long passes
playerId,Unnamed: 1_level_1,Unnamed: 2_level_1
2lvit204llltk13iglsa2tjah,1,1
3sc349yey596xp2j6xlyt0frp,44,10
3vx94h32ahujciraspdayj9t6,18,1
4u281v53ges3kimtgac0tidm2,43,9
5ak9fwtqlr2pll0nsv5br7p7u,14,6
5qgc6zjc38a5xjl35gs7h3vu1,28,8
6ekdnbnk56xlxforb5owt3dn9,24,23
6j0ogojh2b7poyceg7i3k09yi,55,9
6u2ob6fv950r1qve8uejkq2uh,49,10
72d5uxwcmvhd6mzthxuvev1sl,31,7


Number of shots attempted (Goal + miss)

In [77]:
df_shots = df_events[df_events['event_description'].isin(['Goal','Miss'])]

df_shots.groupby('playerId').size().reset_index(name='shots')

Unnamed: 0,playerId,shots
0,3vx94h32ahujciraspdayj9t6,1
1,4u281v53ges3kimtgac0tidm2,1
2,6j0ogojh2b7poyceg7i3k09yi,1
3,6u2ob6fv950r1qve8uejkq2uh,1
4,8gkexxgf3pypshhqwg6ibp7o4,1
5,8qmm84tue6kuz8e5nhhdhmz8p,1
6,e3kdoxu1kwn2w3wwi1rqhvr9x,4


Line Breaking Passes