In [1]:
import pandas as pd
import json
import os
import numpy as np

pd.options.display.max_columns = 999

In [2]:
# load tracking data
current_directory = os.getcwd()
path_tracking = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"tracking_set_0")
print(path_tracking)
game_id = 1

df_tracking = pd.read_parquet(f'{path_tracking}/{game_id}_tracking.parquet')

#           ------------------------------------------------------------        

# load events names
path_event_csv = os.path.join(os.path.dirname(current_directory),'data')
df_event_names = pd.read_csv(os.path.join(path_event_csv,'event_names.csv'))
dict_event_names = df_event_names.set_index('event_type_id').to_dict()['event_description']

# load event data
def load_event_data(file_name, base_path):
    # read in event file
    with open(f'{base_path}/{file_name}') as f:
        data=json.loads(f.read())

    f.close()
    
    # transform data into pandas dataframe
    df_events = pd.json_normalize(data['liveData']['event'])
    
    # preprocess event data and keep relevant information only

    # add timeelapsed to each event
    df_events['timestamp'] = pd.to_datetime(df_events.timeStamp).apply(lambda x: x.timestamp())

    df_events = df_events.query('periodId in [1,2]')

    def add_timeelapsed_to_events(df):
        start_time = df.query('typeId==32')['timestamp'].iloc[0]
        df['timestamp_new'] = np.int64((df['timestamp'] - start_time)*1000)

        df['timeelapsed'] = df['timestamp_new'].apply(lambda x: (40 * round(x/40))/1000)

        return df

    df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)

    df_events = df_events.drop(columns=['timeStamp','timestamp','timestamp_new'])
    
    # rename some columns
    df_events = df_events.rename(columns=
        {
            'periodId':'current_phase',
            'typeId':'event_type_id',
            'timeMin':'period_minute',
            'timeSec':'period_second'
        }
    )
    
    return df_events

path_events = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"first_10_events")
print(path_events)

event_file = f'{game_id}.json'

df_events = load_event_data(
    base_path=path_events,
    file_name=event_file
)

# add event descriptions
df_events['event_description'] = df_events['event_type_id'].map(dict_event_names)

# make a copy of it for later usage
events_all = df_events.copy()

display(df_events.head())
print(df_events['event_description'].unique())


#           ------------------------------------------------------------        


# read in qualifier list
path_data = os.path.join(os.path.dirname(current_directory),'data')
qualifier_names = pd.read_csv(os.path.join(path_data,"qualifier_names.csv"))

# explode coverts each element in each list to a separate row
cols = ['id', 'qualifier']
qualifiers = events_all[cols].explode('qualifier')
display(qualifiers.head())

print("------------")

qualifiers = qualifiers[qualifiers.qualifier.notna()].reset_index(drop=True)
print(qualifiers.shape)
print("------------")
display(qualifiers.head())
print("------------")

# save corresponding event ids for each qualifier
event_ids = qualifiers.id.tolist()

qualifiers = pd.json_normalize(qualifiers[qualifiers.qualifier.notna()]['qualifier'])
print(qualifiers.shape)
print("------------")
display(qualifiers.head())
print("------------")

qualifiers['event_id'] = event_ids
display(qualifiers.head())
print("------------")
qualifiers = qualifiers.merge(qualifier_names, how='left', on='qualifierId')
display(qualifiers.head())

c:\Users\Gabriel\OneDrive\Escritorio\SportsAnalyticsCourse\OptaForum\OptaChallenge_Clustering_Player_Styles\data\tracking_set_0
c:\Users\Gabriel\OneDrive\Escritorio\SportsAnalyticsCourse\OptaForum\OptaChallenge_Clustering_Player_Styles\data\first_10_events


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


Unnamed: 0,id,eventId,event_type_id,current_phase,period_minute,period_second,contestantId,outcome,x,y,lastModified,qualifier,playerId,lineBreakingPass.linesBroken.value,passOption.player,passTarget.player,xThreat.applied,lineBreakingPass.lastLineBroken.value,pressure.pressureReceived.value,pressure.player,xThreat.removed,keyPass,assist,timeelapsed,event_description
2,2423549045,2,32,1,0,0,3c3jcs7vc1t6vz5lev162jyv7,1,0.0,0.0,2022-05-22T03:17:52Z,"[{'id': 3586084711, 'qualifierId': 127, 'value...",,,,,,,,,,,,0.0,Period start
3,2423549041,2,32,1,0,0,bx0cdmzr2gwr70ez72dorx82p,1,0.0,0.0,2022-05-21T18:59:34Z,"[{'id': 3586084701, 'qualifierId': 127, 'value...",,,,,,,,,,,,0.0,Period start
4,2423549063,3,1,1,0,0,bx0cdmzr2gwr70ez72dorx82p,1,49.9,50.0,2022-05-22T03:34:41Z,"[{'id': 3586084825, 'qualifierId': 56, 'value'...",6u2ob6fv950r1qve8uejkq2uh,,,,,,,,,,,0.04,Pass
5,2423549097,4,1,1,0,2,bx0cdmzr2gwr70ez72dorx82p,1,31.5,57.2,2022-05-22T06:37:07Z,"[{'id': 3586085043, 'qualifierId': 213, 'value...",azuc3tma44xyrbgf5y279o1xx,0.0,"[{'playerId': 'e3kdoxu1kwn2w3wwi1rqhvr9x', 'sh...","[{'playerId': '7sep6mx2s67mh5fr3raxu7aei', 'sh...",0.0029771626,,,,,,,2.84,Pass
6,2423549113,5,1,1,0,7,bx0cdmzr2gwr70ez72dorx82p,1,49.2,95.4,2022-05-22T06:37:06Z,"[{'id': 3586085129, 'qualifierId': 212, 'value...",7sep6mx2s67mh5fr3raxu7aei,1.0,"[{'playerId': '5qgc6zjc38a5xjl35gs7h3vu1', 'sh...","[{'playerId': 'e3kdoxu1kwn2w3wwi1rqhvr9x', 'sh...",0.0309752524,secondToLast,high,"[{'playerId': 'e6ok0deqkoe80184iu509gzu2', 'sh...",,,,7.88,Pass


['Period start' 'Pass' 'Take On' 'Challenge' 'Blocked Pass'
 'Ball recovery' 'Attempted Tackle' 'Out' 'Ball touch' '50/50'
 'Dispossessed' 'Tackle' 'Corner Awarded' 'Clearance' 'Offside Pass'
 'Offside provoked' 'Foul' 'Aerial' 'Keeper pick-up' 'Deleted event'
 'Interception' 'Error' 'Goal' 'Attempt Saved' 'Save' 'Miss' 'Claim'
 'Card' 'Start delay' 'End delay' 'Referee Drop Ball' nan 'End'
 'Player Off' 'Player on' 'Formation change' 'Keeper Sweeper'
 'Shield ball opp']


Unnamed: 0,id,qualifier
2,2423549045,"{'id': 3586084711, 'qualifierId': 127, 'value'..."
3,2423549041,"{'id': 3586084701, 'qualifierId': 127, 'value'..."
4,2423549063,"{'id': 3586084825, 'qualifierId': 56, 'value':..."
4,2423549063,"{'id': 3586084833, 'qualifierId': 213, 'value'..."
4,2423549063,"{'id': 3586084827, 'qualifierId': 140, 'value'..."


------------
(9430, 2)
------------


Unnamed: 0,id,qualifier
0,2423549045,"{'id': 3586084711, 'qualifierId': 127, 'value'..."
1,2423549041,"{'id': 3586084701, 'qualifierId': 127, 'value'..."
2,2423549063,"{'id': 3586084825, 'qualifierId': 56, 'value':..."
3,2423549063,"{'id': 3586084833, 'qualifierId': 213, 'value'..."
4,2423549063,"{'id': 3586084827, 'qualifierId': 140, 'value'..."


------------
(9430, 3)
------------


Unnamed: 0,id,qualifierId,value
0,3586084711,127,Right to Left
1,3586084701,127,Left to Right
2,3586084825,56,Back
3,3586084833,213,2.7
4,3586084827,140,28.5


------------


Unnamed: 0,id,qualifierId,value,event_id
0,3586084711,127,Right to Left,2423549045
1,3586084701,127,Left to Right,2423549041
2,3586084825,56,Back,2423549063
3,3586084833,213,2.7,2423549063
4,3586084827,140,28.5,2423549063


------------


Unnamed: 0,id,qualifierId,value,event_id,qualifier
0,3586084711,127,Right to Left,2423549045,Direction of Play
1,3586084701,127,Left to Right,2423549041,Direction of Play
2,3586084825,56,Back,2423549063,Zone
3,3586084833,213,2.7,2423549063,Angle
4,3586084827,140,28.5,2423549063,Pass End X


In [5]:
df_events['contestantId'].unique()

array(['3c3jcs7vc1t6vz5lev162jyv7', 'bx0cdmzr2gwr70ez72dorx82p'],
      dtype=object)

In [9]:
games = [1,2,3,4,5,6,7,8,9,10]

for game_id in games:

    # load tracking data
    current_directory = os.getcwd()
    path_tracking = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"tracking_set_0")
    #print(path_tracking)

    df_tracking = pd.read_parquet(f'{path_tracking}/{game_id}_tracking.parquet')

    #           ------------------------------------------------------------        

    # load events names
    path_event_csv = os.path.join(os.path.dirname(current_directory),'data')
    df_event_names = pd.read_csv(os.path.join(path_event_csv,'event_names.csv'))
    dict_event_names = df_event_names.set_index('event_type_id').to_dict()['event_description']

    # load event data
    def load_event_data(file_name, base_path):
        # read in event file
        with open(f'{base_path}/{file_name}') as f:
            data=json.loads(f.read())

        f.close()
        
        # transform data into pandas dataframe
        df_events = pd.json_normalize(data['liveData']['event'])
        
        # preprocess event data and keep relevant information only

        # add timeelapsed to each event
        df_events['timestamp'] = pd.to_datetime(df_events.timeStamp).apply(lambda x: x.timestamp())

        df_events = df_events.query('periodId in [1,2]')

        def add_timeelapsed_to_events(df):
            start_time = df.query('typeId==32')['timestamp'].iloc[0]
            df['timestamp_new'] = np.int64((df['timestamp'] - start_time)*1000)

            df['timeelapsed'] = df['timestamp_new'].apply(lambda x: (40 * round(x/40))/1000)

            return df

        df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)

        df_events = df_events.drop(columns=['timeStamp','timestamp','timestamp_new'])
        
        # rename some columns
        df_events = df_events.rename(columns=
            {
                'periodId':'current_phase',
                'typeId':'event_type_id',
                'timeMin':'period_minute',
                'timeSec':'period_second'
            }
        )
        
        return df_events

    path_events = os.path.join(os.path.join(os.path.dirname(current_directory),'data'),"first_10_events")
    #print(path_events)

    event_file = f'{game_id}.json'

    df_events = load_event_data(
        base_path=path_events,
        file_name=event_file
    )

    # add event descriptions
    df_events['event_description'] = df_events['event_type_id'].map(dict_event_names)

    # make a copy of it for later usage
    events_all = df_events.copy()

    #display(df_events.head())
    #print(df_events['event_description'].unique())


    #           ------------------------------------------------------------        


    # read in qualifier list
    path_data = os.path.join(os.path.dirname(current_directory),'data')
    qualifier_names = pd.read_csv(os.path.join(path_data,"qualifier_names.csv"))

    # explode coverts each element in each list to a separate row
    cols = ['id', 'qualifier']
    qualifiers = events_all[cols].explode('qualifier')
    #display(qualifiers.head())

    #print("------------")

    qualifiers = qualifiers[qualifiers.qualifier.notna()].reset_index(drop=True)
    #print(qualifiers.shape)
    #print("------------")
    #display(qualifiers.head())
    #print("------------")

    # save corresponding event ids for each qualifier
    event_ids = qualifiers.id.tolist()

    qualifiers = pd.json_normalize(qualifiers[qualifiers.qualifier.notna()]['qualifier'])
    #print(qualifiers.shape)
    #print("------------")
    #display(qualifiers.head())
    #print("------------")
    print(df_events['contestantId'].unique())
    qualifiers['event_id'] = event_ids
    #display(qualifiers.head())
    print("------------")
    qualifiers = qualifiers.merge(qualifier_names, how='left', on='qualifierId')
    #display(qualifiers.head())

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['3c3jcs7vc1t6vz5lev162jyv7' 'bx0cdmzr2gwr70ez72dorx82p']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['121le8unjfzug3iu9pgkqa1c7' '98dr7jscv8adc8zgi2u403oij']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['2b3mar72yy8d6uvat1ka6tn3r' '4t4hod56fsj7utpjdor8so5q6']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['bx0cdmzr2gwr70ez72dorx82p' '4t4hod56fsj7utpjdor8so5q6']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['be2k34rut1lz79jxenabttqlc' '3xedluek08t2ka7oypwuullcn']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['27xvwccz8kpmqsefjv2b2sc0o' '2b3mar72yy8d6uvat1ka6tn3r']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['3c3jcs7vc1t6vz5lev162jyv7' '27xvwccz8kpmqsefjv2b2sc0o']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


['70nn27vgkt6l48lvv5e66q7ww' '27xvwccz8kpmqsefjv2b2sc0o']
------------
['4t4hod56fsj7utpjdor8so5q6' '2sc9xfhu6tbru9hhlhr4a89zn']
------------


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_events = df_events.groupby('periodId').apply(add_timeelapsed_to_events)


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\Gabriel\\OneDrive\\Escritorio\\SportsAnalyticsCourse\\OptaForum\\OptaChallenge_Clustering_Player_Styles\\data\\tracking_set_0/10_tracking.parquet'