# This is the first step in the project.

Motivation: Combine the JSON files: Competitions, Matches, Lineups and Events into four final dataframes.

Final Output: Four pickle files each representing one of the four final dataframes.

In [1]:
# Import required libraries
import os
import pandas as pd
import json
from tqdm import tqdm
from numpyencoder import NumpyEncoder
from utils import merge_JsonFiles, merge_JsonFiles_with_lookup
pd.options.mode.chained_assignment = None

In [2]:
# File paths
# Each directory level is passed to os.path.join separately so the path is built
# with the platform's own separator; a hard-coded backslash only resolves on
# Windows, and '\o' inside a normal string literal is an invalid escape sequence.
ROOT = os.path.join(os.getcwd(), 'Statsbomb_data', 'open-data-master')
DATA = os.path.join(ROOT, 'data')
EVENTS = os.path.join(DATA, 'events')
LINEUPS = os.path.join(DATA, 'lineups')
MATCHES = os.path.join(DATA, 'matches')
THREESIXTY = os.path.join(DATA, 'three-sixty')

# Competitions

In [3]:
''' Load the competitions JSON file, which holds the competition-level details. '''
competitions_path = os.path.join(DATA, 'competitions.json')
competitions = pd.read_json(path_or_buf=competitions_path, orient='records')
competitions.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2023-08-17T23:51:11.837478,,,2023-08-17T23:51:11.837478
1,16,4,Europe,Champions League,male,False,False,2018/2019,2023-03-07T12:20:48.118250,2021-06-13T16:17:31.694,,2023-03-07T12:20:48.118250
2,16,1,Europe,Champions League,male,False,False,2017/2018,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2021-01-23T21:55:30.425330
3,16,2,Europe,Champions League,male,False,False,2016/2017,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
4,16,27,Europe,Champions League,male,False,False,2015/2016,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00


In [4]:
''' List every competition name present in the data.

Variables:-
competitions - DataFrame - dataframe containing competition data.
'''
print("UNIQUE COMPETITIONS:\n")
[str(name) for name in competitions.competition_name.unique()]

UNIQUE COMPETITIONS:



['1. Bundesliga',
 'Champions League',
 'Copa del Rey',
 "FA Women's Super League",
 'FIFA U20 World Cup',
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'Liga Profesional',
 'Ligue 1',
 'North American League',
 'NWSL',
 'Premier League',
 'Serie A',
 'UEFA Euro',
 'UEFA Europa League',
 "UEFA Women's Euro",
 "Women's World Cup"]

In [5]:
''' Project scope: the top 5 European domestic leagues plus the Champions League. '''
top_5_leagues = [
    '1. Bundesliga',
    'La Liga',
    'Premier League',
    'Serie A',
    'Ligue 1',
    'Champions League',
]

In [6]:
''' Keep only the selected competitions and shorten the Bundesliga name.

The filter and the rename are chained into one expression that returns a new
dataframe, so nothing is mutated in place and re-running the cell always
produces the same result.

Variables:-
competitions - DataFrame - dataframe containing competition data
top_5_leagues - List - selected competitions
'''
competitions_f1 = (
    competitions
    .loc[competitions.competition_name.isin(top_5_leagues)]
    # Every cell equal to '1. Bundesliga' becomes 'Bundesliga'.
    .replace(to_replace='1. Bundesliga', value='Bundesliga')
)

In [7]:
''' Check which genders appear in the filtered data. The project covers men's soccer only.

Variables:-
competitions_f1 - DataFrame - filtered competition data.
'''
print("UNIQUE GENDERS:\n")
list(competitions_f1.competition_gender.unique())

UNIQUE GENDERS:



['male']

In [8]:
''' Check the youth flag of the filtered data. The project covers adult professionals only.

Variables:-
competitions_f1 - DataFrame - filtered competition data.
'''
print("UNIQUE COMPETITION YOUTH:\n")
list(competitions_f1.competition_youth.unique())

UNIQUE COMPETITION YOUTH:



[False]

In [9]:
''' List the seasons contained in the filtered data.

Variables:-
competitions_f1 - DataFrame - filtered competition data.
'''
print("SEASON RANGE:\n")
list(competitions_f1.season_name.unique())

SEASON RANGE:



['2015/2016',
 '2018/2019',
 '2017/2018',
 '2016/2017',
 '2014/2015',
 '2013/2014',
 '2012/2013',
 '2011/2012',
 '2010/2011',
 '2009/2010',
 '2008/2009',
 '2006/2007',
 '2004/2005',
 '2003/2004',
 '1999/2000',
 '1972/1973',
 '1971/1972',
 '1970/1971',
 '2020/2021',
 '2019/2020',
 '2007/2008',
 '2005/2006',
 '1973/1974',
 '1986/1987']

In [10]:
''' Restrict the data to the 5 seasons from 2016/2017 to 2020/2021.

Variables:-
competitions_f1 - DataFrame - filtered competition data.
last_5_seasons - List - season names to keep
'''
last_5_seasons = [
    '2020/2021',
    '2019/2020',
    '2018/2019',
    '2017/2018',
    '2016/2017',
]
in_last_5_seasons = competitions_f1.season_name.isin(last_5_seasons)
competitions_f2 = competitions_f1.loc[in_last_5_seasons]

In [11]:
''' Check how many rows (and columns) remain after keeping only the last 5 seasons.

Variables:-
competitions_f2 - DataFrame - filtered competition and season data.
'''
competitions_f2.shape

(8, 12)

We only have 8 rows left, which means that not all competitions have data available for the 5 most recent seasons.

Let's see which competitions have latest data.

In [12]:
''' List the competitions that have data in the 5 most recent seasons.

Variables:-
competitions_f2 - DataFrame - filtered competition and season data.
'''
print("COMPETITIONS THAT HAVE LAST 5 YEARS' DATA:\n")
list(competitions_f2.competition_name.unique())

COMPETITIONS THAT HAVE LAST 5 YEARS' DATA:



['Champions League', 'La Liga']

In [13]:
''' Print the seasons available for the Champions League and for La Liga.

Variables:-
competitions_f2 - DataFrame - filtered competition and season data.
'''
for heading, competition_name in (
    ("CHAMPIONS LEAGUE SEASONS:\n", 'Champions League'),
    ("LA LIGA SEASONS:\n", 'La Liga'),
):
    print(heading)
    print(competitions_f2.loc[competitions_f2.competition_name == competition_name].season_name)

CHAMPIONS LEAGUE SEASONS:

1    2018/2019
2    2017/2018
3    2016/2017
Name: season_name, dtype: object
LA LIGA SEASONS:

35    2020/2021
36    2019/2020
37    2018/2019
38    2017/2018
39    2016/2017
Name: season_name, dtype: object


### Final competitions data

Restricting to the last 5 seasons of the top 5 leagues and the Champions League yields very little data.

Hence, I have decided to go with La Liga and Champions league with all their seasons.

# Matches

The matches folder is categorized by competition ids, containing json files for a full list of matches.

Matches

    --- comp_id_1

        --- match_id_1.json

        --- match_id_2.json

        --- match_id_3.json

        --- ...

    --- comp_id_2

        --- match_id_20.json

        --- match_id_21.json

        --- ...
        
    --- ...

In [14]:
''' List the competition ids of the selected competitions.

Variables:-
competitions_f2 - DataFrame - filtered competition and season data.
'''
print("Competition ids from filtered competitions dataframe:\n")
list(competitions_f2.competition_id.unique())

Competition ids from filtered competitions dataframe:



[16, 11]

In [15]:
''' Merge all JSON files of each selected competition into one file per competition.
Each merged file is written to the MATCHES folder as {comp_id}_all.json.

Variables:-
MATCHES - string - filepath to all match folders
selected_competitions - List - selected competitions' ids (as folder names)
root - string - folder holding one competition's JSON files
filenames - list - sorted list of the JSON files to be merged
output - string - path of the competition's merged file, {comp_id}_all.json
'''
selected_competitions = ['16','11']
for c in selected_competitions:
    root = os.path.join(MATCHES, c)
    # os.listdir returns entries in arbitrary order and may contain non-JSON
    # files; keep only the JSON inputs and sort them so the merge is reproducible.
    filenames = sorted(name for name in os.listdir(root) if name.endswith('.json'))
    output = os.path.join(MATCHES, f'{c}_all.json')
    merge_JsonFiles(
        root=root,
        input_files=filenames,
        output_file=output
    )

In [17]:
''' Combine the per-competition merged files into a single final matches file.

Variables:-
MATCHES - string - filepath to all match folders
selected_competitions - List - selected competitions' ids
'''
per_competition_files = [f'{c}_all.json' for c in selected_competitions]
matches_final_output = os.path.join(MATCHES, 'matches_final.json')
merge_JsonFiles(root=MATCHES, input_files=per_competition_files, output_file=matches_final_output)

In [20]:
''' Read the final merged matches file into a dataframe.

Variables:-
MATCHES - string - filepath to all match folders
'''
matches_final_path = os.path.join(MATCHES, 'matches_final.json')
matches_df = pd.read_json(path_or_buf=matches_final_path)
matches_df.head(2)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,metadata,match_week,competition_stage,stadium,referee
0,18245,2018-05-26,20:45:00.000,"{'competition_id': 16, 'country_name': 'Europe...","{'season_id': 1, 'season_name': '2017/2018'}","{'home_team_id': 220, 'home_team_name': 'Real ...","{'away_team_id': 24, 'away_team_name': 'Liverp...",3,1,available,scheduled,2021-01-23T21:55:30.425330,2021-06-13T16:17:31.694,"{'data_version': '1.1.0', 'shot_fidelity_versi...",7,"{'id': 26, 'name': 'Final'}","{'id': 4222, 'name': 'NSK Olimpiyskyi', 'count...","{'id': 727, 'name': 'Milorad Mažić', 'country'..."
1,18245,2018-05-26,20:45:00.000,"{'competition_id': 16, 'country_name': 'Europe...","{'season_id': 1, 'season_name': '2017/2018'}","{'home_team_id': 220, 'home_team_name': 'Real ...","{'away_team_id': 24, 'away_team_name': 'Liverp...",3,1,available,scheduled,2021-01-23T21:55:30.425330,2021-06-13T16:17:31.694,"{'data_version': '1.1.0', 'shot_fidelity_versi...",7,"{'id': 26, 'name': 'Final'}","{'id': 4222, 'name': 'NSK Olimpiyskyi', 'count...","{'id': 727, 'name': 'Milorad Mažić', 'country'..."


In [21]:
''' Drop the columns that are not needed from the match dataframe.

The result is assigned back to matches_df instead of being dropped in place, and
already-missing columns are ignored, so the cell can be re-run without a KeyError.

Variables:-
matches_df - DataFrame - match data
unused_columns - List - names of the columns to remove
'''
unused_columns = [
    'kick_off',
    'match_status',
    'match_status_360',
    'last_updated',
    'last_updated_360',
    'metadata',
    'stadium',
    'referee',
]
matches_df = matches_df.drop(columns=unused_columns, errors='ignore')

In [37]:
''' Keep a single row per match id (the first occurrence).

Variables:-
matches_df - DataFrame - match data
'''
matches_df = matches_df.drop_duplicates(subset=['match_id'])

In [38]:
''' Show the (rows, columns) shape of the match data; each row is one match.

Variables:-
matches_df - DataFrame - match data
'''
matches_df.shape

(885, 12)

In [24]:
''' Pull the competition id and the season id out of their dictionary columns
and store each one in a separate column.

Variables:-
matches_df - DataFrame - match data
'''
for dict_column, id_key in (('competition', 'competition_id'), ('season', 'season_id')):
    matches_df[id_key] = matches_df[dict_column].str[id_key]

### Matches final data

Match level data of all seasons of La Liga and Champions League is combined.

There are in total 885 matches.

# Lineups

In [39]:
''' Collect the unique match ids of the match dataframe as an array.

Variables:-
matches_df - DataFrame - match data
'''
match_ids = matches_df['match_id'].unique()

In [40]:
''' Create a dictionary mapping each lineup filename to its parsed JSON content.
Loading everything once up front makes the merging of lineup files faster.

Variables:-
LINEUPS - string - path to all lineup folders
lineup_filenames - list - JSON lineup files found in LINEUPS
lineups_dict - dict - mapping of filename to loaded JSON content
'''
# Keep only JSON files, and skip 'lineups_final.json': the merged output is
# written into this same folder, so a re-run would otherwise load it as input.
lineup_filenames = [
    name for name in os.listdir(LINEUPS)
    if name.endswith('.json') and name != 'lineups_final.json'
]
lineups_dict = {}
for lineup_filename in tqdm(lineup_filenames):
    with open(os.path.join(LINEUPS, lineup_filename), 'r', encoding="utf8") as l_file:
        lineups_dict[lineup_filename] = json.load(l_file)

100%|██████████| 3200/3200 [00:02<00:00, 1271.58it/s]


In [41]:
''' Merge the lineup files of the selected matches into one file.

Variables:-
LINEUPS - string - path to all lineup folders
match_ids - array - unique match ids
lineups_dict - dict - mapping of all json filenames and their respective content.
'''

lineups_final_output = os.path.join(LINEUPS,'lineups_final.json')
merge_JsonFiles_with_lookup(lookup_ids=match_ids, output_file=lineups_final_output, lookup_dict=lineups_dict)

In [42]:
''' Read the consolidated lineup file into a dataframe.

Variables:-
LINEUPS - string - path to all lineup folders
'''
lineups_final_path = os.path.join(LINEUPS,'lineups_final.json')
lineups_df = pd.read_json(path_or_buf=lineups_final_path)
lineups_df.head()

Unnamed: 0,team_id,team_name,lineup,match_id
0,24,Liverpool,"[{'player_id': 3471, 'player_name': 'Dejan Lov...",18245
1,220,Real Madrid,"[{'player_id': 4926, 'player_name': 'Francisco...",18245
2,220,Real Madrid,"[{'player_id': 3063, 'player_name': 'Danilo Lu...",18244
3,224,Juventus,"[{'player_id': 3951, 'player_name': 'Mario Lem...",18244
4,238,Inter Milan,"[{'player_id': 3371, 'player_name': 'Mario Bar...",18235


In [43]:
# Show the (rows, columns) shape of the consolidated lineups dataframe.
lineups_df.shape

(1770, 4)

### Lineups final data

Each match contains two sets of lineups. One for home team and one for away team.

Lineups data has 1770 rows, which is exactly double the number of matches.

# Events

In [44]:
''' Gather the events of every selected match. The data holds many event types
(pass, intercept, dribble, etc.), but only events named 'Shot' are kept. Each
kept event is tagged with the id of the match it belongs to.

Variables:-
match_ids - array - unique match ids
EVENTS - string - file path to all events data
final_json - list - collected 'Shot' events of all matches
'''

final_json = []

for match_id in tqdm(match_ids):
    events_path = os.path.join(EVENTS,f'{match_id}.json')
    with open(events_path, 'r', encoding="utf8") as events_file:
        match_events = json.load(events_file)
    for event in match_events:
        if event['type']['name'] != 'Shot':
            continue
        event['match_id'] = match_id
        final_json.append(event)


100%|██████████| 885/885 [01:19<00:00, 11.09it/s]


In [47]:
''' Build a dataframe from the collected shot events.

Variables:-
final_json - list - list of the 'Shot' event records gathered from all events json files
'''

events_df = pd.DataFrame.from_dict(data=final_json)
events_df.head()

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,player,position,location,duration,related_events,shot,match_id,under_pressure,out,off_camera
0,682270cc-4bc4-4952-8f91-d3c5a704a691,153,1,00:02:33.865,2,33,"{'id': 16, 'name': 'Shot'}",7,"{'id': 24, 'name': 'Liverpool'}","{'id': 4, 'name': 'From Throw In'}",...,"{'id': 3567, 'name': 'Georginio Wijnaldum'}","{'id': 13, 'name': 'Right Center Midfield'}","[98.1, 52.1]",0.134619,"[5df2374f-5956-4ae5-a855-769c3d9bd8e9, 91c0ba0...","{'statsbomb_xg': 0.03639871, 'end_location': [...",18245,,,
1,9f5aa3eb-3bed-4bc0-97a5-bb8444b235b9,245,1,00:05:14.065,5,14,"{'id': 16, 'name': 'Shot'}",13,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 3531, 'name': 'Mohamed Salah'}","{'id': 17, 'name': 'Right Wing'}","[96.6, 51.5]",0.092997,"[0801d693-6cd2-4565-b9f7-8e57215326dc, 4cc5f82...","{'statsbomb_xg': 0.027975515, 'end_location': ...",18245,True,,
2,399ac143-5f7b-4080-8c0b-3c18435d7fc1,398,1,00:07:56.816,7,56,"{'id': 16, 'name': 'Shot'}",19,"{'id': 24, 'name': 'Liverpool'}","{'id': 1, 'name': 'Regular Play'}",...,"{'id': 3535, 'name': 'Roberto Firmino Barbosa ...","{'id': 23, 'name': 'Center Forward'}","[112.1, 30.6]",1.006248,[0ed95d27-7799-4809-94be-14b2af0e8199],"{'statsbomb_xg': 0.07536108, 'end_location': [...",18245,,,
3,660d9d98-46b6-4b5e-9c9a-435d63142c93,467,1,00:10:02.932,10,2,"{'id': 16, 'name': 'Shot'}",24,"{'id': 220, 'name': 'Real Madrid'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 5552, 'name': 'Marcelo Vieira da Silva ...","{'id': 6, 'name': 'Left Back'}","[94.5, 26.9]",1.301567,[9eee50d1-2956-4bd5-9417-4faa3cdcfec6],"{'statsbomb_xg': 0.023375953, 'end_location': ...",18245,,,
4,fe6c7f60-2ff0-4077-882e-b045c8abc7c3,628,1,00:13:02.999,13,2,"{'id': 16, 'name': 'Shot'}",28,"{'id': 24, 'name': 'Liverpool'}","{'id': 3, 'name': 'From Free Kick'}",...,"{'id': 3473, 'name': 'James Philip Milner'}","{'id': 15, 'name': 'Left Center Midfield'}","[101.2, 30.6]",0.054247,"[62488dda-14cf-4155-90c2-a4231a54de32, 70c57f0...","{'statsbomb_xg': 0.0437719, 'end_location': [1...",18245,,,


In [46]:
''' Show the (rows, columns) shape of the events dataframe; each row is one 'Shot' event.

Variables:-
events_df - DataFrame - events dataframe.
'''
events_df.shape

(21777, 21)

### Events final data

A dataframe of all shot events that occurred in the 885 matches. Each row represents a particular shot that happened in a particular match.

There are a total of 21777 shot events.

# Export all dataframes

All dataframes are exported to pickle files. Pickle is chosen over CSV because columns holding dictionaries are converted to strings when exported to CSV, whereas pickle preserves the dict type, so the values are still dictionaries when the pickle file is loaded again. This is very helpful.

In [48]:
# Export the four final dataframes as pickle files under ROOT/final_dataframes.
FINAL_DF = os.path.join(ROOT,'final_dataframes')
# Create the export folder if it is missing; writing a pickle into a
# non-existent directory fails.
os.makedirs(FINAL_DF, exist_ok=True)
# NOTE(review): competitions_f2 only holds the 2016/2017-2020/2021 seasons, while
# the narrative says all La Liga / Champions League seasons are used - confirm
# that this is the competitions frame intended for export.
competitions_f2.to_pickle(os.path.join(FINAL_DF, 'competitions.pkl'))
matches_df.to_pickle(os.path.join(FINAL_DF, 'matches.pkl'))
lineups_df.to_pickle(os.path.join(FINAL_DF, 'lineups.pkl'))
events_df.to_pickle(os.path.join(FINAL_DF, 'events.pkl'))