# Data Preparation

This notebook downloads the open-source [Soccer match event dataset](https://figshare.com/collections/Soccer_match_event_dataset/4415000/2) and converts it to the [SPADL format](https://github.com/ML-KULeuven/socceraction). This dataset contains all spatio-temporal events (passes, shots, fouls, etc.) that occurred during all matches of the 2017/18 season of the top-5 European leagues (La Liga, Serie A, Bundesliga, Premier League, Ligue 1) as well as the FIFA World Cup 2018 and UEFA Euro 2016.

**Disclaimer**: this notebook is compatible with [v5 of the Soccer match event dataset](https://figshare.com/collections/Soccer_match_event_dataset/4415000/5) and the following package versions:

- tqdm 4.42.1
- requests 2.22.0
- pandas 1.0
- socceraction 0.1.1

In [1]:
import os; import sys;
import tqdm
import requests
import mimetypes
import zipfile
import math
import pandas as pd
pd.set_option('display.max_columns', None)

import socceraction.spadl as spadl
import socceraction.spadl.wyscout as wyscout

# Wyscout does not distinguish between headers and other body
# parts on shots. The socceraction converter simply labels all
# shots as performed by foot. I think it is better to label
# events that carry the 'head/body' tag as headers.
def determine_bodypart_id(event):
    """
    Map a Wyscout event to the id of the body part that performed it.

    The subtype id is checked first; only when it is not decisive is the
    event's 'head/body' flag consulted. Everything else counts as "foot".

    Args:
    event (pd.Series): Wyscout event Series
    Returns:
    int: id of the body part used for the action
    """
    subtype = event["subtype_id"]
    # presumably the Wyscout sub-events played with the hands -- verify
    # the ids against eventid2name.csv
    if subtype in (81, 36, 21, 90, 91):
        return spadl.config.bodyparts.index("other")
    # subtype 82 is always a header; otherwise rely on the 'head/body' flag
    if subtype == 82 or event['head/body']:
        return spadl.config.bodyparts.index("head")
    # all remaining events
    return spadl.config.bodyparts.index("foot")
# Monkey-patch the socceraction Wyscout module so that it picks up the
# custom body part logic defined above instead of its own implementation.
wyscout.determine_bodypart_id = determine_bodypart_id

## Configure folder names and download URLs

In [2]:
# Folder for the converted SPADL data and folder for the raw Wyscout files
spadl_datafolder = "../data/wyscout"
raw_datafolder = "../data/wyscout/raw"

# Make sure both folders exist, reporting each one that had to be created
for folder in (raw_datafolder, spadl_datafolder):
    if os.path.exists(folder):
        continue
    os.makedirs(folder, exist_ok=True)
    print(f"Directory {folder} created ")

In [3]:
# Download location of every file in the dataset, keyed by the base name
# under which the file is stored locally.
# Source: https://figshare.com/collections/Soccer_match_event_dataset/4415000/5
dataset_urls = {
    "eventid2name": "https://ndownloader.figshare.com/files/21385245",
    "tags2name": "https://ndownloader.figshare.com/files/21385239",
    "competitions": "https://ndownloader.figshare.com/files/15073685",
    "teams": "https://ndownloader.figshare.com/files/15073697",
    "coaches": "https://ndownloader.figshare.com/files/15073868",
    "referees": "https://ndownloader.figshare.com/files/15074030",
    "players": "https://ndownloader.figshare.com/files/15073721",
    "matches": "https://ndownloader.figshare.com/files/14464622",
    "events": "https://ndownloader.figshare.com/files/14464685",
}

## Download public WyScout data 

In [4]:
# Download every file listed in dataset_urls into raw_datafolder and unpack
# the zip archives next to the downloaded files.
for key, url in dataset_urls.items():
    print(f"Downloading {key}")
    # Streaming, so we can iterate over the response. The context manager
    # releases the connection even when the download fails part-way.
    with requests.get(url, stream=True) as r:
        # Fail loudly on HTTP errors instead of saving an error page as data.
        r.raise_for_status()
        # Drop optional parameters such as "; charset=utf-8"; with them the
        # extension cannot be guessed.
        content_type = r.headers.get("content-type", "").split(";")[0].strip()
        # guess_extension returns None for unknown types; fall back to no
        # extension rather than a file name ending in "None".
        extension = mimetypes.guess_extension(content_type) or ""

        # Total size in bytes (0 when the server does not report it).
        total_size = int(r.headers.get("content-length", 0))
        block_size = 1024 * 1024
        wrote = 0
        with open(f"{raw_datafolder}/{key}{extension}", "wb") as f:
            datastream = tqdm.tqdm(
                r.iter_content(block_size),
                # True division so that a trailing partial block is counted;
                # no total at all when the size is unknown.
                total=math.ceil(total_size / block_size) if total_size else None,
                unit="MB",
            )
            for data in datastream:
                wrote = wrote + len(data)
                f.write(data)
    # Report a size mismatch before extraction, so the message is visible
    # even if unpacking a truncated archive fails afterwards.
    if total_size != 0 and wrote != total_size:
        print("ERROR, something went wrong")
    if extension == ".zip":
        print(f"Extracting {key}")
        with zipfile.ZipFile(f"{raw_datafolder}/{key}.zip", 'r') as zipObj:
            zipObj.extractall(f"{raw_datafolder}")

print("Downloaded files:")
os.listdir(raw_datafolder)

Downloading eventid2name


1MB [00:00, 419.98MB/s]


Downloading tags2name


1MB [00:00, 397.49MB/s]


Downloading competitions


1MB [00:00, 419.77MB/s]


Downloading teams


1MB [00:00, 40.63MB/s]


Downloading coaches


1MB [00:00, 23.08MB/s]


Downloading referees


1MB [00:00, 13.24MB/s]


Downloading players


2MB [00:00, 10.69MB/s]                       


Downloading matches


1MB [00:00,  1.96MB/s]


Extracting matches
Downloading events


74MB [00:03, 21.59MB/s]                        


Extracting events
Downloaded files:


['matches_Italy.json',
 'events_Spain.json',
 'matches_World_Cup.json',
 'matches_Germany.json',
 'coaches.json',
 'eventid2name.csv',
 'matches_European_Championship.json',
 'events_England.json',
 'events_France.json',
 'teams.json',
 'matches_England.json',
 'events_World_Cup.json',
 'tags2name.csv',
 'matches_Spain.json',
 'events_European_Championship.json',
 'events_Italy.json',
 'matches_France.json',
 'events_Germany.json',
 'players.json',
 'competitions.json',
 'referees.json',
 'events.zip',
 'matches.zip']

## Select competitions to load and convert

In [5]:
competitions = pd.read_json(f"{raw_datafolder}/competitions.json")


def _competition_file_label(competition):
    """Return the area name if it is non-empty, else the competition's own name."""
    area_name = competition['area']['name']
    if area_name != "":
        return area_name
    return competition['name']


# Rename competitions to the names used in the file names
competitions['name'] = competitions.apply(_competition_file_label, axis=1)
# View all available competitions
set(competitions['name'])

{'England',
 'European Championship',
 'France',
 'Germany',
 'Italy',
 'Spain',
 'World Cup'}

In [6]:
# Exactly one of the selections below should be active; the others are
# kept as ready-made alternatives.

# Domestic leagues
#selected_competitions = competitions[competitions.name.isin(
#    ["England", "France", "Germany", "Italy", "Spain"]
#)]

# Premier League
is_england = competitions["name"] == "England"
selected_competitions = competitions[is_england]

# International
#selected_competitions = competitions[competitions.name.isin(
#    ['European Championship', 'World Cup']
#)]

# Show the selection
selected_competitions

Unnamed: 0,name,wyId,format,area,type
1,England,364,Domestic league,"{'name': 'England', 'id': '0', 'alpha3code': '...",club


## Load and convert match data

In [7]:
# Per-match results: a list of player-game frames and a dict that maps
# each match id to its converted actions.
player_games = []
actions = {}
for row in selected_competitions.itertuples():
    print(f"Processing {row.name}")
    # Load the matches and the events of this competition; the events are
    # grouped so that they can be looked up per match.
    matches = pd.read_json(f"{raw_datafolder}/matches_{row.name}.json")
    events = pd.read_json(f"{raw_datafolder}/events_{row.name}.json")
    events = events.groupby('matchId', as_index=False)
    for match in tqdm.tqdm(list(matches.itertuples())):
        match_id = match.wyId
        match_events = events.get_group(match_id)

        # Convert the data of this match
        player_games.append(wyscout.get_player_games(match, match_events))
        # The team whose entry in teamsData is marked as the home side
        home_team = next(
            team['teamId']
            for team in match.teamsData.values()
            if team['side'] == 'home'
        )
        match_actions = wyscout.convert_actions(match_events, home_team)
        # The converter does not add an action id, so number the actions
        # of the match consecutively.
        match_actions["action_id"] = range(len(match_actions))
        actions[match_id] = match_actions

# Combine the per-match frames into a single frame with a fresh index
player_games = pd.concat(player_games).reset_index(drop=True)

Processing England


100%|██████████| 380/380 [07:00<00:00,  1.11s/it]


## Store converted SPADL data in an HDF file

In [8]:
spadl_h5 = os.path.join(spadl_datafolder, "spadl-wyscout.h5")

# Write the selected competitions, their games, the players, the teams,
# the actions of every converted game and the SPADL lookup tables to one
# h5-file.
# NOTE(review): the store is opened with pandas' default mode, so keys
# written by an earlier run with a different selection presumably stay in
# the file -- confirm whether the file should be recreated instead.
with pd.HDFStore(spadl_h5) as spadlstore:
    spadlstore["competitions"] = selected_competitions

    # Games: the match files of all selected competitions combined
    matches = pd.concat([
        pd.read_json(f"{raw_datafolder}/matches_{row.name}.json")
        for row in selected_competitions.itertuples()
    ])
    spadlstore["games"] = wyscout.convert_games(matches)

    players = wyscout.convert_players(pd.read_json(f"{raw_datafolder}/players.json"))
    spadlstore["players"] = players

    teams_raw = pd.read_json(f"{raw_datafolder}/teams.json")
    spadlstore["teams"] = wyscout.convert_teams(teams_raw)

    # One table per game below the "actions" group
    for game_id, game_actions in actions.items():
        spadlstore[f"actions/game_{game_id}"] = game_actions

    # Lookup tables for the ids used in the action tables
    spadlstore["actiontypes"] = spadl.actiontypes_df()
    spadlstore["results"] = spadl.results_df()
    spadlstore["bodyparts"] = spadl.bodyparts_df()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['name', 'format', 'area', 'type'], dtype='object')]

  exec(code_obj, self.user_global_ns, self.user_ns)
