In [2]:
import pandas as pd

from pandas import json_normalize
import json
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
from matplotlib.patches import Arc

import numpy as np
import matplotlib.cm as cm
from scipy.ndimage.filters import gaussian_filter

import requests

from matplotlib.colors import Normalize
import matplotlib.patheffects as pe

from ast import literal_eval
plt.style.use('fivethirtyeight')

In [3]:
import sys
sys.path.insert(0, '../')

import config

## Retrieve dataset from either Github or local files

In [4]:
def FreeCompetitions(env='github'):
    """Load the list of competitions available in the StatsBomb open data.

        Args:
            env (str, optional): where 'competitions.json' is read from:
                'github' (HTTP download under config.data_url) or 'local'
                (file under config.data_dir). Defaults to 'github'.

        Returns:
            comp_df (DataFrame): one row per entry of 'competitions.json'.

        Raises:
            ValueError: if 'env' is neither 'github' nor 'local'.
    """
    # Reject unsupported environments before touching the network or the disk.
    if env not in ('github', 'local'):
        raise ValueError(
            "'env' variable should be either 'local' or 'github'.")

    if env == 'local':
        local_file = config.data_dir + 'competitions.json'
        with open(local_file, encoding='utf-8') as handle:
            payload = json.load(handle)
    else:
        response = requests.get(url=config.data_url + "competitions.json")
        # Decode the body as UTF-8 regardless of what the server advertises.
        response.encoding = 'utf-8'
        payload = response.json()

    return pd.DataFrame(payload)


def FreeMatches(competitions, env='github'):
    """Function to retrieve matches from all competitions in competitions.

        Args:
            competitions (DataFrame): df with competitions, must contain 'competition_id'
                and 'season_id'. Any index is accepted (e.g. a filtered selection).
            env (str, optional): 'github' or 'local', environment to retrieve data from.

        Returns:
            matches_df (DataFrame): DataFrame with all matches, one row per match, with a
                fresh 0..n-1 index. Empty DataFrame if 'competitions' has no rows.

        Raises:
            ValueError: wrong value for 'env'.
    """
    if env not in ('github', 'local'):
        raise ValueError(
            "'env' variable should be either 'local' or 'github'.")

    # Iterate over the values, not over positional labels: a filtered
    # 'competitions' frame no longer has the labels 0..n-1.
    id_pairs = zip(competitions['competition_id'], competitions['season_id'])

    frames = []
    for comp_id, season_id in tqdm(id_pairs, total=len(competitions)):
        relative_path = f"matches/{comp_id}/{season_id}.json"
        if env == 'github':
            # The sibling loaders append paths without a leading slash; strip any
            # trailing one here so the URL never contains '//' in its path.
            matches_url = config.data_url.rstrip('/') + '/' + relative_path
            raw_matches = requests.get(url=matches_url)
            raw_matches.encoding = 'utf-8'
            matches = raw_matches.json()
        else:
            matches_path = config.data_dir.rstrip('/') + '/' + relative_path
            with open(matches_path, encoding='utf-8') as json_file:
                matches = json.load(json_file)
        frames.append(json_normalize(matches))

    # pd.concat refuses an empty list; keep the original "empty frame" result.
    if not frames:
        return pd.DataFrame()

    # Single concatenation: DataFrame.append no longer exists in pandas >= 2.0
    # and re-copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True, sort=False)


def get_matchFree(match_id, env='github'):
    """Fetch every event of one match as a flat DataFrame.

        Args:
            match_id (int): id of the game; used to build 'events/<match_id>.json'.
            env (str, optional): 'github' (download under config.data_url) or
                'local' (file under config.data_dir). Defaults to 'github'.

        Returns:
            events (DataFrame): the normalized event records, plus a 'match_id'
                column set to the requested id on every row.

        Raises:
            ValueError: wrong value for 'env'.
    """
    if env == 'local':
        local_file = config.data_dir + f"events/{match_id}.json"
        with open(local_file, encoding='utf-8') as handle:
            records = json.load(handle)
    elif env == 'github':
        response = requests.get(url=config.data_url + f"events/{match_id}.json")
        # Decode the body as UTF-8 regardless of what the server advertises.
        response.encoding = 'utf-8'
        records = response.json()
    else:
        raise ValueError(
            "'env' variable should be either 'local' or 'github'.")

    # json_normalize flattens nested objects into dotted columns ('type.name', ...).
    events = pd.DataFrame(json_normalize(records))
    # Tag each event with its match so frames of several matches can be stacked.
    events.loc[:, 'match_id'] = match_id
    return events


# Data from a list of matches (use get_matchFree)
def StatsBombFreeEvents(matchesdf, env='github'):
    """Function to create DataFrame with events from a list of matches.

        Args:
            matchesdf (DataFrame): dataframe of matches. Must have columns 'match_id',
                'competition.competition_id', and 'season.season_id'
                (the flattened column names produced by FreeMatches).
            env (str, optional): 'github' or 'local', environment to retrieve data from.

        Returns:
            df (DataFrame): DataFrame with all events from all matches, with the extra
                columns 'competition_id' and 'season_id'. Each match keeps its own
                row index (the index is not reset). Empty DataFrame if 'matchesdf'
                has no rows.

    """
    # Walk the three id columns in lockstep. Looking rows up again through
    # matchesdf.index was quadratic and, with duplicated index labels, always
    # returned the first matching row.
    id_rows = zip(matchesdf['match_id'],
                  matchesdf['competition.competition_id'],
                  matchesdf['season.season_id'])

    res = []
    for match_id, competition_id, season_id in tqdm(id_rows, total=len(matchesdf)):
        events = get_matchFree(match_id, env=env)
        events.loc[:, 'competition_id'] = competition_id
        events.loc[:, 'season_id'] = season_id
        res.append(events)

    # pd.concat raises on an empty list; an empty selection yields an empty frame.
    if not res:
        return pd.DataFrame()

    df = pd.concat(res, sort=True)
    return df


def get_lineupsFree(match_id, env='github'):
    """Fetch the line-ups of one match as a flat DataFrame.

        Args:
            match_id (int): id of the game; used to build 'lineups/<match_id>.json'.
            env (str, optional): 'github' (download under config.data_url) or
                'local' (file under config.data_dir). Defaults to 'github'.

        Returns:
            events (DataFrame): one row per record of each team's 'lineup' list,
                with the owning 'team_id' and 'team_name' repeated on every row
                and a 'match_id' column set to the requested id.

        Raises:
            ValueError: wrong value for 'env'.
    """
    if env == 'local':
        local_file = config.data_dir + f"lineups/{match_id}.json"
        with open(local_file, encoding='utf-8') as handle:
            teams = json.load(handle)
    elif env == 'github':
        response = requests.get(url=config.data_url + f"lineups/{match_id}.json")
        # Decode the body as UTF-8 regardless of what the server advertises.
        response.encoding = 'utf-8'
        teams = response.json()
    else:
        raise ValueError(
            "'env' variable should be either 'local' or 'github'.")

    # Explode each team's 'lineup' list into rows, carrying the team identifiers.
    events = pd.DataFrame(json_normalize(teams, 'lineup', ['team_id', 'team_name']))
    events.loc[:, 'match_id'] = match_id
    return events


def StatsBombFreelineups(matchesdf, env='github'):
    """Function to create DataFrame with line-ups from a list of matches.

        Args:
            matchesdf (DataFrame): dataframe of matches. Must have columns 'match_id',
                'competition.competition_id', and 'season.season_id'
                (the flattened column names produced by FreeMatches).
            env (str, optional): 'github' or 'local', environment to retrieve data from.

        Returns:
            df (DataFrame): DataFrame with the line-up rows of all matches, with the
                extra columns 'competition_id' and 'season_id'. Each match keeps its
                own row index (the index is not reset). Empty DataFrame if
                'matchesdf' has no rows.

    """
    # Walk the three id columns in lockstep. Looking rows up again through
    # matchesdf.index was quadratic and, with duplicated index labels, always
    # returned the first matching row.
    id_rows = zip(matchesdf['match_id'],
                  matchesdf['competition.competition_id'],
                  matchesdf['season.season_id'])

    res = []
    for match_id, competition_id, season_id in id_rows:
        lineups = get_lineupsFree(match_id, env=env)
        lineups.loc[:, 'competition_id'] = competition_id
        lineups.loc[:, 'season_id'] = season_id
        res.append(lineups)

    # pd.concat raises on an empty list; an empty selection yields an empty frame.
    if not res:
        return pd.DataFrame()

    df = pd.concat(res, sort=True)
    return df

## Football drawing

### Draw pitch

In [47]:
def draw_pitch(ax):
    """Function to plot a football pitch with dimension 120x80, matching StatsBomb spec.

        Everything is drawn on the axes passed in (not on pyplot's "current" axes),
        so the function can be used on any subplot of a multi-panel figure.

        Args:
            ax (axes): Matplotlib axes to draw on.

        Returns:
            ax (axes): the same Matplotlib axes, with the pitch drawn, the axis
                decorations switched off and the y axis inverted.

    """
    # size of the pitch is 120, 80
    x_min, x_max = 0, 120
    y_min, y_max = 0, 80
    x_mid = x_max / 2
    y_mid = y_max / 2

    # Every straight marking as an (xs, ys) pair of end points.
    segments = [
        # Pitch Outline & Centre Line
        ([x_min, x_min], [y_min, y_max]),
        ([x_min, x_max], [y_max, y_max]),
        ([x_max, x_max], [y_max, y_min]),
        ([x_max, x_min], [y_min, y_min]),
        ([x_mid, x_mid], [y_min, y_max]),
        # Left Penalty Area
        ([18, 18], [62, 18]),
        ([0, 18], [62, 62]),
        ([0, 18], [18, 18]),
        # Right Penalty Area
        ([120, 102], [62, 62]),
        ([120, 102], [18, 18]),
        ([102, 102], [18, 62]),
        # Left 6-yard Box
        ([6, 6], [30, 50]),
        ([0, 6], [50, 50]),
        ([0, 6], [30, 30]),
        # Right 6-yard Box
        ([120, 114], [30, 30]),
        ([114, 114], [50, 30]),
        ([120, 114], [50, 50]),
    ]
    for xs, ys in segments:
        ax.plot(xs, ys, color="black")

    # Centre circle, centre spot and both penalty spots
    circles = [
        plt.Circle((x_mid, y_mid), 10, color="black", fill=False),
        plt.Circle((x_mid, y_mid), 0.9, color="black"),
        plt.Circle((12, 40), 0.71, color="black"),
        plt.Circle((108, 40), 0.71, color="black"),
    ]
    for circle in circles:
        ax.add_patch(circle)

    # Penalty arcs. Arc arguments:
    # x, y coordinate of centerpoint of arc
    # width, height as arc might not be circle, but oval
    # angle: degree of rotation of the shape, anti-clockwise
    # theta1, theta2, start and end location of arc in degree
    leftArc = Arc((13, 40), height=16.2, width=16.2, angle=0,
                  theta1=310, theta2=50, color="black")
    rightArc = Arc((107, 40), height=16.2, width=16.2, angle=0,
                   theta1=130, theta2=230, color="black")
    ax.add_patch(leftArc)
    ax.add_patch(rightArc)

    # Leave a 5-unit margin around the pitch and hide ticks/spines.
    ax.set_xlim(-5, 125)
    ax.set_ylim(-5, 85)
    ax.axis('off')

    # Author watermark
    ax.annotate("@BenjaminLarrousse", xy=(22, 78),
                ha="right", va="bottom", zorder=7, fontsize=11, color='grey')

    # Invert y axis to match Statsbomb spec (0 at the top and 80 at the bottom)
    ax.invert_yaxis()

    return ax

Code to test the pitch drawing

**TODO:** move this check into a unit test.


```python
fig, ax = plt.subplots()
fig.set_size_inches(10, 6.5)
draw_pitch(ax)
```

### Draw heatmap

In [6]:
def create_heatmap(x, y, s, bins=1000):
    """Build a Gaussian-smoothed 2D histogram of positions over the 120x80 pitch.

        Args:
            x (array-like): x coordinates of the points.
            y (array-like): y coordinates of the points (same length as x).
            s (float): sigma of the Gaussian filter, expressed in bins.
            bins (int, optional): number of bins along each axis. Defaults to 1000.

        Returns:
            heatmap (ndarray): smoothed counts, transposed so that rows follow y and
                columns follow x.
            extent (list): [x_min, x_max, y_min, y_max] covered by the histogram,
                i.e. the full pitch [0, 120, 0, 80].

    """
    extent = [0, 120, 0, 80]

    # Bin over the full pitch so the histogram really covers the extent that is
    # returned with it. Without an explicit range numpy bins over the bounding
    # box of the data, and the image is then stretched and misplaced when drawn
    # with this extent. Points outside the pitch are ignored.
    heatmap, _xedges, _yedges = np.histogram2d(
        x, y, bins=bins, range=[extent[:2], extent[2:]])
    heatmap = gaussian_filter(heatmap, sigma=s)

    return heatmap.T, extent

### Draw passing network

In [16]:
def draw_pass_map(ax, df, match_id, team, min_time=0, max_time=None,
                  lineups_dir="../../../data/lineups/"):
    """Function to plot a passing network for a team in a match.
        Heavily inspired by Friends of Tracking (see link, under visualization/passing_network.py):
        https://github.com/Friends-of-Tracking-Data-FoTD/passing-networks-in-python

        Nodes are players, placed at the median start location of their completed
        passes; edges link two players who exchanged at least one completed pass.
        Node size and edge width both grow with the number of passes.

        Args:
            ax: Matplotlib's axis object, it expects to have the pitch already plotted.
            df (DataFrame): dataframe of events of games (created from Stastbomb open data).
            match_id (int): id of the game.
            team (str): name of the team in consideration.
            min_time (opt, float): minimum minute of the game we want passes from.
                                   Default to 0: from the start.
            max_time (opt, float): maximum minute of the game we want passes from.
                                   Default to None: the maximum minute with both starting XI
                                   (i.e. before a substitution or red card).
            lineups_dir (opt, str): directory containing the StatsBomb line-up files
                                   ('<match_id>.json'), used to display nicknames.

        Returns:
            ax (axes): Matplotlib axes.

    """
    if max_time is None:
        # Team lineup: the window ends when the first starter leaves the pitch,
        # i.e. at the smallest number of minutes played by a starter.
        lineup = literal_eval(df[(df['match_id'] == match_id) &
                                 (df['type.name'] == 'Starting XI') &
                                 (df['team.name'] == team)
                                 ]['tactics.lineup'].values[0])
        max_time = min(get_minutes_played(df, match_id, player['player']['name'], team)
                       for player in lineup)

    # 'Complete' passes are the one with pass.outcome.name = NaN
    df_passes = df[(df['match_id'] == match_id) &
                   (df['type.name'] == "Pass") &
                   (df['pass.outcome.name'].isna()) &
                   (df['team.name'] == team) &
                   (df['time'] >= min_time) &
                   (df['time'] <= max_time)].copy()
    # An edge needs both ends: ignore passes without a passer or a recipient.
    df_passes = df_passes.dropna(subset=['player.name', 'pass.recipient.name'])

    if lineups_dir and not lineups_dir.endswith('/'):
        lineups_dir += '/'
    with open(f"{lineups_dir}{match_id}.json", encoding='utf-8') as json_file:
        lineup_file = json.load(json_file)

    names_nicknames = {player["player_name"]: player["player_nickname"]
                       for squad in lineup_file for player in squad['lineup']}

    def display_name(full_name):
        # Nickname when there is one; full name when the nickname is empty or
        # the player is not listed in the line-up file.
        return names_nicknames.get(full_name) or full_name

    df_passes["pass.recipient.name"] = df_passes['pass.recipient.name'].apply(display_name)
    df_passes["player.name"] = df_passes['player.name'].apply(display_name)

    # Pair players regardless of direction: (A -> B) and (B -> A) share one edge.
    # The two names are kept in separate columns so that names containing any
    # separator character cannot be split wrongly later on.
    endpoints = list(zip(df_passes["player.name"], df_passes["pass.recipient.name"]))
    df_passes["pair_first"] = [min(a, b) for a, b in endpoints]
    df_passes["pair_second"] = [max(a, b) for a, b in endpoints]

    player_pass_count = df_passes.groupby("player.name").size()
    pair_pass_count = df_passes.groupby(["pair_first", "pair_second"]).size()

    # Median position of passes (starting location) for each player
    player_position = df_passes.groupby("player.name"
                                        ).agg({"location_x": "median",
                                               "location_y": "median"})

    # Fixed ranges for sizes and color scales. The "pass value" driving the
    # colors is currently the pass count itself (no separate value metric yet).
    max_player_count = player_pass_count.max()
    max_player_value = max_player_count
    max_pair_count = pair_pass_count.max()
    max_pair_value = max_pair_count

    # Step 1: plot edges
    edge_norm = Normalize(vmin=0, vmax=max_pair_value)
    edge_cmap = plt.get_cmap("Oranges")
    for (player1, player2), num_passes in pair_pass_count.items():
        # A player who only received passes has no median position of their own.
        if player1 not in player_position.index or player2 not in player_position.index:
            continue

        player1_x = player_position.at[player1, "location_x"]
        player1_y = player_position.at[player1, "location_y"]
        player2_x = player_position.at[player2, "location_x"]
        player2_y = player_position.at[player2, "location_y"]

        pass_value = num_passes

        # Width from 1 (almost no pass) to 5 (busiest pair), like the node sizes.
        line_width = (num_passes / max_pair_count) * (5 - 1) + 1
        edge_color = edge_cmap(edge_norm(pass_value))

        ax.plot([player1_x, player2_x], [player1_y, player2_y],
                linestyle='-', alpha=1, lw=line_width, zorder=3, color=edge_color)

    # Step 2: plot nodes
    node_norm = Normalize(vmin=0, vmax=max_player_value)
    node_cmap = plt.get_cmap("Reds")
    for player_name, num_passes in player_pass_count.items():
        player_x = player_position.at[player_name, "location_x"]
        player_y = player_position.at[player_name, "location_y"]

        pass_value = num_passes

        # Marker size from 25 (almost no pass) to 100 (busiest passer).
        marker_size = (num_passes / max_player_count) * (100 - 25) + 25
        node_color = node_cmap(node_norm(pass_value))

        # Colored disc with a smaller white disc on top: draws a colored ring.
        ax.plot(player_x, player_y, '.', color=node_color, markersize=marker_size, zorder=5)
        ax.plot(player_x, player_y, '.', color='white', markersize=marker_size-20, zorder=6)
        ax.annotate(player_name, xy=(player_x, player_y), ha="center", va="center", zorder=7,
                    fontsize=9, color="black", weight='bold',
                    path_effects=[pe.withStroke(linewidth=2, foreground='white')])

    # Step 3: Extra information shown on the plot
    ax.set_title(f"Pass network: {team}",
                 loc="left")

    ax.annotate(
        f"{lineup_file[1]['team_name']} - {lineup_file[0]['team_name']}"
        f", minutes {round(min_time, 1)} to {round(max_time,1)}.",
                xy=(70, -5))

    return ax

### Creation of animation for a sequence of football actions

In [7]:
def animate(example):
    """Function to create an animation of a sequence of football actions.
    Specific for Statsbomb free Messi dataset (hence one of the team is "Barcelona")

    Draws on the module-level Matplotlib axes named `ax`, which must exist
    before this function is called.

        Args:
            example (Row Dataframe): one row of a dataframe, from the iterrows() iterator,
                i.e. an (index, row) pair. The row must provide 'team.name', 'type.name',
                'location_x' and 'location_y', plus 'end_location_x' / 'end_location_y'
                for passes, shots, carries and ball recoveries.

        Returns:
            No return

    """
    index, row = example
    start_x, start_y = row['location_x'], row['location_y']
    event_type = row['type.name']

    # Color depending on team ('lightred' is not a Matplotlib color name).
    color = 'lightblue' if row['team.name'] == 'Barcelona' else 'lightcoral'

    # Marker and trajectory depending on action type
    if event_type in ('Pass', 'Shot'):
        marker, markersize = 'h', 15
        # Solid arrow from the start to the end location of the ball
        ax.arrow(start_x,
                 start_y,
                 row['end_location_x'] - start_x,
                 row['end_location_y'] - start_y,
                 color='k',
                 linewidth=2,
                 head_width=1.5,
                 length_includes_head=True,
                 zorder=0
                 )
    elif event_type in ('Carry', 'Ball Recovery'):
        # Dashed path only: a marker size of 0 hides the marker itself
        marker, markersize = 's', 0
        ax.plot([start_x, row['end_location_x']],
                [start_y, row['end_location_y']],
                color='k',
                linestyle="--",
                linewidth=1.5,
                zorder=0,
                )
    else:
        # Any other event: marker only, same size as passes and shots
        marker, markersize = 'p', 15

    ax.plot(start_x,
            start_y,
            linestyle="None",
            marker=marker,
            markersize=markersize,
            color=color,
            mec="black",
            zorder=1,
            )
    # Label the event with its index, slightly offset from the marker
    ax.annotate(str(index), (start_x + 1.1, start_y + 1.1), size=15)
    

## Playing minutes by season by player

In [8]:
# Total number of minutes played in one specific game for a particular player
def get_minutes_played(df, match_id, player_name, team):
    """Function to calculate number of minutes played in a game for a player.

        Only periods 1 and 2 are taken into account. 'time' is treated as the match
        clock in minutes, with the second half starting at 45 (hence the '- 45' /
        '+ 45' below); confirm this against how the 'time' column was built.

        Args:
            df (DataFrame): dataframe of events of games (created from Stastbomb open data).
                'tactics.lineup' must hold the string representation of the line-up list.
            match_id (int): id of the game.
            player_name (str): player name to consider.
            team (str): team name of the player in consideration.

        Returns:
            min_played (float): minutes played by the player in the game
                (0 if the player did not take part).

        Raises:
            IndexError: if the team has no 'Starting XI' event for this match.

    """
    in_match = df['match_id'] == match_id
    in_team = in_match & (df['team.name'] == team)

    try:
        lineup = literal_eval(df[in_team &
                                 (df['type.name'] == 'Starting XI')]['tactics.lineup'].values[0]
                              )
    except IndexError:
        print("No event 'Starting XI' in the input dataset.")
        raise

    starter = player_name in [x['player']['name'] for x in lineup]

    first_half_duration = df[in_match & (df['period'] == 1)]['time'].max()
    second_half_duration = df[in_match & (df['period'] == 2)]['time'].max() - 45

    def minutes_left(event_rows):
        """Minutes remaining in the game at the first event of 'event_rows'.

        Returns 0 when there is no such event or when it happened outside the
        first and second halves.
        """
        if event_rows.empty:
            return 0
        # Read scalars from the first row (instead of calling int() on a whole
        # Series/array), so this also works when several rows match.
        period = event_rows['period'].values[0]
        minute = event_rows['time'].values[0]
        if period == 1:
            return first_half_duration - minute + second_half_duration
        if period == 2:
            return second_half_duration + 45 - minute
        return 0

    if starter:
        # Assign full game time
        min_played = first_half_duration + second_half_duration
    else:
        # Get off the bench during the game: plays from the substitution onwards
        entered = df[in_team &
                     (df['type.name'] == 'Substitution') &
                     (df['substitution.replacement.name'] == player_name)][['time', 'period']]
        min_played = minutes_left(entered)

    # Substitution during the game: remove what was left to play
    was_sub = df[in_team &
                 (df['player.name'] == player_name) &
                 (df['type.name'] == 'Substitution')][['time', 'period']]
    min_played -= minutes_left(was_sub)

    # Exclusion (Red card or second yellow): remove what was left to play
    sent_off_cards = ['Red Card', 'Second Yellow']
    was_excluded = df[in_team &
                      (df['player.name'] == player_name) &
                      (df['bad_behaviour.card.name'].isin(sent_off_cards) |
                       df['foul_committed.card.name'].isin(sent_off_cards))
                      ][['time', 'period']]
    min_played -= minutes_left(was_excluded)

    if min_played < 0.0:
        print('Number of played minutes is negative: check your input data (Sub OFF before Sub ON?).')
    return min_played


def get_total_minutes(df, season_id, player_name, team):
    """Function to calculate total minutes played in a season for a player.

        Args:
            df (DataFrame): dataframe of events of games (created from Stastbomb open data).
            season_id (int): id of the season.
            player_name (str): player name to consider.
            team (str): team name of the player in consideration.

        Returns:
            total_min (float): minutes played by the player over the season,
                summed over the matches in which 'team' has events.

    """
    # Only the matches this team took part in: get_minutes_played raises when
    # the team has no 'Starting XI' event in a match.
    team_in_season = (df['season_id'] == season_id) & (df['team.name'] == team)
    match_ids = df[team_in_season]['match_id'].unique()

    total_min = 0
    for match_id in tqdm(match_ids):
        total_min += get_minutes_played(df, match_id, player_name, team)

    return total_min

## Offensive metrics

### Open play completed passes into the box

In [9]:
def get_openplay_compl_pass_into_box(df, season_id, player_name, match_ids=None):
    """Function to calculate open play completed passes into the box.

        A pass is counted when it was made by the player in the given season, is
        not a set piece (throw-in, free kick, goal kick, corner, kick off), has no
        outcome recorded (i.e. it was completed) and ends inside the rectangle
        102 <= x <= 120, 18 <= y <= 62.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            season_id (str): id of the season we want to calculate the metric on.
            player_name (str): name of the player in consideration for the metric.
            match_ids (opt, list): list of match_id to calculate the metric on. If None,
                                 the metric is calculated on the complete season.

        Returns:
            res (int): number of open play completed passes into the box.

    """
    set_piece_types = ['Throw-in', 'Free Kick', 'Goal Kick', 'Corner', 'Kick Off']

    selected = ((df['player.name'] == player_name) &
                (df['type.name'] == 'Pass') &
                (~df['pass.type.name'].isin(set_piece_types)) &
                # Pass ends inside the box (bounds included)
                (df['end_location_x'].between(102, 120)) &
                (df['end_location_y'].between(18, 62)) &
                (df['season_id'] == season_id) &
                # Completed passes are the ones without an outcome
                (df['pass.outcome.name'].isnull()))

    # If match_ids is provided, restrict the count to those matches
    if match_ids is not None:
        selected = selected & df['match_id'].isin(match_ids)

    # Counting the True values gives the number of rows, 0 when nothing matches.
    return int(selected.sum())

### Completed passes inside the box

In [10]:
def get_compl_pass_inside_box(df, season_id, player_name, match_ids=None):
    """Function to calculate completed passes inside the box.

        A pass is counted when it was made by the player in the given season, has
        no outcome recorded (i.e. it was completed) and starts inside the rectangle
        102 <= x <= 120, 18 <= y <= 62.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            season_id (str): id of the season we want to calculate the metric on.
            player_name (str): name of the player in consideration for the metric.
            match_ids (opt, list): list of match_id to calculate the metric on. If None,
                                 the metric is calculated on the complete season.

        Returns:
            res (int): number of completed passes inside the box.

    """
    selected = ((df['player.name'] == player_name) &
                (df['type.name'] == 'Pass') &
                # Pass starts inside the box (bounds included)
                (df['location_x'].between(102, 120)) &
                (df['location_y'].between(18, 62)) &
                (df['season_id'] == season_id) &
                # Completed passes are the ones without an outcome
                (df['pass.outcome.name'].isnull()))

    # If match_ids is provided, restrict the count to those matches
    if match_ids is not None:
        selected = selected & df['match_id'].isin(match_ids)

    # Counting the True values gives the number of rows, 0 when nothing matches.
    return int(selected.sum())

### Completed throughballs

In [11]:
def get_compl_throughballs(df, season_id, player_name, match_ids=None):
    """Function to calculate completed throughballs for a particular player.

        A pass is counted when it was made by the player in the given season, its
        technique is 'Through Ball' and it has no outcome recorded (i.e. it was
        completed).

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            season_id (str): id of the season we want to calculate the metric on.
            player_name (str): name of the player in consideration for the metric.
            match_ids (opt, list): list of match_id to calculate the metric on. If None,
                                 the metric is calculated on the complete season.

        Returns:
            res (int): number of completed throughballs.

    """
    selected = ((df['player.name'] == player_name) &
                (df['type.name'] == 'Pass') &
                (df['pass.technique.name'] == 'Through Ball') &
                (df['season_id'] == season_id) &
                # Completed passes are the ones without an outcome
                (df['pass.outcome.name'].isnull()))

    # If match_ids is provided, restrict the count to those matches
    if match_ids is not None:
        selected = selected & df['match_id'].isin(match_ids)

    # Counting the True values gives the number of rows, 0 when nothing matches.
    return int(selected.sum())

### Open play key passes

In [12]:
def get_openplay_keypass(df, season_id, player_name, match_ids=None):
    """Count the open play key passes of a player.

        A key pass is a pass flagged as a goal assist or as a shot assist. Set
        pieces (throw-in, free kick, goal kick, corner, kick off) are left out.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            season_id (str): id of the season the metric is computed on.
            player_name (str): name of the player the metric is computed for.
            match_ids (opt, list): match_id values to restrict the metric to.
                                 If None, the whole season is used.
        Returns:
            res (int): number of open play key passes.

    """
    restart_types = ['Throw-in', 'Free Kick', 'Goal Kick', 'Corner', 'Kick Off']

    made_by_player = df['player.name'].eq(player_name)
    is_pass = df['type.name'].eq('Pass')
    from_open_play = ~df['pass.type.name'].isin(restart_types)
    in_season = df['season_id'].eq(season_id)
    # Only an explicit True counts; missing flags are not key passes.
    led_to_shot = df['pass.goal_assist'].eq(True) | df['pass.shot_assist'].eq(True)

    wanted = made_by_player & is_pass & from_open_play & in_season & led_to_shot

    # Narrow down to the requested matches, if any were given.
    if match_ids is not None:
        wanted = wanted & df['match_id'].isin(match_ids)

    return len(df[wanted])

### Open play xG assisted

In [13]:
def get_openplay_xg_assisted(df, season_id, player_name, match_ids=None):
    """Function to calculate the open play expected goals (xG) assisted by a player.

        The open play key passes of the player (passes flagged as goal assist or
        shot assist, excluding set pieces and restarts) are selected first; the xG
        of the shots referenced by those passes is then summed.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            season_id (str): id of the season we want to calculate the metric on.
            player_name (str): name of the player in consideration for the metric.
            match_ids (opt, list): list of match_id to calculate the metric on. If None,
                                 the metric is calculated on the complete season.

        Returns:
            res (float): sum of 'shot.statsbomb_xg' over the shots assisted from open play.

    """
    # Pass types that are not considered open play
    restart_types = ['Throw-in', 'Free Kick', 'Goal Kick', 'Corner', 'Kick Off']

    by_player = df['player.name'] == player_name
    is_pass = df['type.name'] == 'Pass'
    from_open_play = ~df['pass.type.name'].isin(restart_types)
    in_season = df['season_id'] == season_id
    creates_shot = (df['pass.goal_assist'] == True) | (df['pass.shot_assist'] == True)

    key_passes = by_player & is_pass & from_open_play & in_season & creates_shot

    # Restrict to the requested games only when a list of match ids is given
    if match_ids is not None:
        key_passes = key_passes & df['match_id'].isin(match_ids)

    # Ids of the shots created by the selected passes
    assisted_shot_id = df.loc[key_passes, 'pass.assisted_shot_id'].values

    # Look the shots up by event id and add up their xG
    assisted_xg = df.loc[df['id'].isin(assisted_shot_id), 'shot.statsbomb_xg']
    res = assisted_xg.sum()

    return res

### PPDA (team level)

In [14]:
def calculate_ppda(df, match_id, x_min, x_max, y_min, y_max):
    """Function to calculate PPDA for the two teams in a particular game.

        For each team, PPDA is the number of passes made by the other team inside
        the zone divided by the number of defensive actions of the team inside the
        same zone (as returned by compute_def_actions). Zone bounds are inclusive.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            match_id (int): match id of the game we want to calculate the metric on.
            x_min (float): min on the location_x.
            x_max (float): max on the location_x.
            y_min (float): min on the location_y.
            y_max (float): max on the location_y.

        Returns:
            res (dict): ppda for each team in a dict with keys equal to team names.
                        When a team has no defensive action in the zone the value is
                        inf (or nan if the opponent made no pass in the zone either)
                        instead of raising a ZeroDivisionError.

    """
    # NOTE(review): the same zone bounds are applied to the opponent's passes and
    # to the team's defensive actions; confirm this matches the coordinate
    # convention of the event data.

    # Passes played in the game inside the requested area of the pitch
    # (between() is inclusive on both bounds, like >= / <=)
    in_zone = ((df['match_id'] == match_id) &
               df['location_x'].between(x_min, x_max) &
               df['location_y'].between(y_min, y_max)
               )
    zone_passes = df[in_zone & (df['type.name'] == 'Pass')]

    dict_def_actions = compute_def_actions(df, match_id, x_min, x_max, y_min, y_max)

    res = {}

    for team, def_actions in dict_def_actions.items():
        # Passes allowed: passes made in the zone by the other team
        passes_allowed = int((zone_passes['team.name'] != team).sum())

        if def_actions == 0:
            # No defensive action in the zone: the ratio is unbounded (or
            # undefined when there is no pass to defend against either)
            res[team] = float('inf') if passes_allowed > 0 else float('nan')
        else:
            res[team] = passes_allowed / def_actions

    return res

## Defensive metrics

### Number of defensive actions by pitch zone

In [15]:
def compute_def_actions(df, match_id, x_min=0.0, x_max=120.0, y_min=0.0, y_max=80.0):
    """Function to calculate the number of defensive actions for each team in a game.

        Only events located inside [x_min, x_max] x [y_min, y_max] (bounds
        inclusive) are counted; events without a location never match the
        filter. For a team, the count adds up:
            - its own events whose type is in the list of defensive events,
            - its own passes of type 'Interception' or 'Recovery',
            - shots of the other team with a 'Blocked' outcome,
            - 'Dispossessed' events of the other team.

        Args:
            df (DataFrame): dataframe with Statsbomb event data.
            match_id (int): match id of the game we want to calculate the metric on.
            x_min (opt, float): min on the location_x. Default to full pitch.
            x_max (opt, float): max on the location_x. Default to full pitch.
            y_min (opt, float): min on the location_y. Default to full pitch.
            y_max (opt, float): max on the location_y. Default to full pitch.

        Returns:
            res (dict): number of defensive actions for each team in a dict with keys equal to team names.

    """
    in_match = df['match_id'] == match_id

    # Retrieve team names from all the events of the game (not only the zone),
    # so a team without any action in the zone still gets a count of 0
    teams = df[in_match]['team.name'].unique()

    # Filter dataframe for the specific match_id and area of the pitch
    df_match = df[in_match &
                  (df['location_x'] >= x_min) &
                  (df['location_x'] <= x_max) &
                  (df['location_y'] >= y_min) &
                  (df['location_y'] <= y_max)
                  ].reset_index(drop=True)

    # List of defensive events
    # 'Duel': tackle and aerial lost considered
    # 'Foul Committed' is the StatsBomb spelling; the former 'Foul Commited'
    # never matched any event, so fouls were silently left out of the count
    def_events = ['Block', 'Clearance', 'Foul Committed', 'Interception',
                  'Duel', 'Pressure', 'Shield']

    # Event filters that do not depend on the team, computed once
    event_type = df_match['type.name']
    is_def_event = event_type.isin(def_events)
    # "Defensive" pass: Pass with type Interception and Recovery
    is_def_pass = ((event_type == 'Pass') &
                   df_match['pass.type.name'].isin(['Interception', 'Recovery']))
    is_blocked_shot = ((event_type == 'Shot') &
                       (df_match['shot.outcome.name'] == 'Blocked'))
    is_dispossessed = event_type == 'Dispossessed'

    res = {}

    for team in teams:
        by_team = df_match['team.name'] == team
        by_opponent = df_match['team.name'] != team

        # Actions performed by the team itself
        own_actions = by_team & (is_def_event | is_def_pass)

        # Opposition events credited to the team: blocked shots and dispossessions
        forced_on_opponent = by_opponent & (is_blocked_shot | is_dispossessed)

        res[team] = int(own_actions.sum() + forced_on_opponent.sum())
    return res