In [1]:
%matplotlib inline


# Possession Chains
Create Possession Chains


In [2]:
import pandas as pd
import numpy as np
import json
# plotting
import os
import pathlib
import warnings 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mplsoccer import Pitch
import matplotlib.pyplot as plt
from joblib import load

pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

In [3]:
# Dimensions of the target pitch that event coordinates are rescaled to
# (presumably metres, i.e. a 105 x 68 pitch with a 7.32 wide goal -- TODO confirm units).
PITCH_MAX_X = 105
PITCH_MAX_Y = 68
GOAL_WIDTH = 7.32

# Upper bounds of the raw Wyscout coordinates on each axis.
WYSCOUT_PITCH_MAX_X = 100
WYSCOUT_PITCH_MAX_Y = 100
# Presumably the penalty-box limits expressed in raw Wyscout coordinates -- TODO confirm.
WYSCOUT_BOX_X = 84
WYSCOUT_BOX_MIN_Y = 19
WYSCOUT_BOX_MAX_Y = 81

In [4]:
# Multiplicative factors converting raw Wyscout coordinates to target-pitch units.
TRANSFORM_X = PITCH_MAX_X / WYSCOUT_PITCH_MAX_X
TRANSFORM_Y = PITCH_MAX_Y / WYSCOUT_PITCH_MAX_Y

# The Wyscout box limits rescaled to target-pitch units.
BOX_X = WYSCOUT_BOX_X * TRANSFORM_X
BOX_MIN_Y = WYSCOUT_BOX_MIN_Y * TRANSFORM_Y
BOX_MAX_Y = WYSCOUT_BOX_MAX_Y * TRANSFORM_Y

## Opening the dataset

First, we open the data, exactly the same way as we did earlier.

In [5]:
# Data folder: <two levels above the current working directory>/data/Wyscout.
# The result therefore depends on where the notebook kernel was started.
DATA_PATH = os.path.join(str(pathlib.Path().resolve().parents[1]), 'data', 'Wyscout')

In [6]:
# League whose event file is processed; it is also reused in the output file name.
league = 'England'
events_file_name = f'events_{league}.json'
events_path = os.path.join(DATA_PATH, 'events', events_file_name)

# Load the event file into a dataframe, decoding it with the 'unicode-escape' codec.
df = pd.read_json(events_path, encoding='unicode-escape')

In [7]:
def transform_x(row, i):
    """Return the 'x' value of the i-th position in `row`, scaled by TRANSFORM_X.

    `row` is an indexable collection of mappings that each hold an 'x' key
    (here: the content of one `positions` cell).
    """
    point = row[i]
    return point['x'] * TRANSFORM_X

def transform_y(row, i):
    """Return the 'y' value of the i-th position in `row`, flipped and scaled.

    The raw value is first mirrored (WYSCOUT_PITCH_MAX_Y - y) and then
    multiplied by TRANSFORM_Y.
    """
    mirrored_y = WYSCOUT_PITCH_MAX_Y - row[i]['y']
    return mirrored_y * TRANSFORM_Y

# Rescaled start coordinates (first entry of `positions`) of every event.
df["x"] = [transform_x(positions, 0) for positions in df["positions"]]
df["y"] = [transform_y(positions, 0) for positions in df["positions"]]
df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,x,y
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,51.45,34.68
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,32.55,14.96
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,53.55,17.00
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,36.75,19.72
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,43.05,3.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643145,5,Ball out of the field,[],0,"[{'y': 32, 'x': 0}, {'y': 100, 'x': 100}]",2500098,Interruption,1623,2H,2796.732525,50,251596409,0.00,46.24
643146,3,Corner,"[{'id': 302}, {'id': 801}, {'id': 1801}]",70965,"[{'y': 100, 'x': 100}, {'y': 47, 'x': 88}]",2500098,Free Kick,1633,2H,2829.821084,30,251596232,105.00,0.00
643147,1,Air duel,"[{'id': 701}, {'id': 1802}]",7919,"[{'y': 53, 'x': 12}, {'y': 50, 'x': 14}]",2500098,Duel,1623,2H,2831.211419,10,251596410,12.60,31.96
643148,1,Air duel,"[{'id': 703}, {'id': 1801}]",8005,"[{'y': 47, 'x': 88}, {'y': 50, 'x': 86}]",2500098,Duel,1633,2H,2832.434399,10,251596234,92.40,36.04


## Preparing data 

First, we create a new column with the next event. We also create a column that is 1 if the ball
was kicked out - to mark when the chain should be stopped. We single out *Interruptions*, since
there was a ball out of the pitch or a foul before them (note that the code below only selects them
and does not drop them). Then, we filter out lost duels, since Wyscout
duels are saved twice - once for each team, as either defensive or offensive. We filter out
"Ball out of the field" events, since we can get this information from the other column. Then, we filter
out events by goalkeepers, since shots were made before them.

In [8]:
# sub-event name of the following event within the same match ('' after a match's last event)
next_event = df.groupby('matchId')['subEventName'].shift(-1, fill_value='')
df["nextEvent"] = next_event
# 1 if the next event is the ball leaving the field (or there is no next event), else 0;
# computed column-wise instead of with a slow row-wise apply
df["kickedOut"] = df["nextEvent"].isin(["Ball out of the field", ""]).astype("int64")
# interruptions out
# NOTE(review): this selection is never dropped from df, so interruption events other than
# "Ball out of the field" stay in the data - confirm whether that is intended.
interruption = df.loc[df["eventName"] == "Interruption"]
# filter out non-accurate duels - in wyscout they are 2 way - attacking and defending
lost_duels = df.loc[
    (df["eventName"] == "Duel")
    & df["tags"].apply(lambda tags: {'id': 1802} in tags)
]
df = df.drop(lost_duels.index)
# filter ball out of the field - I can get this anyways
out_of_ball = df.loc[df["subEventName"] == "Ball out of the field"]
df = df.drop(out_of_ball.index)
# save attempts can be dropped
goalies = df.loc[df["subEventName"].isin(["Goalkeeper leaving line", "Save attempt", "Reflexes"])]
df = df.drop(goalies.index)

## Isolating possession chain

Then, we isolate possession chains with the rule that a single touch of the ball by
a different team should not end the chain. If there was a foul or the ball was kicked out
of the field, the chain is stopped. For *Others on the ball* events, if the ball was properly intercepted -
that is, the team that made the event also makes the next event - we stop the chain. Otherwise,
if the ball was only touched, but possession did not change, we treat the pass as an accurate one.
Note that this is an approximation of the true possession chain. In the industry you will work
on datasets with possession chains already isolated.



In [9]:
def isolateChains(df):
    """
    Split an ordered stream of events into approximate possession chains.

    A chain is closed as soon as the accumulated stop criterion reaches 2:
    +1 for a pass/duel tagged 1802 by the chain team, +1 for a pass/duel tagged
    1801 by the other team, +2 for an "Others on the ball" event whose team also
    makes the next event, +2 for a Shot, Foul or Offside and +2 when
    ``kickedOut`` is 1. A change of ``matchPeriod`` always starts a new chain
    owned by the team of the first event of that period; that first event is
    then evaluated against its own team (previously it was evaluated against
    the team of the previous half and its stop signals were discarded).

    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data, rows in playing order. The columns
        "eventName", "teamId", "tags", "matchPeriod" and "kickedOut" are read.
        The frame is modified in place.

    Returns
    -------
    df: dataframe
        the same dataframe with the added columns "nextTeamId" (team of the
        following row, 0 for the last row), "possession_chain" (chain number)
        and "possession_chain_team" (team credited with the chain)

    """
    if df.empty:
        # nothing to iterate over - just make sure the output columns exist
        for column in ("nextTeamId", "possession_chain", "possession_chain_team"):
            df[column] = 0
        return df

    # only the team column is needed, so there is no reason to shift the whole frame
    df["nextTeamId"] = df["teamId"].shift(-1, fill_value=0)

    chain_team = df["teamId"].iloc[0]
    period = df["matchPeriod"].iloc[0]
    stop_criterion = 0
    chain = 0
    # collected per row and written back once, instead of one df.at call per row
    chains = []
    chain_teams = []

    needed = ("eventName", "teamId", "nextTeamId", "tags", "kickedOut", "matchPeriod")
    for event, team, next_team, tags, kicked_out, row_period in zip(*(df[name] for name in needed)):
        # a new half started: open a fresh chain BEFORE judging this event
        if row_period != period:
            chain += 1
            stop_criterion = 0
            chain_team = team
            period = row_period

        chains.append(chain)
        chain_teams.append(chain_team)

        # if pass not accurate/lost duel, add 1 to stop criterion
        if event in ("Pass", "Duel"):
            if team == chain_team and {"id": 1802} in tags:
                stop_criterion += 1
            if team != chain_team and {"id": 1801} in tags:
                stop_criterion += 1
        # if ball intercepted properly add 2
        if event == "Others on the ball" and team == next_team:
            stop_criterion += 2
        # if shot, foul or offside, add 2 to stop criterion
        if event in ("Shot", "Foul", "Offside"):
            stop_criterion += 2
        # if ball out of field, add 2
        if kicked_out == 1:
            stop_criterion += 2

        # possession chain ended; the next chain belongs to the team of the next event
        # TODO (open question from the original author): should the stop criterion
        # also accept an empty next event name ('')?
        if stop_criterion >= 2:
            chain += 1
            stop_criterion = 0
            chain_team = next_team

    df["possession_chain"] = chains
    df["possession_chain_team"] = chain_teams
    return df


# Number the possession chains, then preview the events of chain 4.
df = isolateChains(df)
df.loc[df["possession_chain"] == 4, ["eventName", "possession_chain"]]

Unnamed: 0,eventName,possession_chain
36,Free Kick,4
37,Pass,4
38,Duel,4
40,Duel,4
42,Pass,4
43,Pass,4
44,Pass,4
45,Pass,4
46,Shot,4


## Calculating xG value 

To calculate xG value of shots, we use the function from [Lesson 2](https://soccermatics.readthedocs.io/en/latest/gallery/lesson2/plot_xGModelFit.html).
However, this time we treat penalties as shots from the penalty spot. Then, we assign xG 
value to shots in df with all events.



In [10]:
def calculate_distance_and_angle(x, y):
    """
    Distance to the goal centre and angle subtended by the goal mouth.

    Parameters
    ----------
    x, y : float or array-like
        location in target-pitch coordinates; x is expected to be at most
        PITCH_MAX_X (the goal is taken to sit at x = PITCH_MAX_X, centred on
        y = PITCH_MAX_Y / 2).

    Returns
    -------
    distance, angle
        distance to the goal centre and opening angle in radians, in [0, pi].

    """
    c = abs(y - PITCH_MAX_Y / 2)   # offset from the pitch's long axis
    d = PITCH_MAX_X - x            # offset from the goal line
    squared_distance = d ** 2 + c ** 2
    distance = np.sqrt(squared_distance)
    # tan(angle) = GOAL_WIDTH * d / (d^2 + c^2 - (GOAL_WIDTH / 2)^2).
    # arctan2 picks the right quadrant directly: it returns 0 (not pi) on the
    # goal line outside the posts and cannot divide by zero when the
    # denominator vanishes.
    angle = np.arctan2(GOAL_WIDTH * d, squared_distance - (GOAL_WIDTH / 2) ** 2)

    return distance, angle


def predict_xG(distance, angle, model):
    """
    Evaluate 1 / (1 + exp(b0 + b1 * distance + b2 * angle)).

    Parameters
    ----------
    distance, angle : float or array-like
        shot features.
    model : object
        anything exposing ``params`` whose first three entries are, in order,
        the intercept, the distance coefficient and the angle coefficient.

    Returns
    -------
    float or array-like
        the predicted value for each input.

    """
    # Read the coefficients by position from a plain array: params[0] on a
    # label-indexed pandas Series relies on deprecated positional fallback.
    intercept, distance_coef, angle_coef = np.asarray(model.params, dtype=float)[:3]
    return 1 / (1 + np.exp(intercept + distance_coef * distance + angle_coef * angle))

In [11]:
def build_xG_df(shots):
    """
    Add "distance" and "angle" columns to `shots` (in place) and split it.

    Returns
    -------
    headers, non_headers
        the rows whose tags contain {'id': 403} and the remaining rows.
    """
    # distance and angle of every shot, computed from its x/y columns
    geometry = shots.apply(
        lambda shot: calculate_distance_and_angle(shot["x"], shot["y"]),
        axis=1,
    )
    distances, angles = zip(*geometry)
    shots["distance"] = distances
    shots["angle"] = angles

    # headers have id = 403
    def has_header_tag(shot):
        return {'id': 403} in shot.tags

    header_mask = shots.apply(has_header_tag, axis=1)
    headers = shots.loc[header_mask]
    non_headers = shots.drop(headers.index)

    return headers, non_headers


def predict_xG_df(shots):
    """
    Attach an "xG" column to every shot, using separate models for headers
    and for all other shots.

    The two models are loaded from
    <two levels above the working directory>/models/ on every call.
    NOTE(review): `load` presumably unpickles the files - only use trusted
    model files.

    Returns
    -------
    dataframe
        all shots with the added "xG" column, sorted by index.
    """
    headers, non_headers = build_xG_df(shots)

    models_dir = os.path.join(str(pathlib.Path().resolve().parents[1]), 'models')
    headers_model = load(os.path.join(models_dir, 'headers_xG_model.sav'))
    non_headers_model = load(os.path.join(models_dir, 'non_headers_xG_model.sav'))

    def score_with(model):
        # row-wise scorer bound to one fitted model
        return lambda row: predict_xG(row['distance'], row['angle'], model)

    headers_xg = headers.apply(score_with(headers_model), axis=1)
    headers = headers.assign(xG=headers_xg)
    non_headers_xg = non_headers.apply(score_with(non_headers_model), axis=1)
    non_headers = non_headers.assign(xG=non_headers_xg)

    return pd.concat([non_headers, headers]).sort_index()

In [12]:
def calulatexG(df):
    """
    Add an "xG" column to the event dataframe.

    Events with eventName "Shot" get the value predicted by predict_xG_df,
    events with subEventName "Penalty" get a fixed 0.8 and every other event
    gets 0.0.

    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data. Modified in place.

    Returns
    -------
    df: dataframe
        the same dataframe with the added float column "xG"

    """
    shots = df.loc[df["eventName"] == "Shot"].copy()
    all_shots_xg = predict_xG_df(shots)

    penalties = df.loc[df["subEventName"] == "Penalty"].assign(xG=0.8)

    shot_values = pd.concat([all_shots_xg["xG"], penalties["xG"]])
    # should an event match both selections, the penalty value wins
    shot_values = shot_values[~shot_values.index.duplicated(keep="last")]

    # Start from a float column: filling an integer column with float values
    # is an incompatible-dtype assignment that newer pandas versions reject.
    df["xG"] = 0.0
    # one aligned assignment instead of a Python loop over every shot
    df.loc[shot_values.index, "xG"] = shot_values

    return df


# Attach xG to every event, then preview chains 3 and 4.
df = calulatexG(df)
df.loc[df["possession_chain"].isin([3, 4]), ["eventName", "possession_chain", "xG"]]

Unnamed: 0,eventName,possession_chain,xG
25,Pass,3,0.0
26,Pass,3,0.0
27,Duel,3,0.0
30,Duel,3,0.0
31,Duel,3,0.0
34,Duel,3,0.0
35,Foul,3,0.0
36,Free Kick,4,0.0
37,Pass,4,0.0
38,Duel,4,0.0


## Finding chains that ended with shot

As the next step, we find possession chains that ended with a shot. We assign 1 to them.
We also assign the xG of the shot to all events in the chain. If the chain directly before a
shot-ending chain was stopped by a foul, that earlier chain is also considered as one that ended
with the shot. We also keep only events made by the possession team in the chain.



In [13]:
def prepareChains(df):
    """
    Flag possession chains that lead to a shot and keep only the events of
    the team that dominates each chain.

    A chain whose last event is a "Shot" gets ``shot_end = 1`` and the shot's
    xG on all of its events. Chains directly before it that ended with a
    "Foul" receive the same values, walking backwards until a chain that did
    not end with a foul (or a missing chain number) is reached. Chain 0 is
    included in that walk (it used to be skipped when it directly preceded
    the shot chain).

    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data; the columns "possession_chain",
        "eventName", "teamId" and "xG" are read. The columns "shot_end" and
        "xG" are written on the passed frame.

    Returns
    -------
    df: dataframe
        rows of the events made by the most frequent team of each chain
        (smallest teamId on ties), ordered by chain number

    """
    event_names = df["eventName"].to_numpy()
    team_ids = df["teamId"].to_numpy()
    xg = df["xG"].to_numpy(dtype=float, copy=True)
    shot_end = np.zeros(len(df), dtype=int)

    # row positions of every chain, found in a single pass instead of one
    # boolean scan of the whole frame per chain
    rows_by_chain = df.groupby("possession_chain").indices

    kept_rows = []
    for chain in sorted(rows_by_chain):
        rows = np.sort(rows_by_chain[chain])
        last_row = rows[-1]
        # if ended with shot
        if event_names[last_row] == "Shot":
            shot_xg = xg[last_row]
            xg[rows] = shot_xg
            shot_end[rows] = 1
            # credit the preceding chains that were stopped by a foul;
            # a gap in the chain numbering ends the walk
            k = chain - 1
            while k in rows_by_chain:
                previous_rows = rows_by_chain[k]
                if event_names[previous_rows.max()] != "Foul":
                    break
                xg[previous_rows] = shot_xg
                shot_end[previous_rows] = 1
                k -= 1
        # keep the events made by the possession team: the most frequent team,
        # np.unique sorts its values so ties resolve to the smallest teamId
        chain_teams = team_ids[rows]
        teams, counts = np.unique(chain_teams, return_counts=True)
        possession_team = teams[counts.argmax()]
        kept_rows.extend(rows[chain_teams == possession_team].tolist())

    df["shot_end"] = shot_end
    df["xG"] = xg
    return df.iloc[kept_rows]
    
    
# Flag the shot-ending chains, then preview chains 3 and 4.
df = prepareChains(df)
df.loc[df["possession_chain"].isin([3, 4]), ["eventName", "possession_chain", "xG"]]

Unnamed: 0,eventName,possession_chain,xG
25,Pass,3,0.138584
26,Pass,3,0.138584
30,Duel,3,0.138584
31,Duel,3,0.138584
34,Duel,3,0.138584
36,Free Kick,4,0.138584
37,Pass,4,0.138584
38,Duel,4,0.138584
40,Duel,4,0.138584
42,Pass,4,0.138584


## Preparing data for modelling

As the next step, we prepare data for modelling. We filter out dodgy events without end coordinates and store coordinates in different
columns. We also create the variable *c* as the distance from the line parallel to the x-axis through the middle of the pitch. Moreover, we
set the end of a shot to (105, 34), contrary to Wyscout, which tends to store shot ends as one of the corners.



In [14]:
# filter out dodgy
df = df.loc[df.apply(lambda x: len(x.positions) == 2, axis = 1)]         
# columns with coordinates   
df["x0"] = df.positions.apply(lambda row: transform_x(row, 0))
df["y0"] = df.positions.apply(lambda row: transform_y(row, 0))
df["x1"] = df.positions.apply(lambda row: transform_x(row, 1))
df["y1"] = df.positions.apply(lambda row: transform_y(row, 1))
df["c0"] = df.positions.apply(lambda cell: abs(50 - cell[0]['y']) * 68/100)
df["c1"] = df.positions.apply(lambda cell: abs(50 - cell[1]['y']) * 68/100)

# assign (105, 34) to end of the shot
df.loc[df["eventName"] == "Shot", "x1"] = 105
df.loc[df["eventName"] == "Shot", "y1"] = 34
df.loc[df["eventName"] == "Shot", "c1"] = 0

In [15]:
# Write the prepared events to <DATA_PATH>/possession_chain/possession_chains_<league>.json
file_name = f'possession_chains_{league}.json'
path = os.path.join(DATA_PATH, 'possession_chain', file_name)
# to_json does not create missing folders, so make sure the target folder exists
os.makedirs(os.path.dirname(path), exist_ok=True)
df.to_json(path)