
# Calculating xT (position-based)
Calculating Expected Threat


In [1]:
#importing necessary libraries 

import pandas as pd
import numpy as np
import json
# plotting
import matplotlib.pyplot as plt
#opening data
import os
import pathlib
import warnings 
# Importing coord_to_bins from our python's file utilities
from utilities import coords_to_bins
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

## Opening data 
In this section we implement the Expected Threat model in
the same way described by [Karun Singh](https://karun.in/blog/expected-threat.html).
First, we open the data.



In [2]:
# Liste des fichiers JSON
file_names = ['events_England.json', 'events_France.json', 'events_Spain.json', 'events_Germany.json', 'events_Italy.json']

# Initialiser un DataFrame vide
df = pd.DataFrame()

# Boucler à travers les fichiers et les charger
for file_name in file_names:
    with open(file_name) as f:
        data = json.load(f)
    # Concaténer les nouvelles données au DataFrame existant
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

In [3]:
df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
3071390,3,Free kick cross,"[{'id': 801}, {'id': 1801}]",70974,"[{'y': 23, 'x': 75}, {'y': 65, 'x': 95}]",2576338,Free Kick,3193,2H,2870.982660,32,253567160
3071391,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",14745,"[{'y': 35, 'x': 5}, {'y': 36, 'x': 3}]",2576338,Duel,3185,2H,2872.101142,13,253567161
3071392,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",413041,"[{'y': 65, 'x': 95}, {'y': 64, 'x': 97}]",2576338,Duel,3193,2H,2872.990437,13,253567163
3071393,1,Air duel,"[{'id': 703}, {'id': 1801}]",20927,"[{'y': 36, 'x': 3}, {'y': 100, 'x': 100}]",2576338,Duel,3185,2H,2874.049206,10,253567162


## Actions moving the ball and shooting
To calculate the Expected Threat we need actions that move the ball. First we filter them
from the database.
To make our calculations easier we create new columns with coordinates,
one for each coordinate.
In the end we store number of actions in each bin in a *move_count* array to calculate later
move probability.



In [4]:
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]
# Creating a DataFrame with all actions involving ball movement

actions_df = df.loc[df['subEventName'].isin(['Simple pass', 'High pass', 'Head pass', 'Hand pass', 'Smart pass', 'Cross', 'Acceleration','Shot'])]

#extracting coordinates
actions_df["x"] = actions_df.positions.apply(lambda cell: (cell[0]['x']))
actions_df["y"] = actions_df.positions.apply(lambda cell: (cell[0]['y']))
actions_df["end_x"] = actions_df.positions.apply(lambda cell: (cell[1]['x']))
actions_df["end_y"] = actions_df.positions.apply(lambda cell: (cell[1]['y']))

actions_df['start_bins'] =coords_to_bins(actions_df, 'x', 'y')
actions_df['end_bins'] =coords_to_bins(actions_df, 'end_x', 'end_y')
# Creating two DataFrames:
# move_df is actions_df excluding actions where the ball went out of the field
# shot_df is actions_df including only the shots

move_df = actions_df[actions_df['subEventName'] != "Shot"]
move_df["kickedOut"] = move_df.apply(lambda x: 1 if x.nextEvent == "Ball out of the field" else 0, axis = 1)
move_df = move_df.loc[(((move_df["end_x"] != 0) & (move_df["end_y"] != 100)) & ((move_df["end_x"] != 100) & (move_df["end_y"] != 0)))]

#filtering out of the field
delete_passes = move_df.loc[move_df["kickedOut"] == 1]
move_df = move_df.drop(delete_passes.index)

shot_df = actions_df[actions_df['subEventName'] == "Shot"]



In [5]:
actions_df['subEventName'].unique()

array(['Simple pass', 'High pass', 'Head pass', 'Smart pass', 'Cross',
       'Shot', 'Hand pass', 'Acceleration'], dtype=object)

In [6]:
move_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins,kickedOut
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,High pass,49,49,31,78,44,37,0
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,Head pass,31,78,51,75,37,57,0
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,Head pass,51,75,35,71,57,37,0
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,Simple pass,35,71,41,95,37,49,0
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,Simple pass,41,95,72,88,49,78,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071375,8,Simple pass,[{'id': 1801}],70974,"[{'y': 58, 'x': 47}, {'y': 55, 'x': 35}]",2576338,Pass,3193,2H,2795.993433,85,253567130,Simple pass,47,58,35,55,45,35,0
3071376,8,Simple pass,[{'id': 1801}],349102,"[{'y': 55, 'x': 35}, {'y': 60, 'x': 33}]",2576338,Pass,3193,2H,2797.609376,85,253567133,Ground attacking duel,35,55,33,60,35,36,0
3071379,8,Simple pass,[{'id': 1801}],413041,"[{'y': 57, 'x': 25}, {'y': 43, 'x': 31}]",2576338,Pass,3193,2H,2801.914483,85,253567136,Touch,25,57,31,43,25,34,0
3071381,8,High pass,[{'id': 1801}],206318,"[{'y': 48, 'x': 46}, {'y': 87, 'x': 61}]",2576338,Pass,3193,2H,2808.430235,83,253567145,Touch,46,48,61,87,44,68,0


In [7]:
shot_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,Reflexes,88,41,0,0,84,0
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,Ball out of the field,85,52,100,100,85,99
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280,Reflexes,96,52,100,100,95,99
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,Ball out of the field,81,33,0,0,83,0
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,Save attempt,75,30,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070893,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",116269,"[{'y': 45, 'x': 95}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1152.032980,100,253566542,Ball out of the field,95,45,0,0,94,0
3070927,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1212}, {'id'...",3548,"[{'y': 38, 'x': 93}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1251.730517,100,253566586,Touch,93,38,0,0,93,0
3071192,10,Shot,"[{'id': 101}, {'id': 401}, {'id': 201}, {'id':...",21177,"[{'y': 46, 'x': 90}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2065.034482,100,253566910,Reflexes,90,46,0,0,94,0
3071241,10,Shot,"[{'id': 402}, {'id': 1212}, {'id': 1802}]",349102,"[{'y': 32, 'x': 79}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2367.252041,100,253566961,Ball out of the field,79,32,0,0,73,0


In [8]:
# Calculer le nombre de mouvements de chaque bin de départ à chaque bin d'arrivée
move_counts = move_df.groupby(['start_bins', 'end_bins']).size().reset_index(name='counts')

# Calculer le nombre total de mouvements depuis chaque bin de départ
total_moves = move_df.groupby(['start_bins']).size().reset_index(name='total_counts')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
move_stats = pd.merge(move_counts, total_moves, on='start_bins')

# Calculer les probabilités de mouvement
move_stats['probability'] = move_stats['counts'] / move_stats['total_counts']

# Conserver uniquement les colonnes nécessaires
moving_probabilities = move_stats[['start_bins', 'end_bins', 'probability']]
moving_probabilities


Unnamed: 0,start_bins,end_bins,probability
0,0,0,0.102794
1,0,1,0.077345
2,0,2,0.038922
3,0,3,0.025948
4,0,4,0.017465
...,...,...,...
8355,99,95,0.111208
8356,99,96,0.092149
8357,99,97,0.046861
8358,99,98,0.048085


In [9]:
moving_probabilities.groupby(['start_bins']).sum()

Unnamed: 0_level_0,end_bins,probability
start_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2614,1.0
1,2951,1.0
2,3409,1.0
3,3355,1.0
4,3908,1.0
...,...,...
95,2292,1.0
96,3406,1.0
97,4087,1.0
98,4176,1.0


### shooting probability from each bins

In [10]:
# Ajouter une colonne binaire 'is_shot'
actions_df['is_shot'] = actions_df['subEventName'].apply(lambda x: 1 if x == "Shot" else 0)

# Calculer le nombre total d'actions par bin de départ
total_actions_by_bin = actions_df.groupby('start_bins').size().reset_index(name='total_actions')

# Calculer le nombre de tirs par bin de départ
total_shots_by_bin = actions_df.groupby('start_bins')['is_shot'].sum().reset_index(name='total_shots')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
shot_stats = pd.merge(total_shots_by_bin, total_actions_by_bin, on='start_bins')

# Calculer les probabilités de tir
shot_stats['shot_probability'] = shot_stats['total_shots'] / shot_stats['total_actions']

# Conserver uniquement les colonnes nécessaires
shot_probabilities = shot_stats[['start_bins', 'shot_probability']]

shot_probabilities


Unnamed: 0,start_bins,shot_probability
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,0.001017
4,4,0.000737
...,...,...
95,95,0.919335
96,96,0.579714
97,97,0.100539
98,98,0.002378


### Importing our xG model

In [11]:
import pickle
with open('xG_model.pkl', 'rb') as file:
    xg_model = pickle.load(file)

### Calculation of the expected threat

In [25]:
def expected_threat(moving_probabilities, shot_probabilities, xg_model, features_dict, n_bins=100, max_iter=10, epsilon=1e-6):
    # Get the feature names from the xG model
    feature_names = xg_model.feature_names_in_

    # Ensure all required features are present in features_dict
    required_features = {'score_17_18', 'possession_percent', 'total_time', '2H'}
    if not required_features.issubset(features_dict.keys()):
        raise ValueError(f"Missing required features: {required_features - features_dict.keys()}")

    # Prepare features for the xG model including one-hot encoding for the bins
    xg_features_list = []
    for bin_start in range(n_bins):
        xg_features = features_dict.copy()
        for i in range(n_bins):
            xg_features[f'bin_{i}'] = 1 if i == bin_start else 0
        xg_features_list.append(xg_features)

    # Convert to DataFrame and ensure the correct order of columns
    xg_features_df = pd.DataFrame(xg_features_list, columns=feature_names)
    p_scoring = xg_model.predict_proba(xg_features_df)[:, 1]

    # Get the shot probabilities as a numpy array
    shot_probs = shot_probabilities['shot_probability'].values

    # Calculate the move probabilities matrix
    move_probs_matrix = np.zeros((n_bins, n_bins))
    for idx, row in moving_probabilities.iterrows():
        start_bin = int(row['start_bins'])
        end_bin = int(row['end_bins'])
        move_probs_matrix[start_bin, end_bin] = row['probability']

    # Initialize the xT values
    xT = np.zeros(n_bins)
    
    for _ in range(max_iter):
        xT_old = xT.copy()
        # Calculate the new xT values using matrix operations
        new_xT = p_scoring * shot_probs + np.dot(move_probs_matrix, xT_old * (1 - shot_probs))
        
        # Check for convergence
        if np.max(np.abs(new_xT - xT)) < epsilon:
            break
        
        xT = new_xT
    
    return xT


### Example for testing our function 

In [26]:
# Example of how to call the function with your data
features_of_xg_model = {
    'score_17_18': 179.605,
    'possession_percent': 55.0,
    'total_time': 5800.0,
    '2H': 1
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, features_of_xg_model)
xT

array([0.06871797, 0.06719303, 0.06762882, 0.06828371, 0.06824878,
       0.06892604, 0.06949644, 0.06905366, 0.06863295, 0.07113295,
       0.0740489 , 0.07134111, 0.07078199, 0.07172419, 0.07335101,
       0.07431866, 0.07321041, 0.07191469, 0.0728867 , 0.07642279,
       0.08064949, 0.07833905, 0.077273  , 0.07773148, 0.07852368,
       0.07866593, 0.07841484, 0.07853766, 0.08029933, 0.0829681 ,
       0.08817265, 0.08830938, 0.08919767, 0.08975842, 0.08954599,
       0.08976833, 0.09041182, 0.09088882, 0.09079061, 0.09114108,
       0.09658545, 0.09941963, 0.10128357, 0.10202023, 0.10399824,
       0.10491441, 0.10338692, 0.10307722, 0.1020077 , 0.09981233,
       0.1087896 , 0.1125904 , 0.11620853, 0.1183938 , 0.11788039,
       0.11554517, 0.12027676, 0.11774303, 0.11506243, 0.1120352 ,
       0.12138048, 0.12632662, 0.13317776, 0.13898633, 0.14508984,
       0.14641974, 0.13957211, 0.13388009, 0.12828795, 0.12332221,
       0.13508535, 0.14407776, 0.1719088 , 0.20062848, 0.22335

In [28]:
# Example of how to call the function with your data
features_of_xg_model = {
    'score_17_18': 213,
    'possession_percent': 34.0,
    'total_time': 5750.0,
    '2H': 0
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, features_of_xg_model)
xT

array([0.06692352, 0.06542161, 0.06582532, 0.06645339, 0.06638477,
       0.06700992, 0.06753725, 0.06707522, 0.06665262, 0.06906439,
       0.07213036, 0.0694594 , 0.06888685, 0.06978251, 0.07134617,
       0.07225258, 0.07114979, 0.06986084, 0.07078343, 0.07420422,
       0.07857549, 0.07628279, 0.07520851, 0.07562322, 0.07636879,
       0.07647628, 0.07621187, 0.07629908, 0.07798193, 0.08055368,
       0.08592538, 0.08602871, 0.08684438, 0.08734437, 0.08709832,
       0.08728583, 0.0878708 , 0.08829992, 0.08817962, 0.08849473,
       0.09416149, 0.09687804, 0.09863989, 0.09929201, 0.10116274,
       0.10200391, 0.10048315, 0.10014076, 0.09907299, 0.09692672,
       0.10611959, 0.109779  , 0.11320179, 0.11521147, 0.11462029,
       0.11227815, 0.11682846, 0.11435599, 0.1117386 , 0.10880276,
       0.11850037, 0.12325742, 0.12981575, 0.13542688, 0.14106215,
       0.14191509, 0.13542051, 0.12996984, 0.12457374, 0.11977763,
       0.13203932, 0.14069956, 0.16936593, 0.1922391 , 0.21430