
# Calculating xT (position-based)
Calculating Expected Threat


In [1]:
#importing necessary libraries 

import pandas as pd
import numpy as np
import json
# plotting
import matplotlib.pyplot as plt
#opening data
import os
import pathlib
import warnings 
# Importing coords_to_bins from our local utilities.py module
from utilities import coords_to_bins
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

## Opening data 
In this section we implement the Expected Threat model
as described by [Karun Singh](https://karun.in/blog/expected-threat.html).
First, we open the data.



In [2]:
# List of the JSON event files to load (one per league)
file_names = ['events_England.json', 'events_France.json', 'events_Spain.json', 'events_Germany.json', 'events_Italy.json']

# Read every file into its own DataFrame first. Growing `df` with pd.concat
# inside the loop would re-copy all previously loaded rows on every iteration.
league_frames = []
for file_name in file_names:
    # JSON text is UTF-8 by specification, so do not rely on the platform default encoding
    with open(file_name, encoding='utf-8') as f:
        data = json.load(f)
    league_frames.append(pd.DataFrame(data))

# Concatenate all leagues once, renumbering the rows from 0
df = pd.concat(league_frames, ignore_index=True)

In [3]:
# Preview the combined events table (all leagues concatenated above)
df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
3071390,3,Free kick cross,"[{'id': 801}, {'id': 1801}]",70974,"[{'y': 23, 'x': 75}, {'y': 65, 'x': 95}]",2576338,Free Kick,3193,2H,2870.982660,32,253567160
3071391,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",14745,"[{'y': 35, 'x': 5}, {'y': 36, 'x': 3}]",2576338,Duel,3185,2H,2872.101142,13,253567161
3071392,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",413041,"[{'y': 65, 'x': 95}, {'y': 64, 'x': 97}]",2576338,Duel,3193,2H,2872.990437,13,253567163
3071393,1,Air duel,"[{'id': 703}, {'id': 1801}]",20927,"[{'y': 36, 'x': 3}, {'y': 100, 'x': 100}]",2576338,Duel,3185,2H,2874.049206,10,253567162


## Actions moving the ball and shooting
To calculate the Expected Threat we need the actions that move the ball, so we first filter them
from the database.
To make our calculations easier we create new columns with the coordinates,
one for each coordinate.
Finally, we count the actions per bin (stored in *move_counts*) so that the
move probabilities can be calculated later.



In [4]:
# Attach to every event the sub-event name of the row that follows it.
# The last row has no successor and receives the fill value 0.
# NOTE(review): the shift is applied to the whole concatenated table, so the last
# event of one match is paired with the first event of the next — confirm this is acceptable.
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]

# Keep only the actions that move the ball (passes, crosses, accelerations) plus shots
ball_moving_actions = ['Simple pass', 'High pass', 'Head pass', 'Hand pass', 'Smart pass', 'Cross', 'Acceleration', 'Shot']
actions_df = df.loc[df['subEventName'].isin(ball_moving_actions)]

# Split the `positions` cell (a [start, end] pair of {'x', 'y'} dicts) into one column per coordinate
for column, point, axis in [("x", 0, "x"), ("y", 0, "y"), ("end_x", 1, "x"), ("end_y", 1, "y")]:
    actions_df[column] = [cell[point][axis] for cell in actions_df["positions"]]

# Map start and end coordinates to pitch bins
actions_df['start_bins'] = coords_to_bins(actions_df, 'x', 'y')
actions_df['end_bins'] = coords_to_bins(actions_df, 'end_x', 'end_y')

# Two derived frames:
#   move_df - every non-shot action whose ball stayed in play
#   shot_df - the shots only
move_df = actions_df[actions_df['subEventName'] != "Shot"]

# Flag moves that are immediately followed by a "Ball out of the field" event
move_df["kickedOut"] = (move_df["nextEvent"] == "Ball out of the field").astype("int64")

# Discard moves whose end point lies on any pitch boundary (x or y equal to 0 or 100)
ends_on_boundary = move_df["end_x"].isin([0, 100]) | move_df["end_y"].isin([0, 100])
move_df = move_df.loc[~ends_on_boundary]

# Discard the moves that were kicked out of the field
delete_passes = move_df.loc[move_df["kickedOut"] == 1]
move_df = move_df.loc[~move_df.index.isin(delete_passes.index)]

shot_df = actions_df.loc[actions_df['subEventName'].eq("Shot")]



In [5]:
# Check which sub-event types survived the filter
actions_df['subEventName'].unique()

array(['Simple pass', 'High pass', 'Head pass', 'Smart pass', 'Cross',
       'Shot', 'Hand pass', 'Acceleration'], dtype=object)

In [6]:
# Preview the ball-moving actions that stayed in play
move_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins,kickedOut
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,High pass,49,49,31,78,44,37,0
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,Head pass,31,78,51,75,37,57,0
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,Head pass,51,75,35,71,57,37,0
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,Simple pass,35,71,41,95,37,49,0
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,Simple pass,41,95,72,88,49,78,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071375,8,Simple pass,[{'id': 1801}],70974,"[{'y': 58, 'x': 47}, {'y': 55, 'x': 35}]",2576338,Pass,3193,2H,2795.993433,85,253567130,Simple pass,47,58,35,55,45,35,0
3071376,8,Simple pass,[{'id': 1801}],349102,"[{'y': 55, 'x': 35}, {'y': 60, 'x': 33}]",2576338,Pass,3193,2H,2797.609376,85,253567133,Ground attacking duel,35,55,33,60,35,36,0
3071379,8,Simple pass,[{'id': 1801}],413041,"[{'y': 57, 'x': 25}, {'y': 43, 'x': 31}]",2576338,Pass,3193,2H,2801.914483,85,253567136,Touch,25,57,31,43,25,34,0
3071381,8,High pass,[{'id': 1801}],206318,"[{'y': 48, 'x': 46}, {'y': 87, 'x': 61}]",2576338,Pass,3193,2H,2808.430235,83,253567145,Touch,46,48,61,87,44,68,0


In [7]:
# Preview the shots
shot_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,Reflexes,88,41,0,0,84,0
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,Ball out of the field,85,52,100,100,85,99
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280,Reflexes,96,52,100,100,95,99
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,Ball out of the field,81,33,0,0,83,0
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,Save attempt,75,30,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070893,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",116269,"[{'y': 45, 'x': 95}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1152.032980,100,253566542,Ball out of the field,95,45,0,0,94,0
3070927,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1212}, {'id'...",3548,"[{'y': 38, 'x': 93}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1251.730517,100,253566586,Touch,93,38,0,0,93,0
3071192,10,Shot,"[{'id': 101}, {'id': 401}, {'id': 201}, {'id':...",21177,"[{'y': 46, 'x': 90}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2065.034482,100,253566910,Reflexes,90,46,0,0,94,0
3071241,10,Shot,"[{'id': 402}, {'id': 1212}, {'id': 1802}]",349102,"[{'y': 32, 'x': 79}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2367.252041,100,253566961,Ball out of the field,79,32,0,0,73,0


In [8]:
# Count the moves going from each start bin to each end bin
move_counts = (
    move_df.groupby(['start_bins', 'end_bins'])
    .size()
    .rename('counts')
    .reset_index()
)

# Count all moves leaving each start bin
total_moves = (
    move_df.groupby(['start_bins'])
    .size()
    .rename('total_counts')
    .reset_index()
)

# Attach the per-start-bin total to every (start bin, end bin) row
move_stats = move_counts.merge(total_moves, on='start_bins')

# Move probability = moves to the end bin / all moves from the start bin.
# NOTE(review): move_df no longer contains the moves filtered out earlier (kicked out /
# ending on a boundary), so the probabilities of each start bin sum to 1 — confirm that
# this normalisation is the one intended for the xT model.
move_stats['probability'] = move_stats['counts'].div(move_stats['total_counts'])

# Keep only the columns needed downstream
moving_probabilities = move_stats.loc[:, ['start_bins', 'end_bins', 'probability']]
moving_probabilities


Unnamed: 0,start_bins,end_bins,probability
0,0,0,0.102794
1,0,1,0.077345
2,0,2,0.038922
3,0,3,0.025948
4,0,4,0.017465
...,...,...,...
8355,99,95,0.111208
8356,99,96,0.092149
8357,99,97,0.046861
8358,99,98,0.048085


In [9]:
# Sanity check: the `probability` column should sum to 1 for every start bin
# (the summed `end_bins` column in the same output carries no meaning).
moving_probabilities.groupby(['start_bins']).sum()

Unnamed: 0_level_0,end_bins,probability
start_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2614,1.0
1,2951,1.0
2,3409,1.0
3,3355,1.0
4,3908,1.0
...,...,...
95,2292,1.0
96,3406,1.0
97,4087,1.0
98,4176,1.0


### Shooting probability from each bin

In [10]:
# Binary flag: 1 when the action is a shot, 0 otherwise
actions_df['is_shot'] = actions_df['subEventName'].eq("Shot").astype('int64')

# Number of actions (moves and shots) starting in each bin
total_actions_by_bin = (
    actions_df.groupby('start_bins')
    .size()
    .rename('total_actions')
    .reset_index()
)

# Number of shots taken from each bin
total_shots_by_bin = (
    actions_df.groupby('start_bins')['is_shot']
    .sum()
    .rename('total_shots')
    .reset_index()
)

# Put both counts side by side, one row per start bin
shot_stats = total_shots_by_bin.merge(total_actions_by_bin, on='start_bins')

# Shot probability = shots from the bin / all actions from the bin
shot_stats['shot_probability'] = shot_stats['total_shots'].div(shot_stats['total_actions'])

# Keep only the columns needed downstream
shot_probabilities = shot_stats.loc[:, ['start_bins', 'shot_probability']]

shot_probabilities


Unnamed: 0,start_bins,shot_probability
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,0.001017
4,4,0.000737
...,...,...
95,95,0.919335
96,96,0.579714
97,97,0.100539
98,98,0.002378


### Importing our xG model

In [11]:
# Load the serialized xG model from disk.
# NOTE: unpickling can execute arbitrary code — only load 'xG_model.pkl' if it comes
# from a trusted source.
import pickle
with open('xG_model.pkl', 'rb') as file:
    xg_model = pickle.load(file)

### Calculation of the expected threat

In [18]:
def _first_value_lookup(frame, key_columns, value_column):
    """Map each key (a scalar, or a tuple for several key columns) to the value of its first row."""
    lookup = {}
    keys = zip(*(frame[column].tolist() for column in key_columns))
    for key, value in zip(keys, frame[value_column].tolist()):
        # setdefault keeps the first occurrence, matching a "take the first matching row" lookup
        lookup.setdefault(key if len(key_columns) > 1 else key[0], value)
    return lookup


def expected_threat(moving_probabilities, shot_probabilities, xg_model, features_dict, n_bins=100, max_iter=10, epsilon=1e-6):
    """Iteratively compute the Expected Threat (xT) value of every bin.

    For each bin the update is::

        xT[b] = p_scoring[b] * p_shot[b] + (1 - p_shot[b]) * sum_e(move_prob[b, e] * xT[e])

    Bins are updated in place, so within one sweep a bin already sees the new
    values of the bins with a lower index.

    Parameters
    ----------
    moving_probabilities : pandas.DataFrame
        Columns ``start_bins``, ``end_bins`` and ``probability``. Missing
        (start, end) pairs count as probability 0; if a pair appears more than
        once its first row is used.
    shot_probabilities : pandas.DataFrame
        Columns ``start_bins`` and ``shot_probability``. Missing bins count as 0.
    xg_model : object
        Must expose ``feature_names_in_`` and ``predict_proba``; column 1 of the
        ``predict_proba`` output is used as the scoring probability.
    features_dict : dict
        Values for the non-bin model features. Must contain ``score_17_18``,
        ``possession_percent``, ``total_time`` and ``2H``.
    n_bins : int, default 100
        Number of bins; bins are numbered ``0 .. n_bins - 1``.
    max_iter : int, default 10
        Maximum number of sweeps over all bins.
    epsilon : float, default 1e-6
        Stop early once the summed absolute change of a sweep falls below this.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_bins`` with the xT value of each bin.

    Raises
    ------
    ValueError
        If one of the required features is missing from ``features_dict``.
    KeyError
        If the model expects a non-bin feature that ``features_dict`` lacks.
    """
    # Get the feature names from the xG model
    feature_names = list(xg_model.feature_names_in_)

    # Ensure all required features are present in features_dict
    required_features = {'score_17_18', 'possession_percent', 'total_time', '2H'}
    if not required_features.issubset(features_dict.keys()):
        raise ValueError(f"Missing required features: {required_features - features_dict.keys()}")

    # One-hot bin columns the model expects. Taking them from the model (instead of a
    # hard-coded range(100)) keeps the function usable for any n_bins.
    bin_flag_names = [
        name for name in feature_names
        if isinstance(name, str) and name.startswith('bin_') and name[4:].isdigit()
    ]

    # The scoring probability of a bin does not depend on the iteration, so build
    # one feature row per bin and query the model once for all of them.
    feature_rows = []
    for bin_start in range(n_bins):
        row = dict(features_dict)
        row.update(dict.fromkeys(bin_flag_names, 0))
        row[f'bin_{bin_start}'] = 1
        # Keep only the model's features, in the order the model expects
        feature_rows.append({name: row[name] for name in feature_names})

    if feature_rows:
        xg_features_df = pd.DataFrame(feature_rows, columns=feature_names)
        p_scoring = np.asarray(xg_model.predict_proba(xg_features_df))[:, 1]
    else:
        p_scoring = np.zeros(0)

    # Shot probability per bin (0 when the bin has no entry); the remainder is the move probability
    shot_lookup = _first_value_lookup(shot_probabilities, ['start_bins'], 'shot_probability')
    p_shot = np.array([shot_lookup.get(bin_start, 0) for bin_start in range(n_bins)], dtype=float)
    p_move = 1 - p_shot

    # Dense transition matrix: transition[s, e] is the probability of moving from bin s to bin e
    # (0 when the pair has no entry). Built once instead of filtering the frame for every pair.
    move_lookup = _first_value_lookup(moving_probabilities, ['start_bins', 'end_bins'], 'probability')
    transition = np.zeros((n_bins, n_bins))
    for bin_start in range(n_bins):
        for bin_end in range(n_bins):
            transition[bin_start, bin_end] = move_lookup.get((bin_start, bin_end), 0)

    # Initialize the xT values as a numpy array of zeros
    xT = np.zeros(n_bins)

    for _ in range(max_iter):
        total_diff = 0  # Summed absolute change of this sweep, for the convergence check

        for bin_start in range(n_bins):
            # Expected xT of the bin the ball ends up in after a move from bin_start
            total_payoff = transition[bin_start] @ xT

            new_xT = p_scoring[bin_start] * p_shot[bin_start] + (p_move[bin_start] * total_payoff)
            total_diff += abs(new_xT - xT[bin_start])
            xT[bin_start] = new_xT  # In-place update: later bins of this sweep use the new value

        # Check for convergence
        if total_diff < epsilon:
            break

    return xT

 


### Example for testing our function 

In [19]:
# Example call: values for the non-bin features of the xG model
# (the per-bin one-hot features are filled in by expected_threat itself)
features_of_xg_model = {
    'score_17_18': 179.605,
    'possession_percent': 55.0,
    'total_time': 5800.0,
    '2H': 1,
}

# Run with the default number of bins, iteration cap and tolerance
xT = expected_threat(
    moving_probabilities=moving_probabilities,
    shot_probabilities=shot_probabilities,
    xg_model=xg_model,
    features_dict=features_of_xg_model,
)
xT

array([0.2945578 , 0.29414207, 0.29907474, 0.3029315 , 0.30511403,
       0.3094167 , 0.31383873, 0.31761863, 0.32137807, 0.32972209,
       0.31887157, 0.31625894, 0.31649652, 0.31880089, 0.32322135,
       0.32722351, 0.32813277, 0.33004614, 0.33591038, 0.34679491,
       0.33899121, 0.3371991 , 0.33703978, 0.33980118, 0.34331205,
       0.34609456, 0.34835064, 0.35147247, 0.357879  , 0.36626464,
       0.36259843, 0.36529006, 0.36845761, 0.37136106, 0.37328969,
       0.37576459, 0.37925845, 0.38276058, 0.38587665, 0.38992033,
       0.38689529, 0.39289016, 0.39657428, 0.39898515, 0.40309692,
       0.40663248, 0.4072242 , 0.40972524, 0.41111063, 0.41124757,
       0.41317904, 0.41921053, 0.42394638, 0.42722814, 0.42708931,
       0.42637879, 0.43519429, 0.43524727, 0.4351525 , 0.43503479,
       0.43878508, 0.44524859, 0.45051252, 0.45338809, 0.45699214,
       0.45988817, 0.45903438, 0.46052251, 0.45979558, 0.45811064,
       0.47153711, 0.47955613, 0.47867829, 0.46557342, 0.46630