
# Calculating xT (position-based)
Calculating Expected Threat


In [1]:
#importing necessary libraries 

import pandas as pd
import numpy as np
import json
import pickle 
#opening data
import os
import pathlib
import warnings 
# Importing coord_to_bins from our python's file utilities
from utilities import calculate_bin_number, generate_bins_properties
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore') 

## Opening data 
In this section we implement the Expected Threat model in
the same way described by [Karun Singh](https://karun.in/blog/expected-threat.html).
First, we open the data.



In [2]:
# Liste des fichiers JSON
file_names = ['events_England.json', 'events_France.json', 'events_Spain.json', 'events_Germany.json', 'events_Italy.json']

# Initialiser un DataFrame vide
df = pd.DataFrame()

# Boucler à travers les fichiers et les charger
for file_name in file_names:
    with open(file_name) as f:
        data = json.load(f)
    # Concaténer les nouvelles données au DataFrame existant
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

In [3]:
df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
3071390,3,Free kick cross,"[{'id': 801}, {'id': 1801}]",70974,"[{'y': 23, 'x': 75}, {'y': 65, 'x': 95}]",2576338,Free Kick,3193,2H,2870.982660,32,253567160
3071391,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",14745,"[{'y': 35, 'x': 5}, {'y': 36, 'x': 3}]",2576338,Duel,3185,2H,2872.101142,13,253567161
3071392,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",413041,"[{'y': 65, 'x': 95}, {'y': 64, 'x': 97}]",2576338,Duel,3193,2H,2872.990437,13,253567163
3071393,1,Air duel,"[{'id': 703}, {'id': 1801}]",20927,"[{'y': 36, 'x': 3}, {'y': 100, 'x': 100}]",2576338,Duel,3185,2H,2874.049206,10,253567162


## Actions moving the ball and shooting
To calculate the Expected Threat we need Creating a DataFrame with all actions involving ball movement. First we filter them
from the database.
To make our calculations easier we create new columns with coordinates,
one for each coordinate.
In the end we store number of actions in each bin in a **move_counts** array to calculate later
**move probability** and more.



In [4]:
# Calculer l'événement suivant
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]

# Créer un DataFrame avec toutes les actions impliquant un mouvement du ballon
actions_df = df.loc[df['subEventName'].isin(['Simple pass', 'High pass', 'Head pass', 'Hand pass', 'Smart pass', 'Cross', 'Acceleration', 'Shot'])]

# Extraire les coordonnées
actions_df["x"] = actions_df.positions.apply(lambda cell: (cell[0]['x']*105/100))
actions_df["y"] = actions_df.positions.apply(lambda cell: (cell[0]['y']*68/100))
actions_df["end_x"] = actions_df.positions.apply(lambda cell: (cell[1]['x']*105/100))
actions_df["end_y"] = actions_df.positions.apply(lambda cell: (cell[1]['y']*68/100))

# Calculer les bins de départ et d'arrivée
actions_df['start_bins']= calculate_bin_number(actions_df, 'x', 'y')
actions_df['end_bins']= calculate_bin_number(actions_df, 'end_x', 'end_y')
# Créer deux DataFrames :
# move_df exclut les actions où le ballon est sorti du terrain
# shot_df inclut uniquement les tirs

move_df = actions_df[actions_df['subEventName'] != "Shot"]
move_df["kickedOut"] = move_df.apply(lambda x: 1 if x.nextEvent == "Ball out of the field" else 0, axis=1)
move_df = move_df.loc[(((move_df["end_x"] != 0) & (move_df["end_y"] != 68)) & ((move_df["end_x"] != 105) & (move_df["end_y"] != 0)))]

# Filtrer les passes hors du terrain
delete_passes = move_df.loc[move_df["kickedOut"] == 1]
move_df = move_df.drop(delete_passes.index)

shot_df = actions_df[actions_df['subEventName'] == "Shot"]



In [5]:
move_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins,kickedOut
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,High pass,51.45,33.32,32.55,53.04,89,57,0
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,Head pass,32.55,53.04,53.55,51.00,57,105,0
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,Head pass,53.55,51.00,36.75,48.28,105,68,0
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,Simple pass,36.75,48.28,43.05,64.60,68,83,0
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,Simple pass,43.05,64.60,75.60,59.84,83,142,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071375,8,Simple pass,[{'id': 1801}],70974,"[{'y': 58, 'x': 47}, {'y': 55, 'x': 35}]",2576338,Pass,3193,2H,2795.993433,85,253567130,Simple pass,49.35,39.44,36.75,37.40,90,66,0
3071376,8,Simple pass,[{'id': 1801}],349102,"[{'y': 55, 'x': 35}, {'y': 60, 'x': 33}]",2576338,Pass,3193,2H,2797.609376,85,253567133,Ground attacking duel,36.75,37.40,34.65,40.80,66,67,0
3071379,8,Simple pass,[{'id': 1801}],413041,"[{'y': 57, 'x': 25}, {'y': 43, 'x': 31}]",2576338,Pass,3193,2H,2801.914483,85,253567136,Touch,26.25,38.76,32.55,29.24,54,53,0
3071381,8,High pass,[{'id': 1801}],206318,"[{'y': 48, 'x': 46}, {'y': 87, 'x': 61}]",2576338,Pass,3193,2H,2808.430235,83,253567145,Touch,48.30,32.64,64.05,59.16,89,118,0


In [6]:
shot_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,Reflexes,92.40,27.88,0.0,0.0,172,0
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,Ball out of the field,89.25,35.36,105.0,68.0,162,191
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280,Reflexes,100.80,35.36,105.0,68.0,186,191
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,Ball out of the field,85.05,22.44,0.0,0.0,147,0
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,Save attempt,78.75,20.40,0.0,0.0,147,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070893,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",116269,"[{'y': 45, 'x': 95}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1152.032980,100,253566542,Ball out of the field,99.75,30.60,0.0,0.0,185,0
3070927,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1212}, {'id'...",3548,"[{'y': 38, 'x': 93}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1251.730517,100,253566586,Touch,97.65,25.84,0.0,0.0,172,0
3071192,10,Shot,"[{'id': 101}, {'id': 401}, {'id': 201}, {'id':...",21177,"[{'y': 46, 'x': 90}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2065.034482,100,253566910,Reflexes,94.50,31.28,0.0,0.0,173,0
3071241,10,Shot,"[{'id': 402}, {'id': 1212}, {'id': 1802}]",349102,"[{'y': 32, 'x': 79}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2367.252041,100,253566961,Ball out of the field,82.95,21.76,0.0,0.0,147,0


In [7]:
n_bins = max(move_df['start_bins'].max(), move_df['end_bins'].max()) + 1

In [8]:
bins = np.arange(n_bins)
all_combinations = pd.MultiIndex.from_product([bins, bins], names=['start_bins', 'end_bins']).to_frame(index=False)

# Calculer le nombre de mouvements de chaque bin de départ à chaque bin d'arrivée
move_counts = move_df.groupby(['start_bins', 'end_bins']).size().reset_index(name='counts')

# Fusionner les combinaisons possibles avec les mouvements réels
all_moves = pd.merge(all_combinations, move_counts, on=['start_bins', 'end_bins'], how='left').fillna(0)

# Calculer le nombre total de mouvements depuis chaque bin de départ
total_moves = move_df.groupby(['start_bins']).size().reset_index(name='total_counts')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
move_stats = pd.merge(all_moves, total_moves, on='start_bins', how='left').fillna(0)

# Calculer les probabilités de mouvement
move_stats['probability'] = move_stats['counts'] / move_stats['total_counts']

# Conserver uniquement les colonnes nécessaires
moving_probabilities = move_stats[['start_bins', 'end_bins', 'probability']]


moving_probabilities



Unnamed: 0,start_bins,end_bins,probability
0,0,0,0.065960
1,0,1,0.043581
2,0,2,0.029446
3,0,3,0.022379
4,0,4,0.010601
...,...,...,...
36859,191,187,0.051893
36860,191,188,0.029304
36861,191,189,0.017094
36862,191,190,0.026862


In [9]:
moving_probabilities['probability'].values.reshape((n_bins,n_bins), order = 'C')

array([[0.06595995, 0.04358068, 0.02944641, ..., 0.        , 0.        ,
        0.        ],
       [0.032     , 0.0352    , 0.0256    , ..., 0.        , 0.        ,
        0.        ],
       [0.02292264, 0.03008596, 0.01002865, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00052966, 0.        , 0.        , ..., 0.01324153, 0.01112288,
        0.01006356],
       [0.        , 0.        , 0.        , ..., 0.01116313, 0.02016565,
        0.01872524],
       [0.        , 0.        , 0.        , ..., 0.01709402, 0.02686203,
        0.04273504]])

In [10]:
moving_probabilities.groupby(['start_bins']).sum()

Unnamed: 0_level_0,end_bins,probability
start_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,18336,1.0
1,18336,1.0
2,18336,1.0
3,18336,1.0
4,18336,1.0
...,...,...
187,18336,1.0
188,18336,1.0
189,18336,1.0
190,18336,1.0


### shooting probability from each bins

In [11]:
# Ajouter une colonne binaire 'is_shot'
actions_df['is_shot'] = actions_df['subEventName'].apply(lambda x: 1 if x == "Shot" else 0)

# Calculer le nombre total d'actions par bin de départ
total_actions_by_bin = actions_df.groupby('start_bins').size().reset_index(name='total_actions')

# Calculer le nombre de tirs par bin de départ
total_shots_by_bin = actions_df.groupby('start_bins')['is_shot'].sum().reset_index(name='total_shots')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
shot_stats = pd.merge(total_shots_by_bin, total_actions_by_bin, on='start_bins')

# Calculer les probabilités de tir
shot_stats['shot_probability'] = shot_stats['total_shots'] / shot_stats['total_actions']

# Conserver uniquement les colonnes nécessaires
shot_probabilities = shot_stats[['start_bins', 'shot_probability']]

shot_probabilities


Unnamed: 0,start_bins,shot_probability
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,0.000000
4,4,0.002110
...,...,...
187,187,0.640893
188,188,0.141449
189,189,0.006035
190,190,0.001088


In [12]:
# Charger le modèle xG depuis un fichier pkl
with open('xG_model.pkl', 'rb') as file:
    xg_model = pickle.load(file)

### Calculation of the expected threat

In [43]:
def expected_threat(moving_probabilities, shot_probabilities, xg_model, features_dict, bins=(16, 12), max_iter=10, epsilon=1e-6):
    # Calculer le nombre total de bins
    total_bins = bins[0] * bins[1]

    # Vérifier si un numéro de bin est fourni dans features_dict
    if 'bin_number' not in features_dict:
        # Si 'x' et 'y' sont fournis, calculer les bins correspondants
        if 'x' in features_dict and 'y' in features_dict:
            bins_properties = generate_bins_properties(pd.DataFrame([features_dict]), 'x', 'y', bins=bins)
            # Extraire le numéro de bin
            bin_number = bins_properties.iloc[0]['bin_number']
            # Ajouter les propriétés des bins au dictionnaire features_dict
            features_dict.update(bins_properties.iloc[0].to_dict())
            # Supprimer les coordonnées x et y car elles ne sont pas dans les features du modèle
            del features_dict['x']
            del features_dict['y']
        else:
            raise ValueError("Les coordonnées x, y ou le numéro de bin doivent être fournis dans features_dict.")
    else:
        # Utiliser le numéro de bin fourni directement
        bin_number = features_dict['bin_number']

    # Vérifier si les features dict correspondent aux colonnes du modèle xG
    feature_names = list(xg_model.feature_names_in_)
    if not set(features_dict.keys()).issubset(feature_names):
        missing_features = set(features_dict.keys()) - set(feature_names)
        raise ValueError(f"Missing required features in the model: {missing_features}")

    # Préparer les features pour chaque bin, y compris le numéro du bin
    xg_features_list = []
    for bin_start in range(total_bins):
        features_with_bin = {**features_dict, 'bin_number': bin_start}
        xg_features_list.append(features_with_bin)

    # Convertir en DataFrame et s'assurer que l'ordre des colonnes correspond à celui du modèle
    xg_features_df = pd.DataFrame(xg_features_list, columns=feature_names)
    
    # Prédire les probabilités de marquer
    p_scoring = xg_model.predict_proba(xg_features_df)[:, 1]

    # Obtenir les probabilités de tir
    shot_probs = shot_probabilities['shot_probability'].values

    # Calculer la matrice de probabilités de mouvement
    move_probs_matrix = moving_probabilities['probability'].values.reshape((total_bins, total_bins), order='C')

    # Initialiser les valeurs xT
    xT = np.zeros(total_bins)
    
    for _ in range(max_iter):
        xT_old = xT.copy()
        # Calculer les nouvelles valeurs xT
        new_xT = p_scoring * shot_probs + np.dot(move_probs_matrix, xT_old * (1 - shot_probs))
        
        # Vérifier la convergence
        if np.max(np.abs(new_xT - xT)) < epsilon:
            break
        
        xT = new_xT
    
    return xT


### Example for testing our function 

In [44]:
user_features = {
    'x': 101,  # coordonnée x sur le terrain
    'y': 50,   # coordonnée y sur le terrain
    'possession_percent': 55.0,  # pourcentage de possession de l'équipe
    'time_played': 5800.0  # temps total de jeu en secondes
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, user_features)
xT


array([0.01037677, 0.01011099, 0.01002559, 0.01011366, 0.01042158,
       0.01025727, 0.01037827, 0.01036278, 0.01028483, 0.010444  ,
       0.01037528, 0.01056888, 0.01082756, 0.01064629, 0.01034571,
       0.01063521, 0.01065681, 0.01076158, 0.01086763, 0.01090407,
       0.01074322, 0.01060103, 0.0107159 , 0.01123086, 0.01136674,
       0.01100947, 0.01068289, 0.01090641, 0.01110275, 0.01138948,
       0.01146524, 0.01132741, 0.01096748, 0.01087768, 0.01128916,
       0.01170227, 0.0120043 , 0.0116029 , 0.01134462, 0.01128908,
       0.01125463, 0.01138027, 0.01137029, 0.01135805, 0.01134895,
       0.01140007, 0.01183932, 0.01217957, 0.01261856, 0.01253946,
       0.01225978, 0.01232217, 0.01245154, 0.01250667, 0.01252759,
       0.01251131, 0.01243741, 0.01243699, 0.0127485 , 0.01292405,
       0.01346984, 0.01350104, 0.01357894, 0.01373133, 0.01377658,
       0.013746  , 0.01378396, 0.01380869, 0.01383522, 0.01378112,
       0.01376342, 0.01373907, 0.01428908, 0.01460079, 0.01480

In [45]:
# Example of how to call the function with your data
user_features = {
    'bin_number' : 154,
    'possession_percent': 62.0,  # pourcentage de possession de l'équipe
    'time_played': 2300.0  # temps total de jeu en secondes
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, user_features)
xT

array([0.01561935, 0.01522578, 0.01511005, 0.01522983, 0.01587814,
       0.01559721, 0.01575047, 0.01569174, 0.01548723, 0.01584815,
       0.01562677, 0.01591191, 0.0162984 , 0.01603166, 0.0155866 ,
       0.01601155, 0.01603721, 0.01619306, 0.01635203, 0.01640823,
       0.01617142, 0.01596505, 0.01613605, 0.01690407, 0.01710681,
       0.01657317, 0.01608526, 0.01641691, 0.01670847, 0.01713708,
       0.01725085, 0.01704534, 0.01650619, 0.01637395, 0.01699249,
       0.01761031, 0.01806218, 0.01746017, 0.01707189, 0.01698729,
       0.01693478, 0.01712298, 0.01710864, 0.01709018, 0.0170769 ,
       0.01715409, 0.01781475, 0.01832537, 0.01898347, 0.01886419,
       0.01844348, 0.01853683, 0.01873095, 0.01881366, 0.01884522,
       0.0188206 , 0.01870995, 0.01870969, 0.01917823, 0.01944215,
       0.02026195, 0.02030835, 0.02042522, 0.02065412, 0.02072205,
       0.02067602, 0.02073309, 0.02077022, 0.02081023, 0.0207291 ,
       0.02070264, 0.02066626, 0.02149284, 0.02196133, 0.02227

In [16]:
#from mplsoccer import Pitch

In [17]:
#pitch = Pitch(line_color='black',pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder = 2)

In [18]:
'''
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False,
                     endnote_height=0.01, title_space=0, endnote_space=0)
goal["statistic"] = xT
pcm  = pitch.heatmap(goal, cmap='Oranges', edgecolor='grey', ax=ax['pitch'])
    labels = pitch.label_heatmap(goal, color='blue', fontsize=9,
                             ax=ax['pitch'], ha='center', va='center', str_format="{0:,.2f}", zorder = 3)
    #legend to our plot
    ax_cbar = fig.add_axes((1, 0.093, 0.03, 0.786))
    cbar = plt.colorbar(pcm, cax=ax_cbar)
    txt = 'Expected Threat matrix after ' +  str(i+1) + ' moves'
    fig.suptitle(txt, fontsize = 30)
    plt.show()
    '''

IndentationError: unexpected indent (1310549280.py, line 3)