
# Calculating xT (position-based)
Calculating Expected Threat


In [1]:
#importing necessary libraries 

import pandas as pd
import numpy as np
import json
import pickle 
#opening data
import os
import pathlib
import warnings 
# Importing coord_to_bins from our python's file utilities
from utilities import calculate_bin_number, generate_bins_properties
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore') 

## Opening data 
In this section we implement the Expected Threat model in
the same way described by [Karun Singh](https://karun.in/blog/expected-threat.html).
First, we open the data.



In [2]:
# Liste des fichiers JSON
file_names = ['events_England.json', 'events_France.json', 'events_Spain.json', 'events_Germany.json', 'events_Italy.json']

# Initialiser un DataFrame vide
df = pd.DataFrame()

# Boucler à travers les fichiers et les charger
for file_name in file_names:
    with open(file_name) as f:
        data = json.load(f)
    # Concaténer les nouvelles données au DataFrame existant
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

In [3]:
df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
3071390,3,Free kick cross,"[{'id': 801}, {'id': 1801}]",70974,"[{'y': 23, 'x': 75}, {'y': 65, 'x': 95}]",2576338,Free Kick,3193,2H,2870.982660,32,253567160
3071391,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",14745,"[{'y': 35, 'x': 5}, {'y': 36, 'x': 3}]",2576338,Duel,3185,2H,2872.101142,13,253567161
3071392,1,Ground loose ball duel,"[{'id': 702}, {'id': 1801}]",413041,"[{'y': 65, 'x': 95}, {'y': 64, 'x': 97}]",2576338,Duel,3193,2H,2872.990437,13,253567163
3071393,1,Air duel,"[{'id': 703}, {'id': 1801}]",20927,"[{'y': 36, 'x': 3}, {'y': 100, 'x': 100}]",2576338,Duel,3185,2H,2874.049206,10,253567162


## Actions moving the ball and shooting
To calculate the Expected Threat we need Creating a DataFrame with all actions involving ball movement. First we filter them
from the database.
To make our calculations easier we create new columns with coordinates,
one for each coordinate.
In the end we store number of actions in each bin in a **move_counts** array to calculate later
**move probability** and more.



In [4]:
# Calculer l'événement suivant
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]

# Créer un DataFrame avec toutes les actions impliquant un mouvement du ballon
actions_df = df.loc[df['subEventName'].isin(['Simple pass', 'High pass', 'Head pass', 'Hand pass', 'Smart pass', 'Cross', 'Acceleration', 'Shot'])]

# Extraire les coordonnées
actions_df["x"] = actions_df.positions.apply(lambda cell: (cell[0]['x']*105/100))
actions_df["y"] = actions_df.positions.apply(lambda cell: (cell[0]['y']*68/100))
actions_df["end_x"] = actions_df.positions.apply(lambda cell: (cell[1]['x']*105/100))
actions_df["end_y"] = actions_df.positions.apply(lambda cell: (cell[1]['y']*68/100))

# Calculer les bins de départ et d'arrivée
actions_df['start_bins']= calculate_bin_number(actions_df, 'x', 'y')
actions_df['end_bins']= calculate_bin_number(actions_df, 'end_x', 'end_y')
# Créer deux DataFrames :
# move_df exclut les actions où le ballon est sorti du terrain
# shot_df inclut uniquement les tirs

move_df = actions_df[actions_df['subEventName'] != "Shot"]
move_df["kickedOut"] = move_df.apply(lambda x: 1 if x.nextEvent == "Ball out of the field" else 0, axis=1)
move_df = move_df.loc[(((move_df["end_x"] != 0) & (move_df["end_y"] != 68)) & ((move_df["end_x"] != 105) & (move_df["end_y"] != 0)))]

# Filtrer les passes hors du terrain
delete_passes = move_df.loc[move_df["kickedOut"] == 1]
move_df = move_df.drop(delete_passes.index)

shot_df = actions_df[actions_df['subEventName'] == "Shot"]



In [5]:
move_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins,kickedOut
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,High pass,51.45,33.32,32.55,53.04,89,57,0
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,Head pass,32.55,53.04,53.55,51.00,57,105,0
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,Head pass,53.55,51.00,36.75,48.28,105,68,0
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,Simple pass,36.75,48.28,43.05,64.60,68,83,0
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,Simple pass,43.05,64.60,75.60,59.84,83,142,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071375,8,Simple pass,[{'id': 1801}],70974,"[{'y': 58, 'x': 47}, {'y': 55, 'x': 35}]",2576338,Pass,3193,2H,2795.993433,85,253567130,Simple pass,49.35,39.44,36.75,37.40,90,66,0
3071376,8,Simple pass,[{'id': 1801}],349102,"[{'y': 55, 'x': 35}, {'y': 60, 'x': 33}]",2576338,Pass,3193,2H,2797.609376,85,253567133,Ground attacking duel,36.75,37.40,34.65,40.80,66,67,0
3071379,8,Simple pass,[{'id': 1801}],413041,"[{'y': 57, 'x': 25}, {'y': 43, 'x': 31}]",2576338,Pass,3193,2H,2801.914483,85,253567136,Touch,26.25,38.76,32.55,29.24,54,53,0
3071381,8,High pass,[{'id': 1801}],206318,"[{'y': 48, 'x': 46}, {'y': 87, 'x': 61}]",2576338,Pass,3193,2H,2808.430235,83,253567145,Touch,48.30,32.64,64.05,59.16,89,118,0


In [6]:
shot_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,x,y,end_x,end_y,start_bins,end_bins
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,Reflexes,92.40,27.88,0.0,0.0,172,0
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,Ball out of the field,89.25,35.36,105.0,68.0,162,191
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280,Reflexes,100.80,35.36,105.0,68.0,186,191
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,Ball out of the field,85.05,22.44,0.0,0.0,147,0
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,Save attempt,78.75,20.40,0.0,0.0,147,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070893,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",116269,"[{'y': 45, 'x': 95}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1152.032980,100,253566542,Ball out of the field,99.75,30.60,0.0,0.0,185,0
3070927,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1212}, {'id'...",3548,"[{'y': 38, 'x': 93}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,1251.730517,100,253566586,Touch,97.65,25.84,0.0,0.0,172,0
3071192,10,Shot,"[{'id': 101}, {'id': 401}, {'id': 201}, {'id':...",21177,"[{'y': 46, 'x': 90}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2065.034482,100,253566910,Reflexes,94.50,31.28,0.0,0.0,173,0
3071241,10,Shot,"[{'id': 402}, {'id': 1212}, {'id': 1802}]",349102,"[{'y': 32, 'x': 79}, {'y': 0, 'x': 0}]",2576338,Shot,3193,2H,2367.252041,100,253566961,Ball out of the field,82.95,21.76,0.0,0.0,147,0


In [7]:
n_bins = max(move_df['start_bins'].max(), move_df['end_bins'].max()) + 1

In [8]:
bins = np.arange(n_bins)
all_combinations = pd.MultiIndex.from_product([bins, bins], names=['start_bins', 'end_bins']).to_frame(index=False)

# Calculer le nombre de mouvements de chaque bin de départ à chaque bin d'arrivée
move_counts = move_df.groupby(['start_bins', 'end_bins']).size().reset_index(name='counts')

# Fusionner les combinaisons possibles avec les mouvements réels
all_moves = pd.merge(all_combinations, move_counts, on=['start_bins', 'end_bins'], how='left').fillna(0)

# Calculer le nombre total de mouvements depuis chaque bin de départ
total_moves = move_df.groupby(['start_bins']).size().reset_index(name='total_counts')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
move_stats = pd.merge(all_moves, total_moves, on='start_bins', how='left').fillna(0)

# Calculer les probabilités de mouvement
move_stats['probability'] = move_stats['counts'] / move_stats['total_counts']

# Conserver uniquement les colonnes nécessaires
moving_probabilities = move_stats[['start_bins', 'end_bins', 'probability']]


moving_probabilities



Unnamed: 0,start_bins,end_bins,probability
0,0,0,0.065960
1,0,1,0.043581
2,0,2,0.029446
3,0,3,0.022379
4,0,4,0.010601
...,...,...,...
36859,191,187,0.051893
36860,191,188,0.029304
36861,191,189,0.017094
36862,191,190,0.026862


In [9]:
moving_probabilities['probability'].values.reshape((n_bins,n_bins), order = 'C')

array([[0.06595995, 0.04358068, 0.02944641, ..., 0.        , 0.        ,
        0.        ],
       [0.032     , 0.0352    , 0.0256    , ..., 0.        , 0.        ,
        0.        ],
       [0.02292264, 0.03008596, 0.01002865, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00052966, 0.        , 0.        , ..., 0.01324153, 0.01112288,
        0.01006356],
       [0.        , 0.        , 0.        , ..., 0.01116313, 0.02016565,
        0.01872524],
       [0.        , 0.        , 0.        , ..., 0.01709402, 0.02686203,
        0.04273504]])

In [10]:
moving_probabilities.groupby(['start_bins']).sum()

Unnamed: 0_level_0,end_bins,probability
start_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,18336,1.0
1,18336,1.0
2,18336,1.0
3,18336,1.0
4,18336,1.0
...,...,...
187,18336,1.0
188,18336,1.0
189,18336,1.0
190,18336,1.0


### shooting probability from each bins

In [11]:
# Ajouter une colonne binaire 'is_shot'
actions_df['is_shot'] = actions_df['subEventName'].apply(lambda x: 1 if x == "Shot" else 0)

# Calculer le nombre total d'actions par bin de départ
total_actions_by_bin = actions_df.groupby('start_bins').size().reset_index(name='total_actions')

# Calculer le nombre de tirs par bin de départ
total_shots_by_bin = actions_df.groupby('start_bins')['is_shot'].sum().reset_index(name='total_shots')

# Joindre les deux DataFrames pour obtenir les comptes totaux pour chaque bin de départ
shot_stats = pd.merge(total_shots_by_bin, total_actions_by_bin, on='start_bins')

# Calculer les probabilités de tir
shot_stats['shot_probability'] = shot_stats['total_shots'] / shot_stats['total_actions']

# Conserver uniquement les colonnes nécessaires
shot_probabilities = shot_stats[['start_bins', 'shot_probability']]

shot_probabilities


Unnamed: 0,start_bins,shot_probability
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,0.000000
4,4,0.002110
...,...,...
187,187,0.640893
188,188,0.141449
189,189,0.006035
190,190,0.001088


In [12]:
shot_probabilities.tail(20)

Unnamed: 0,start_bins,shot_probability
172,172,0.74873
173,173,0.879497
174,174,0.880698
175,175,0.725812
176,176,0.353078
177,177,0.040115
178,178,0.001717
179,179,0.000217
180,180,0.0
181,181,0.000599


In [13]:
shot_probabilities[shot_probabilities['shot_probability']!=0]

Unnamed: 0,start_bins,shot_probability
4,4,0.002110
5,5,0.001735
6,6,0.001416
7,7,0.001026
9,9,0.001269
...,...,...
186,186,0.955385
187,187,0.640893
188,188,0.141449
189,189,0.006035


In [14]:
# Charger le modèle xG depuis un fichier pkl
with open('xG_model.pkl', 'rb') as file:
    xg_model = pickle.load(file)

In [15]:
xg_model.feature_names_in_

array(['possession_percent', 'time_played', 'team_scores', 'bin_number',
       'bin_center_x', 'bin_center_y', 'distance_to_goal',
       'angle_to_goal'], dtype='<U18')

### Calculation of the expected threat

In [16]:
def expected_threat(moving_probabilities, shot_probabilities, xg_model, features_dict, bin_number='bin_number', n_bins=(16, 12), max_iter=10, epsilon=1e-6):
    # Vérifier que toutes les features nécessaires sont présentes dans le modèle xG
    feature_names = list(xg_model.feature_names_in_)
    
    
    # Définir le nombre total de bins
    total_bins = n_bins[0] * n_bins[1]
    
    # Générer le DataFrame 'events' avec les features fournies dans features_dict
    events_data = {feature: [value] * total_bins for feature, value in features_dict.items()}
    events_data['bin_number']= list(range(0,total_bins))
    events = pd.DataFrame(events_data)
    
    # Générer les propriétés des bins (centre du bin, distance au but, angle au but) en utilisant les numéros de bin
    events = generate_bins_properties(events, bin_number= 'bin_number')
    
    # Vérifier que toutes les propriétés des bins sont présentes pour correspondre au modèle xG
    missing_features = [feature for feature in feature_names if feature not in events.columns]
    if missing_features:
        raise ValueError(f"Propriétés manquantes dans les données générées : {missing_features}")
    
    # Vérifier l'ordre des colonnes pour s'assurer de la compatibilité avec le modèle
    xg_features_df = events[feature_names]
    
    # Calculer les probabilités de scoring à partir du modèle xG
    p_scoring = xg_model.predict_proba(xg_features_df)[:, 1]

    # Extraire les probabilités de tir sous forme de tableau numpy
    shot_probs = shot_probabilities['shot_probability'].values

    # Calculer la matrice des probabilités de mouvement
    move_probs_matrix = moving_probabilities['probability'].values.reshape(total_bins, total_bins)

    # Initialiser les valeurs xT
    xT = np.zeros(total_bins)
    
    # Boucle pour itérer jusqu'à convergence ou nombre maximal d'itérations
    for _ in range(max_iter):
        xT_old = xT.copy()
        # Calcul des nouvelles valeurs xT
        new_xT = p_scoring * shot_probs + np.dot(move_probs_matrix, xT_old * (1 - shot_probs))
        
        # Vérifier la convergence
        if np.max(np.abs(new_xT - xT)) < epsilon:
            break
        
        xT = new_xT
    
    return xT


### Example for testing our function 

In [17]:
user_features = { 
    'possession_percent': 55.0,  # pourcentage de possession de l'équipe
    'time_played': 5800.0 , # temps total de jeu en secondes
    'team_scores':0
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, user_features)
xT


array([0.01039009, 0.01013443, 0.01002251, 0.01013846, 0.01036728,
       0.01019906, 0.01038446, 0.01041261, 0.01040651, 0.01050677,
       0.01054423, 0.01078817, 0.01091051, 0.01069136, 0.01038598,
       0.01066336, 0.01075062, 0.01085512, 0.01098507, 0.01102959,
       0.01086366, 0.01075124, 0.01090348, 0.01148816, 0.0114911 ,
       0.01109206, 0.01071595, 0.01098736, 0.01120143, 0.0115187 ,
       0.01162955, 0.0115103 , 0.01112229, 0.0110712 , 0.01153676,
       0.01203878, 0.01217063, 0.01172215, 0.0114434 , 0.01139166,
       0.01135633, 0.01151921, 0.01151278, 0.01151743, 0.0115281 ,
       0.0116246 , 0.01213065, 0.01255662, 0.01289372, 0.01276362,
       0.01245629, 0.01251825, 0.01266226, 0.01274809, 0.01277349,
       0.01276643, 0.01272612, 0.01275739, 0.01317901, 0.01343984,
       0.01385093, 0.01384965, 0.01391652, 0.014055  , 0.01412328,
       0.01408169, 0.0141458 , 0.01416941, 0.01428424, 0.01427175,
       0.01434488, 0.01439422, 0.0147927 , 0.01507201, 0.01528

In [19]:
# Example of how to call the function with your data
user_features = {
    'team_scores': 1,
    'bin_number' : 154,
    'possession_percent': 62.0,  # pourcentage de possession de l'équipe
    'time_played': 2300.0  # temps joué en secondes
}
xT = expected_threat(moving_probabilities, shot_probabilities, xg_model, user_features)
xT

array([0.01096707, 0.01069961, 0.01058418, 0.01071214, 0.01095029,
       0.01078132, 0.01099121, 0.01103343, 0.01103873, 0.01114105,
       0.01119355, 0.0114573 , 0.01151734, 0.01129069, 0.01097367,
       0.01126818, 0.01136979, 0.01148537, 0.01163322, 0.01168717,
       0.01151803, 0.01140548, 0.01157271, 0.0121995 , 0.01213177,
       0.01171337, 0.01132043, 0.01161632, 0.01184761, 0.01218875,
       0.01231774, 0.01220066, 0.01179136, 0.01174564, 0.01224727,
       0.01278858, 0.01284661, 0.01237834, 0.01209149, 0.01204461,
       0.01201456, 0.01219345, 0.01219275, 0.0122043 , 0.01222267,
       0.012334  , 0.0128795 , 0.01333937, 0.01361064, 0.0134789 ,
       0.01316248, 0.01323636, 0.01339785, 0.01349496, 0.01352962,
       0.01353037, 0.01349703, 0.01353925, 0.01399832, 0.01428414,
       0.01462374, 0.01462626, 0.01470647, 0.01486148, 0.01494484,
       0.01490918, 0.01498585, 0.01502128, 0.01515554, 0.01515313,
       0.01524433, 0.01530497, 0.01561703, 0.0159161 , 0.01614