# Expected Goals (xG) Calculation Notebook

This notebook demonstrates the process of calculating Expected Goals (xG) for football match data. It encompasses data loading, preprocessing, exploratory data analysis, and building a logistic regression model to estimate the xG values based on shot characteristics.

In [1]:
# Importing libraries for data handling, visualization, statistical analysis, and machine learning
#The basics
import pandas as pd
import numpy as np
import json

#Plotting
import matplotlib.pyplot as plt
import FCPython 

#Statistical fitting of models
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Loading shot data from a CSV file for analysis
data= pd.read_csv(r"C:\Users\santo\Downloads\LFCNUFC.csv")

In [3]:
# Checking for missing values in each column to identify data cleaning needs
null_counts = data.isnull().sum()

# Display the count of null values for each column
print(null_counts)

id                              0
event_id                        0
minute                          0
second                          0
team_id                         0
player_id                       0
x                               0
y                               0
end_x                         595
end_y                         595
qualifiers                      0
is_touch                        0
blocked_x                    1586
blocked_y                    1586
goal_mouth_z                 1572
goal_mouth_y                 1572
is_shot                         0
card_type                       0
is_goal                         0
type_display_name               0
outcome_type_display_name       0
period_display_name             0
dtype: int64


In [4]:
# Filling in all missing values with 0 to ensure consistency in data for analysis
data.fillna(0, inplace=True)

In [5]:
# Checking for missing values in each column to identify data cleaning needs
data.isnull().sum()


id                           0
event_id                     0
minute                       0
second                       0
team_id                      0
player_id                    0
x                            0
y                            0
end_x                        0
end_y                        0
qualifiers                   0
is_touch                     0
blocked_x                    0
blocked_y                    0
goal_mouth_z                 0
goal_mouth_y                 0
is_shot                      0
card_type                    0
is_goal                      0
type_display_name            0
outcome_type_display_name    0
period_display_name          0
dtype: int64

In [6]:
data.columns

Index(['id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y',
       'end_x', 'end_y', 'qualifiers', 'is_touch', 'blocked_x', 'blocked_y',
       'goal_mouth_z', 'goal_mouth_y', 'is_shot', 'card_type', 'is_goal',
       'type_display_name', 'outcome_type_display_name',
       'period_display_name'],
      dtype='object')

In [7]:


# Calculate distance and angle from goal for all events
data['distance'] = np.sqrt((data['end_x'] - data['x'])**2 + (data['end_y'] - data['y'])**2)
data['angle'] = np.arctan2(data['end_y'] - data['y'], data['end_x'] - data['x'])

# Filter only shot events
shot_data = data[data['type_display_name'] == 'Shot']




In [8]:
shot_data

Unnamed: 0,id,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,...,goal_mouth_z,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name,distance,angle
10,2633727783,10,0,20,26,369875,86.9,45.5,0.0,0.0,...,19.0,49.2,True,False,False,Shot,Successful,FirstHalf,98.09108,-2.659251
37,2633727903,29,1,23,26,318871,81.3,46.4,0.0,0.0,...,19.0,49.5,True,False,False,Shot,Successful,FirstHalf,93.609027,-2.622977
121,2633728303,56,7,32,23,474264,89.3,34.2,0.0,0.0,...,19.0,49.5,True,False,False,Shot,Successful,FirstHalf,95.624944,-2.775845
195,2633728625,122,11,26,26,400828,93.1,45.4,0.0,0.0,...,4.4,50.7,True,False,False,Shot,Successful,FirstHalf,103.579776,-2.687876
197,2633728627,123,11,28,26,355354,88.0,58.3,0.0,0.0,...,19.0,50.1,True,False,False,Shot,Successful,FirstHalf,105.559888,-2.55648
204,2633728649,126,11,36,26,86829,76.5,65.8,0.0,0.0,...,66.7,37.5,True,False,False,Shot,Successful,FirstHalf,100.905352,-2.431247
286,2633729015,173,16,1,26,400828,90.4,61.8,0.0,0.0,...,63.9,48.7,True,False,False,Shot,Successful,FirstHalf,109.505251,-2.541939
340,2633729463,207,21,58,26,108226,88.5,50.0,0.0,0.0,...,20.3,48.5,True,False,False,Shot,Successful,FirstHalf,101.647676,-2.627328
342,2633729467,208,22,0,26,318871,94.2,41.1,0.0,0.0,...,72.2,46.7,True,False,False,Shot,Successful,FirstHalf,102.775727,-2.730185
359,2633729575,220,23,17,26,355354,77.0,57.9,0.0,0.0,...,9.5,51.6,True,False,False,Shot,Successful,FirstHalf,96.340075,-2.496846


In [9]:
# Split the data into training and testing sets
X = shot_data[['distance', 'angle']]
y = shot_data['is_goal']  # Assuming you have 'is_goal' column for shot events
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Now, you can use the trained model to predict xG values for new shot events
shot_data['distance'] = np.sqrt((shot_data['end_x'] - shot_data['x'])**2 + (shot_data['end_y'] - shot_data['y'])**2)
shot_data['angle'] = np.arctan2(shot_data['end_y'] - shot_data['y'], shot_data['end_x'] - shot_data['x'])
shot_data['xG'] = model.predict_proba(shot_data[['distance', 'angle']])[:, 1]


Accuracy: 0.875


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shot_data['distance'] = np.sqrt((shot_data['end_x'] - shot_data['x'])**2 + (shot_data['end_y'] - shot_data['y'])**2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shot_data['angle'] = np.arctan2(shot_data['end_y'] - shot_data['y'], shot_data['end_x'] - shot_data['x'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [10]:
shot_data

Unnamed: 0,id,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,...,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name,distance,angle,xG
10,2633727783,10,0,20,26,369875,86.9,45.5,0.0,0.0,...,49.2,True,False,False,Shot,Successful,FirstHalf,98.09108,-2.659251,0.042867
37,2633727903,29,1,23,26,318871,81.3,46.4,0.0,0.0,...,49.5,True,False,False,Shot,Successful,FirstHalf,93.609027,-2.622977,0.013018
121,2633728303,56,7,32,23,474264,89.3,34.2,0.0,0.0,...,49.5,True,False,False,Shot,Successful,FirstHalf,95.624944,-2.775845,0.02254
195,2633728625,122,11,26,26,400828,93.1,45.4,0.0,0.0,...,50.7,True,False,False,Shot,Successful,FirstHalf,103.579776,-2.687876,0.166609
197,2633728627,123,11,28,26,355354,88.0,58.3,0.0,0.0,...,50.1,True,False,False,Shot,Successful,FirstHalf,105.559888,-2.55648,0.25364
204,2633728649,126,11,36,26,86829,76.5,65.8,0.0,0.0,...,37.5,True,False,False,Shot,Successful,FirstHalf,100.905352,-2.431247,0.086714
286,2633729015,173,16,1,26,400828,90.4,61.8,0.0,0.0,...,48.7,True,False,False,Shot,Successful,FirstHalf,109.505251,-2.541939,0.498448
340,2633729463,207,21,58,26,108226,88.5,50.0,0.0,0.0,...,48.5,True,False,False,Shot,Successful,FirstHalf,101.647676,-2.627328,0.105296
342,2633729467,208,22,0,26,318871,94.2,41.1,0.0,0.0,...,46.7,True,False,False,Shot,Successful,FirstHalf,102.775727,-2.730185,0.138715
359,2633729575,220,23,17,26,355354,77.0,57.9,0.0,0.0,...,51.6,True,False,False,Shot,Successful,FirstHalf,96.340075,-2.496846,0.026779
