In [1]:
import os
import pandas as pd
import pathlib
import numpy as np

from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

import joblib

In [2]:
# Wyscout data directory: two levels above the current working directory,
# then data/Wyscout. Stored as a plain string.
DATA_PATH = str(pathlib.Path().resolve().parents[1] / 'data' / 'Wyscout')

In [3]:
# Target pitch extent along each axis and goal-mouth width
# (presumably metres -- confirm against the data provider's conventions).
PITCH_MAX_X, PITCH_MAX_Y = 105, 68
GOAL_WIDTH = 7.32

# Extent of the Wyscout coordinate grid along each axis.
WYSCOUT_PITCH_MAX_X, WYSCOUT_PITCH_MAX_Y = 100, 100

# Box limits expressed in Wyscout grid units: x edge and y span
# (presumably the penalty area -- confirm).
WYSCOUT_BOX_X = 84
WYSCOUT_BOX_MIN_Y, WYSCOUT_BOX_MAX_Y = 19, 81

In [4]:
# Per-axis scale factors that convert Wyscout grid units into pitch units.
TRANSFORM_X, TRANSFORM_Y = (
    PITCH_MAX_X / WYSCOUT_PITCH_MAX_X,
    PITCH_MAX_Y / WYSCOUT_PITCH_MAX_Y,
)

# Box limits rescaled from Wyscout grid units into pitch units.
BOX_X = TRANSFORM_X * WYSCOUT_BOX_X
BOX_MIN_Y = TRANSFORM_Y * WYSCOUT_BOX_MIN_Y
BOX_MAX_Y = TRANSFORM_Y * WYSCOUT_BOX_MAX_Y

In [5]:
# Load the event file of every league and keep only the events whose
# `positions` list has exactly two entries (a start point and an end point).
leagues = ['England', 'France', 'Germany', 'Italy', 'Spain']

league_frames = []
for league in leagues:
    events_file_name = f'events_{league}.json'
    events_path = os.path.join(DATA_PATH, 'events', events_file_name)

    events_df = pd.read_json(events_path, encoding='unicode-escape')
    # Element-wise length check on the column; avoids building a Series per
    # row as DataFrame.apply(axis=1) does.
    events_df = events_df.loc[events_df.positions.map(len) == 2]

    league_frames.append(events_df)

# Concatenate once at the end: calling pd.concat inside the loop copies all
# previously loaded leagues again on every iteration.
all_events_df = pd.concat(league_frames, ignore_index=True)
all_events_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175
...,...,...,...,...,...,...,...,...,...,...,...,...
3070681,8,Simple pass,[{'id': 1801}],20623,"[{'y': 25, 'x': 66}, {'y': 2, 'x': 88}]",2565927,Pass,682,2H,2939.077491,85,253302671
3070682,7,Acceleration,[{'id': 1801}],122832,"[{'y': 2, 'x': 88}, {'y': 21, 'x': 97}]",2565927,Others on the ball,682,2H,2940.515560,70,253302673
3070683,8,Cross,"[{'id': 401}, {'id': 1802}]",122832,"[{'y': 21, 'x': 97}, {'y': 26, 'x': 92}]",2565927,Pass,682,2H,2942.098761,80,253302674
3070684,8,Simple pass,[{'id': 1801}],40756,"[{'y': 74, 'x': 8}, {'y': 56, 'x': 9}]",2565927,Pass,675,2H,2943.089232,85,253302698


In [6]:
def transform_x(row, i):
    """Return the 'x' value of the i-th entry of ``row`` multiplied by TRANSFORM_X."""
    wyscout_x = row[i]['x']
    return wyscout_x * TRANSFORM_X

def transform_y(row, i):
    """Return the 'y' value of the i-th entry of ``row``, mirrored and rescaled.

    The value is first subtracted from WYSCOUT_PITCH_MAX_Y (flipping the axis)
    and then multiplied by TRANSFORM_Y.
    """
    mirrored_y = WYSCOUT_PITCH_MAX_Y - row[i]['y']
    return mirrored_y * TRANSFORM_Y

# Derive rescaled coordinates from the raw position pairs:
# entry 0 gives (x, y), entry 1 gives (end_x, end_y).
all_events_df["x"] = all_events_df["positions"].apply(transform_x, args=(0,))
all_events_df["y"] = all_events_df["positions"].apply(transform_y, args=(0,))
all_events_df["end_x"] = all_events_df["positions"].apply(transform_x, args=(1,))
all_events_df["end_y"] = all_events_df["positions"].apply(transform_y, args=(1,))
all_events_df

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,x,y,end_x,end_y
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,51.45,34.68,32.55,14.96
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.946850,83,177959172,32.55,14.96,53.55,17.00
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,53.55,17.00,36.75,19.72
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,36.75,19.72,43.05,3.40
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,43.05,3.40,75.60,8.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070681,8,Simple pass,[{'id': 1801}],20623,"[{'y': 25, 'x': 66}, {'y': 2, 'x': 88}]",2565927,Pass,682,2H,2939.077491,85,253302671,69.30,51.00,92.40,66.64
3070682,7,Acceleration,[{'id': 1801}],122832,"[{'y': 2, 'x': 88}, {'y': 21, 'x': 97}]",2565927,Others on the ball,682,2H,2940.515560,70,253302673,92.40,66.64,101.85,53.72
3070683,8,Cross,"[{'id': 401}, {'id': 1802}]",122832,"[{'y': 21, 'x': 97}, {'y': 26, 'x': 92}]",2565927,Pass,682,2H,2942.098761,80,253302674,101.85,53.72,96.60,50.32
3070684,8,Simple pass,[{'id': 1801}],40756,"[{'y': 74, 'x': 8}, {'y': 56, 'x': 9}]",2565927,Pass,675,2H,2943.089232,85,253302698,8.40,17.68,9.45,29.92


In [7]:
# Keep only the events named "Shot"; the copy lets later cells add columns
# without writing into all_events_df.
shots = all_events_df.loc[all_events_df["eventName"].eq("Shot")].copy()
shots

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,x,y,end_x,end_y
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,92.40,40.12,0.0,68.0
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,89.25,32.64,105.0,0.0
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280,100.80,32.64,105.0,0.0
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,85.05,45.56,0.0,68.0
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,78.75,47.60,0.0,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070350,10,Shot,"[{'id': 1901}, {'id': 402}, {'id': 2101}, {'id...",267134,"[{'y': 66, 'x': 93}, {'y': 0, 'x': 0}]",2565927,Shot,682,2H,1776.027412,100,253302272,97.65,23.12,0.0,68.0
3070401,10,Shot,"[{'id': 402}, {'id': 2101}, {'id': 1802}]",267134,"[{'y': 32, 'x': 90}, {'y': 0, 'x': 0}]",2565927,Shot,682,2H,1944.188119,100,253302329,94.50,46.24,0.0,68.0
3070567,10,Shot,"[{'id': 101}, {'id': 401}, {'id': 201}, {'id':...",134174,"[{'y': 25, 'x': 92}, {'y': 0, 'x': 0}]",2565927,Shot,682,2H,2385.837008,100,253302547,96.60,51.00,0.0,68.0
3070623,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1206}, {'id'...",3321,"[{'y': 30, 'x': 82}, {'y': 100, 'x': 100}]",2565927,Shot,675,2H,2722.835144,100,253302642,86.10,47.60,105.0,0.0


In [8]:
def calculate_distance_and_angle(x, y):
    """Return the distance to the goal centre and the angle subtended by the goal.

    Parameters
    ----------
    x, y : float or array-like
        Location(s) in the same coordinate system as PITCH_MAX_X / PITCH_MAX_Y.
        The goal centre is taken to be (PITCH_MAX_X, PITCH_MAX_Y / 2).

    Returns
    -------
    tuple
        ``(distance, angle)`` where ``distance`` is the Euclidean distance to
        the goal centre and ``angle`` is the angle, in radians within
        [0, pi], under which a goal mouth of width GOAL_WIDTH is seen.
    """
    c = abs(y - PITCH_MAX_Y / 2)   # lateral offset from the goal centre line
    d = PITCH_MAX_X - x            # offset from the goal line along x
    squared_distance = d ** 2 + c ** 2
    distance = np.sqrt(squared_distance)

    # tan(angle) = GOAL_WIDTH * |d| / (d^2 + c^2 - (GOAL_WIDTH / 2)^2).
    # arctan2 selects the quadrant from the signs of both terms, so:
    #   * a negative denominator (very close to goal) yields an obtuse angle
    #     without the manual "+ pi" correction;
    #   * d == 0 outside the posts yields 0 instead of pi;
    #   * a zero denominator yields pi / 2 instead of dividing by zero.
    # np.abs keeps the numerator non-negative so the result stays in [0, pi].
    angle = np.arctan2(GOAL_WIDTH * np.abs(d), squared_distance - (GOAL_WIDTH / 2) ** 2)

    return distance, angle

# Distance/angle features for every shot. The helper only uses arithmetic and
# NumPy ufuncs, so it is called once on whole columns instead of once per row;
# this also works when `shots` is empty, where zip(*...) unpacking would fail.
shots["distance"], shots["angle"] = calculate_distance_and_angle(shots["x"], shots["y"])

# NOTE(review): the scaler is instantiated but never fitted or applied in this
# cell; kept in case other cells reference it.
scaler = StandardScaler()

# find goals (tag id 101)
# NOTE(review): the object dtype is kept on purpose. predict_xG uses
# 1 / (1 + exp(+eta)) and the fitted coefficients shown further down are
# positive for distance, which is consistent with the GLM being fitted on the
# "no goal" level -- presumably because the formula interface treats object
# columns as categorical. Verify before changing the dtype.
shots["isGoal"] = shots.tags.apply(lambda x: 1 if {'id': 101} in x else 0).astype(object)


# headers have id = 403
headers = shots.loc[shots.tags.apply(lambda tags: {'id': 403} in tags)]
non_headers = shots.drop(headers.index)

In [9]:
def build_xG_model(formula, data):
    """Fit a binomial-family GLM specified by ``formula`` on ``data``.

    Returns the fitted results object produced by ``.fit()``.
    """
    binomial_family = sm.families.Binomial()
    unfitted_model = smf.glm(formula=formula, data=data, family=binomial_family)
    return unfitted_model.fit()

# One shared specification, fitted separately on each subset of shots.
formula = "isGoal ~ distance + angle"
non_headers_model = build_xG_model(formula, non_headers)
headers_model = build_xG_model(formula, headers)

In [10]:
def predict_xG(model, df):
    """Compute xG for every row of ``df`` from a fitted model's coefficients.

    Parameters
    ----------
    model : object
        Anything exposing ``params`` whose first three entries are, in order,
        the intercept, the distance coefficient and the angle coefficient.
    df : pandas.DataFrame
        Must contain the columns 'distance' and 'angle'.

    Returns
    -------
    pandas.Series
        ``1 / (1 + exp(eta))`` with
        ``eta = intercept + b_distance * distance + b_angle * angle``.
        Note the positive sign in the exponent: this is the complement of the
        logistic function of ``eta``.
    """
    # Read the coefficients by position through a NumPy array: integer keys
    # such as params[0] on a label-indexed Series are deprecated in pandas.
    intercept, distance_coef, angle_coef = np.asarray(model.params, dtype=float)[:3]
    linear_predictor = intercept + distance_coef * df['distance'] + angle_coef * df['angle']
    return 1 / (1 + np.exp(linear_predictor))

# Score each subset of shots with the model fitted on that subset.
non_headers_xG = predict_xG(model=non_headers_model, df=non_headers)
headers_xG = predict_xG(model=headers_model, df=headers)

In [11]:
# Attach each subset's predictions as an `xG` column.
non_headers = non_headers.assign(xG=non_headers_xG)
headers = headers.assign(xG=headers_xG)

# Stack the two subsets back together (non-headers first) and display the result.
all_shots_xg = pd.concat((non_headers, headers))
all_shots_xg

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,x,y,end_x,end_y,distance,angle,isGoal,xG
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212,92.40,40.12,0.0,68.0,14.007655,0.467241,1,0.138584
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247,89.25,32.64,105.0,0.0,15.808608,0.453557,0,0.114647
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289,85.05,45.56,0.0,68.0,23.057235,0.274666,0,0.042662
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429,78.75,47.60,0.0,68.0,29.563872,0.219665,0,0.019770
394,10,Shot,"[{'id': 401}, {'id': 2101}, {'id': 1802}]",7945,"[{'y': 39, 'x': 90}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,1286.061650,100,177959606,94.50,41.48,0.0,68.0,12.891873,0.466040,0,0.153457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3068556,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",225946,"[{'y': 60, 'x': 91}, {'y': 100, 'x': 100}]",2565926,Shot,684,2H,1832.038735,100,253222172,95.55,27.20,105.0,0.0,11.642272,0.515285,1,0.076085
3068790,10,Shot,"[{'id': 403}, {'id': 201}, {'id': 1202}, {'id'...",3335,"[{'y': 54, 'x': 91}, {'y': 100, 'x': 100}]",2565926,Shot,684,2H,2828.726041,100,253222439,95.55,31.28,105.0,0.0,9.833662,0.692985,0,0.117039
3069492,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",3322,"[{'y': 48, 'x': 93}, {'y': 100, 'x': 100}]",2565927,Shot,675,1H,1893.736105,100,253301228,97.65,35.36,105.0,0.0,7.474764,0.902493,1,0.193131
3070177,10,Shot,"[{'id': 403}, {'id': 201}, {'id': 1203}, {'id'...",274435,"[{'y': 40, 'x': 91}, {'y': 0, 'x': 0}]",2565927,Shot,682,2H,1263.415273,100,253302113,95.55,40.80,0.0,68.0,11.642272,0.515285,0,0.076085


In [12]:
# Display the fitted coefficients of the headers model.
headers_model.params

Intercept    1.657052
distance     0.131463
angle       -1.340644
dtype: float64

In [13]:
# Display the fitted coefficients of the non-headers model.
non_headers_model.params

Intercept    1.034281
distance     0.108651
angle       -1.560485
dtype: float64

In [14]:
# Persist the fitted headers model in the `models` directory located two
# levels above the current working directory.
model_filename = 'headers_xG_model.sav'
path_model = str(pathlib.Path().resolve().parents[1] / 'models' / model_filename)
joblib.dump(headers_model, path_model)

['C:\\Users\\Alvaro\\repos\\soccermatics\\models\\headers_xG_model.sav']

In [15]:
# Persist the fitted non-headers model in the `models` directory located two
# levels above the current working directory.
model_filename = 'non_headers_xG_model.sav'
path_model = str(pathlib.Path().resolve().parents[1] / 'models' / model_filename)
joblib.dump(non_headers_model, path_model)

['C:\\Users\\Alvaro\\repos\\soccermatics\\models\\non_headers_xG_model.sav']