In [1]:
!pip install -U mplsoccer



In [2]:
import json
import pandas as pd
import openpyxl
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from mplsoccer import Pitch, VerticalPitch, FontManager
from scipy.stats import gaussian_kde

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
def draw_pitch(ax):

    pitch_width = 80    
    pitch_length = 120  
    center_circle_radius = 9.15 
    penalty_area_length = 18  
    penalty_area_width = 44   
    goal_area_length = 6      
    goal_area_width = 20     


    ax.plot([0, 0], [0, pitch_length], color='black') 
    ax.plot([pitch_width, pitch_width], [0, pitch_length], color='black')  
    ax.plot([0, pitch_width], [0, 0], color='black')  
    ax.plot([0, pitch_width], [pitch_length, pitch_length], color='black')  

    ax.plot([0, pitch_width], [pitch_length / 2, pitch_length / 2], color='black')

    center_circle = plt.Circle((pitch_width / 2, pitch_length / 2), center_circle_radius, color='black', fill=False)
    ax.add_artist(center_circle)


    ax.plot([(pitch_width - penalty_area_width) / 2, (pitch_width - penalty_area_width) / 2],
            [0, penalty_area_length], color='black')
    ax.plot([(pitch_width + penalty_area_width) / 2, (pitch_width + penalty_area_width) / 2],
            [0, penalty_area_length], color='black')
    ax.plot([(pitch_width - penalty_area_width) / 2, (pitch_width + penalty_area_width) / 2],
            [penalty_area_length, penalty_area_length], color='black')

    ax.plot([(pitch_width - penalty_area_width) / 2, (pitch_width - penalty_area_width) / 2],
            [pitch_length - penalty_area_length, pitch_length], color='black')
    ax.plot([(pitch_width + penalty_area_width) / 2, (pitch_width + penalty_area_width) / 2],
            [pitch_length - penalty_area_length, pitch_length], color='black')
    ax.plot([(pitch_width - penalty_area_width) / 2, (pitch_width + penalty_area_width) / 2],
            [pitch_length - penalty_area_length, pitch_length - penalty_area_length], color='black')

    ax.plot([(pitch_width - goal_area_width) / 2, (pitch_width - goal_area_width) / 2],
            [0, goal_area_length], color='black')
    ax.plot([(pitch_width + goal_area_width) / 2, (pitch_width + goal_area_width) / 2],
            [0, goal_area_length], color='black')
    ax.plot([(pitch_width - goal_area_width) / 2, (pitch_width + goal_area_width) / 2],
            [goal_area_length, goal_area_length], color='black')

    ax.plot([(pitch_width - goal_area_width) / 2, (pitch_width - goal_area_width) / 2],
            [pitch_length - goal_area_length, pitch_length], color='black')
    ax.plot([(pitch_width + goal_area_width) / 2, (pitch_width + goal_area_width) / 2],
            [pitch_length - goal_area_length, pitch_length], color='black')
    ax.plot([(pitch_width - goal_area_width) / 2, (pitch_width + goal_area_width) / 2],
            [pitch_length - goal_area_length, pitch_length - goal_area_length], color='black')


    penalty_spot_distance = 12 


    ax.scatter(pitch_width / 2, penalty_spot_distance, color='black')


    ax.scatter(pitch_width / 2, pitch_length - penalty_spot_distance, color='black')

 
    ax.set_xlim(0, pitch_width)
    ax.set_ylim(0, pitch_length)
    ax.set_aspect('equal')
    ax.axis('off')


In [4]:
# D:/SoccerMatchData/data/events
folder_path = r"C:\Users\bouza\OneDrive\Desktop\data\events"
output_path = 'D:/SoccerMatchData/data/try'
goal = []
non_goal = []
json_files = glob.glob(f'{folder_path}/*.json')

for json_file in json_files:
    flag = True
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # lineup_data = []
        # if (data[0]['type']['name'] == 'Starting XI' and data[0]['team']['name'] == 'Barcelona') or data[1]['type']['name'] == 'Starting XI' and data[1]['team']['name'] == 'Barcelona':
        #     flag = False

        
    game_data = []
    # i = 0
    # if flag:
    #     continue    
    for match in data:
        if match['type']['name'] == "Shot" and match['shot']['type']['name'] != "Penalty":
            if 'freeze_frame' in match['shot']:
                fLocations = [frame['location'] for frame in match['shot']['freeze_frame']]
            else:
                fLocations = []
            game_data.append({
                'Time': match['timestamp'],
                'Team': match['team']['name'],
                'Player': match['player']['name'],
                'Location': match['location'],
                'Outcome': match['shot']['outcome']['name'],
                'Technique': match['shot']['technique']['name'],
                'Body_part': match['shot']['body_part']['name'],
                'xG': match['shot']['statsbomb_xg'],
                'fLocations': fLocations
            })

df = pd.DataFrame(game_data)
df_goals = (df[df['Outcome'] == 'Goal'])
df_non_goal = (df[df['Outcome'] != 'Goal'])
shot = []
goal_bool = np.array([])
goal = []
non_goal = []
xG_goal = []
xG_non_goal = []     
for index, row in df.iterrows():
    shot.append(row['Location'])
    if row['Outcome'] == 'Goal':
        goal_bool = np.append(goal_bool, 1)
    else:
        goal_bool = np.append(goal_bool, 0)
                
for index, row in df_goals.iterrows():
    location = row['Location']
    goal.append(location)
    xg = row['xG']
    xG_goal.append(xg)
    
for index, row in df_non_goal.iterrows():
        location = row['Location']
        non_goal.append(location)
        xg = row['xG']
        xG_non_goal.append(xg)
    
goal_coor = np.array(goal)
non_goal_coor = np.array(non_goal)
xG_g = np.array(xG_goal)
xG_non_g = np.array(xG_non_goal)
shoot = np.array(shot)

# np.save(output_path+'/goal_coor.npy', goal_coor)
# np.save(output_path+'/non_goal_coor.npy', non_goal_coor)
# np.save(output_path+'/xG_g.npy', xG_g)
# np.save(output_path+'/xG_non_g.npy', xG_non_g) 
# np.save(output_path+'/goal_bool.npy', goal_bool) 
# np.save(output_path+'/shot_location.npy', shoot)      
# pitch = VerticalPitch(pad_bottom=0.5,  
#                       half=True,  
#                       goal_type='box',
#                       goal_alpha=0.8, pitch_color='#22312b', line_color='#c7d5cc')  # control the goal transparency

# fig, ax = pitch.draw(figsize=(12, 10))

# ax.scatter(goal_coor[:, 1], goal_coor[:, 0], c='#ad993c', s = 5, label='Goal')

# ax.scatter(non_goal_coor[:, 1], non_goal_coor[:, 0], c='#ba4f45', s = 5, label='No goal')

# ax.legend()
# plt.show()
print(df)


            Time           Team                          Player  \
0   00:01:46.003  Real Sociedad           Willian José da Silva   
1   00:06:45.406  Real Sociedad                   Adnan Januzaj   
2   00:08:19.864  Real Sociedad           Willian José da Silva   
3   00:09:37.349      Barcelona            Andrés Iniesta Luján   
4   00:11:29.077      Barcelona                Jordi Alba Ramos   
5   00:16:48.437      Barcelona        Luis Alberto Suárez Díaz   
6   00:17:47.753      Barcelona           Gerard Piqué Bernabéu   
7   00:24:46.176      Barcelona                    Ivan Rakitić   
8   00:26:37.104      Barcelona        Luis Alberto Suárez Díaz   
9   00:29:16.940  Real Sociedad          Mikel Oyarzabal Ugarte   
10  00:39:13.086  Real Sociedad           Willian José da Silva   
11  00:01:29.786  Real Sociedad           Willian José da Silva   
12  00:01:53.054  Real Sociedad          Mikel Oyarzabal Ugarte   
13  00:11:21.786      Barcelona       Philippe Coutinho Correi

In [5]:
def extract_data(output_path):
    return np.load(output_path)

In [6]:
# import numpy as np
left_post_team1_2d = np.array([0, 36]) 
right_post_team1_2d =  np.array([0, 44])  
left_post_team2_2d = np.array([120, 44]) 
right_post_team2_2d = np.array([120, 36])
  
def perceivedLength(shot_location):
    global left_post_team1_2d, right_post_team1_2d, left_post_team2_2d, right_post_team2_2d
    first_dist = np.linalg.norm(shot_location - left_post_team1_2d)
    second_dist = np.linalg.norm(shot_location - right_post_team2_2d)
    # print("First distance is " + str(first_dist))
    # print("Second distance is " + str(second_dist))
    if first_dist < second_dist: 
        left_post = left_post_team1_2d
        right_post = right_post_team1_2d
    else:  
        left_post = left_post_team2_2d
        right_post = right_post_team2_2d

    post_to_post_dist = np.linalg.norm(left_post - right_post) 
    shot_to_left_post = np.linalg.norm(shot_location - left_post)  
    shot_to_right_post = np.linalg.norm(shot_location - right_post) 

    cos_theta = (shot_to_left_post**2 + shot_to_right_post**2 - post_to_post_dist**2) / (2 * shot_to_left_post * shot_to_right_post)
    angle_rad = np.arccos(cos_theta)  
    
    angle_deg = np.degrees(angle_rad)
    return angle_deg

# df['angle'] = df['Location'].apply(lambda loc: perceivedLength(np.array(loc)))
# print(df)
# test = np.array([20,43.7])
def sign(p1, p2, p3):
    return (p1[0] - p3[0]) * (p2[1] - p3[1]) - (p2[0] - p3[0]) * (p1[1] - p3[1])

def is_point_in_triangle(pt, v1, v2, v3):
    d1 = sign(pt, v1, v2)
    d2 = sign(pt, v2, v3)
    d3 = sign(pt, v3, v1)

    has_neg = (d1 < 0) or (d2 < 0) or (d3 < 0)
    has_pos = (d1 > 0) or (d2 > 0) or (d3 > 0)

    return not (has_neg and has_pos)

def trianglePlayers(df):
    shot_location = np.array(df['Location'])  
    freeze_frame = df['fLocations']
    # print(f"Processing shot at location: {shot_location}, with freeze_frame: {freeze_frame}")
    if shot_location[0] < 60:  
        print("Shot loc is " + str(shot_location))
        left_post = left_post_team1_2d
        right_post = right_post_team1_2d
    else:
        left_post = left_post_team2_2d
        right_post = right_post_team2_2d
    players_in_window = []
    players_in_window_location = []
    for frame in freeze_frame:
        player_location = np.array(frame)
        print(player_location)
        if is_point_in_triangle(player_location, shot_location, left_post, right_post):
            print("In here")
            distance_to_shot = np.linalg.norm(player_location - shot_location)
            players_in_window.append(distance_to_shot) 
            players_in_window_location.append(player_location.tolist())
    
    # Find closest player if any are in the window
    best_dist = min(players_in_window) if players_in_window else np.nan
    return pd.Series([players_in_window_location, best_dist])

def closestPlayerInTriangle(shot_location, freeze_frame):

    shot_location = np.array(shot_location)


    if shot_location[0] < 60:  
        left_post = left_post_team1_2d
        right_post = right_post_team1_2d
    else:  
        left_post = left_post_team2_2d
        right_post = right_post_team2_2d

    closest_player_location = None
    min_distance = float('inf')

    for player_location in freeze_frame:
        player_location = np.array(player_location)

        if is_point_in_triangle(player_location, shot_location, left_post, right_post):
            distance_to_shot = np.linalg.norm(player_location - shot_location)
            if distance_to_shot < min_distance:
                min_distance = distance_to_shot
                closest_player_location = player_location.tolist()

    return closest_player_location

def defender_line(freeze_frame, delta_y):
    filtered_freeze_frame = []

    for i, player1 in enumerate(freeze_frame):
        for j, player2 in enumerate(freeze_frame):
            if i < j:  
                y1 = player1[1]
                y2 = player2[1]
                if abs(y1 - y2) <= delta_y:
                    filtered_freeze_frame.append(player1)
                    filtered_freeze_frame.append(player2)

    # set_loc = set()
    # for loc in filtered_freeze_frame:
    #     set_loc.add(tuple(loc))

    # # Now, convert each tuple back to a list
    # filtered_freeze_frame = []
    # for item in set_loc:
    #     filtered_freeze_frame.append(list(item))
    return filtered_freeze_frame

# df[['Players_in_Window_Locations', 'Closest_Player_Distance']] = df.apply(trianglePlayers, axis=1)
# df['Filtered_Freeze_Frame'] = df['Players_in_Window_Locations'].apply(lambda freeze_frame: defender_line(freeze_frame, 0.5))
print(df)


            Time           Team                          Player  \
0   00:01:46.003  Real Sociedad           Willian José da Silva   
1   00:06:45.406  Real Sociedad                   Adnan Januzaj   
2   00:08:19.864  Real Sociedad           Willian José da Silva   
3   00:09:37.349      Barcelona            Andrés Iniesta Luján   
4   00:11:29.077      Barcelona                Jordi Alba Ramos   
5   00:16:48.437      Barcelona        Luis Alberto Suárez Díaz   
6   00:17:47.753      Barcelona           Gerard Piqué Bernabéu   
7   00:24:46.176      Barcelona                    Ivan Rakitić   
8   00:26:37.104      Barcelona        Luis Alberto Suárez Díaz   
9   00:29:16.940  Real Sociedad          Mikel Oyarzabal Ugarte   
10  00:39:13.086  Real Sociedad           Willian José da Silva   
11  00:01:29.786  Real Sociedad           Willian José da Silva   
12  00:01:53.054  Real Sociedad          Mikel Oyarzabal Ugarte   
13  00:11:21.786      Barcelona       Philippe Coutinho Correi

In [7]:
left_post_team1 = np.array([0, 36, 0]) 
right_post_team1 =  np.array([0, 44, 0])
mid_ground_team1 = np.array([0, 40, 0])
mid_top_team1 = np.array([0, 40, 2.67])
left_post_team2 = np.array([120, 44, 0]) 
right_post_team2 = np.array([120, 36, 0])
mid_ground_team2 = np.array([120, 40, 0])
mid_top_team2 = np.array([120, 40, 2.67])

def perceivedLength3d(shot_location):
    global left_post_team1, right_post_team2, left_post_team2, right_post_team1, mid_top_team1, mid_top_team2, mid_ground_team1, mid_ground_team2
    first_dist = np.linalg.norm(shot_location - left_post_team1)
    second_dist = np.linalg.norm(shot_location - right_post_team2)
    # print("First distance is " + str(first_dist))
    # print("Second distance is " + str(second_dist))
    if first_dist < second_dist: 
        left_post = left_post_team1
        right_post = right_post_team1
        mid_top = mid_top_team1
        mid_ground = mid_ground_team1
    else:  
        left_post = left_post_team2
        right_post = right_post_team2
        mid_top = mid_top_team2
        mid_ground = mid_ground_team2

    #2d 
    post_to_post_dist = np.linalg.norm(left_post - right_post) 
    shot_to_left_post = np.linalg.norm(shot_location - left_post)  
    shot_to_right_post = np.linalg.norm(shot_location - right_post) 

    top_to_shot = np.linalg.norm(shot_location - mid_top) 
    ground_mid_to_shot = np.linalg.norm(shot_location - mid_ground) 
    top_to_mid = np.linalg.norm(mid_top - mid_ground) 

    cos_theta_1d = (shot_to_left_post**2 + shot_to_right_post**2 - post_to_post_dist**2) / (2 * shot_to_left_post * shot_to_right_post)
    cos_theta_2d = cos_theta_2d = (top_to_shot**2 + ground_mid_to_shot**2 - top_to_mid**2) / (2 * top_to_shot * ground_mid_to_shot)

    angle_rad = np.arccos(cos_theta_1d)  
    angle_rad_2 = np.arccos(cos_theta_2d)

    angle_deg = np.degrees(angle_rad)
    angle_deg_2 = np.degrees(angle_rad_2)
    pLength = angle_deg * angle_deg_2
    return pLength

test = np.array([35,43.7, 0])
# def updated_3d_length()

df['Location3d'] = df['Location'].apply(lambda loc: np.append(loc, [0]))

def perceivedLength3d_defender(shot_location, defender_location):
    x, y = defender_location
    height = 1.75  
    width = 0.3    
    
    bottom_left = np.array([x, y - width, 0])
    bottom_right = np.array([x, y + width, 0])
    top_left = np.array([x, y - width, height])
    top_right = np.array([x, y + width, height])


    post_to_post_dist = np.linalg.norm(bottom_left - bottom_right)  
    shot_to_left_post = np.linalg.norm(shot_location - bottom_left)
    shot_to_right_post = np.linalg.norm(shot_location - bottom_right)

    top_to_shot = np.linalg.norm(shot_location - top_left)
    ground_mid_to_shot = np.linalg.norm(shot_location - bottom_left)
    top_to_mid = np.linalg.norm(top_left - bottom_left) 

    cos_theta_1d = (shot_to_left_post**2 + shot_to_right_post**2 - post_to_post_dist**2) / (2 * shot_to_left_post * shot_to_right_post)
    cos_theta_2d = (top_to_shot**2 + ground_mid_to_shot**2 - top_to_mid**2) / (2 * top_to_shot * ground_mid_to_shot)


    angle_rad_horizontal = np.arccos(cos_theta_1d)
    angle_rad_vertical = np.arccos(cos_theta_2d)

    angle_deg_horizontal = np.degrees(angle_rad_horizontal)
    angle_deg_vertical = np.degrees(angle_rad_vertical)

    # Perceived length is the product of horizontal and vertical angles
    perceived_length = angle_deg_horizontal * angle_deg_vertical

    return perceived_length

def is_header(Body_Part):
    if Body_Part == 'Head':
        return 1
    return 0
# df['pl3D'] = df['Location3d'].apply(lambda loc: perceivedLength3d(np.array(loc)))
# print(df)

perceived_length_3d = []
perceived_length_3d_defender = []

for _, row in df.iterrows():
    shot_location = np.array(row['Location3d'])
    freeze_frame = row['fLocations']

    goal_perceived_length = perceivedLength3d(shot_location)
    perceived_length_3d.append(goal_perceived_length)


    closest_defender_location = closestPlayerInTriangle(
        row['Location'],  
        freeze_frame     
    )
    print(freeze_frame)
    print(closest_defender_location)

    if closest_defender_location is not None:
        defender_perceived_length = perceivedLength3d_defender(
            shot_location, closest_defender_location
        )
    else:
        defender_perceived_length = 0  

    perceived_length_3d_defender.append(defender_perceived_length)

df['3dPerceivedLength'] = perceived_length_3d
df['3dPerceivedLength_Defender'] = perceived_length_3d_defender



[[91.6, 59.6], [86.1, 32.6], [97.7, 54.8], [108.6, 59.8], [110.2, 57.1], [104.6, 43.4], [109.7, 47.4], [111.8, 40.4], [110.9, 35.4], [118.0, 41.3], [84.0, 32.3], [98.3, 50.8], [110.3, 61.4], [109.6, 45.9], [106.2, 33.0]]
[111.8, 40.4]
[[116.8, 41.4], [94.1, 57.0], [99.3, 56.1], [95.5, 41.0], [103.6, 45.9], [102.0, 39.4], [96.9, 43.5], [105.0, 45.2], [97.6, 30.1]]
[116.8, 41.4]
[[116.9, 38.5], [93.0, 48.7], [100.9, 55.4], [102.9, 41.6], [100.6, 33.3], [100.9, 31.3], [97.9, 60.1], [96.0, 39.3], [102.3, 39.7], [102.1, 20.4]]
[116.9, 38.5]
[[103.3, 21.4], [101.4, 23.2], [102.2, 29.0], [104.8, 33.4], [99.7, 28.1], [92.5, 36.8], [98.5, 18.2], [103.3, 46.0], [117.7, 37.6], [103.8, 14.5], [100.7, 51.7], [101.5, 31.0]]
[117.7, 37.6]
[[108.8, 50.8], [112.5, 24.8], [105.9, 36.7], [97.1, 55.1], [100.5, 41.7], [98.4, 35.1], [112.0, 49.1], [113.1, 43.7], [112.4, 40.9], [114.7, 33.2], [103.3, 53.1], [111.4, 31.4], [103.3, 42.9], [104.9, 33.7], [117.4, 37.3]]
[111.4, 31.4]
[[105.4, 27.4], [105.3, 28.3

In [None]:
def logistic_regression(pLength, Body_part ,goal):
    model_name = 'Logistic'
    models = {}
    models['Logistic'] = {}
    
    # Stack features into a 2D array for regression
    X = np.column_stack((pLength, Body_part))
    y = goal
    
    # Train Logistic Regression model
    models['Logistic']['model'] = LogisticRegression()
    models[model_name]['model'].fit(X, y)
    models[model_name]['y_pred'] = models[model_name]['model'].predict_proba(X)[:, 1]
    coefficients = models[model_name]['model'].coef_
    intercept = models[model_name]['model'].intercept_
    models[model_name]['coefficients'] = coefficients
    models[model_name]['intercept'] = intercept
    
    return models

def calculate_r2(y_true, y_pred):
    
    return metrics.r2_score(y_true, y_pred)

def logistic_regression_formula(coef, intercept, feature_names):
    terms = [f"{c}*{name}" for c, name in zip(coef, feature_names)]
    formula = f"1 / (1 + exp(-({intercept} + {' + '.join(terms)})))"
    return formula

pLength = []
defender_length = []
adjusted_plength = []
dist = []
gk_dist = []
in_triangle = []
three_away = []
Body_part = []
goal_bool = []

# Iterate through the dataframe to populate features
for _, row in df.iterrows():
    # if np.isnan(row['GK_dist']):  # Skip rows with missing GK_dist
    #     continue
    p_length = row['3dPerceivedLength']
    defender_length = row['3dPerceivedLength_Defender']
    adjusted_length = p_length - defender_length 
    pLength.append(perceivedLength3d(row['Location3d']))
    adjusted_plength.append(adjusted_length)
    # dist.append(calculate_distance(row['Location']))
    # gk_dist.append(row['GK_dist'])
    # in_triangle.append(closestPlayerInTriangle(row['Location'], row['fLocations']))
    # three_away.append(three_meters_away(row['Location'], row['freeze_frame']))
    Body_part.append(is_header(row['Body_part']))
    goal_bool.append(1 if row['Outcome'] == 'Goal' else 0)

model = logistic_regression(adjusted_plength, Body_part,goal_bool)
target = goal_bool
intercept = model['Logistic']['intercept']
coefficients = model['Logistic']['coefficients']
print("Intercept:", intercept)
print("Coefficients:", coefficients)
y_pred = model['Logistic']['y_pred']
r2_value = calculate_r2(goal_bool, y_pred)
print("R^2 Value:", r2_value)
r2_statsbomb = calculate_r2(target, xg)

Intercept: [-0.61193509]
Coefficients: [[-0.02213141  0.00088754]]
R^2 Value: 0.029461491927391115


InvalidParameterError: The 'y_pred' parameter of r2_score must be an array-like. Got 0.009380965 instead.